In [None]:
import pandas as pd
import os

# Split one Excel export into several gzip-compressed Parquet part files.

# Step 1: Read the Excel file
name = 'Data_INC'  # other exports use the same flow: problem, problem_task, CRE_INC

file_path = os.path.join(raw_data, f"{name}.xlsx")
df = pd.read_excel(file_path)
# The "Affected User" column is removed before anything is written to disk.
df = df.drop(columns="Affected User")

# Step 2: Determine split size
# Rows per file = target file size in KB // estimated size of one row in KB.
# NOTE(review): the earlier comment mentioned a 100 MB limit, but the code has
# always used 24 * 1024 rows (about 24 MB at ~1 KB per row); that value is kept.
approx_row_size_kb = 1  # Adjust based on your data
target_file_size_mb = 24
rows_per_file = (target_file_size_mb * 1024) // approx_row_size_kb

# Step 3: Split the DataFrame
# Ceiling division: `len(df) // rows_per_file + 1` produced a trailing empty file
# whenever the row count was an exact multiple of rows_per_file. At least one
# file is still written so that an empty frame keeps its schema on disk.
num_files = max(1, -(-len(df) // rows_per_file))
print(num_files)

# Create a directory to store smaller files
output_dir = f'{name}_split_files'
os.makedirs(output_dir, exist_ok=True)

# Step 4: Save smaller files in Parquet format
failed_files = []  # paths that could not be written
for i in range(num_files):
    start_row = i * rows_per_file
    end_row = min(start_row + rows_per_file, len(df))
    df_split = df.iloc[start_row:end_row]
    output_file = os.path.join(output_dir, f'{name}_split_file_{i+1}.parquet')
    try:
        df_split.to_parquet(output_file, engine='fastparquet', compression='gzip')
    except Exception as e:
        # Keep going so the remaining parts are still attempted, but remember the failure.
        print(f"Error saving file {output_file}: {e}")
        failed_files.append(output_file)

# Only claim success when every part was actually written.
if failed_files:
    print(f'Split finished with errors: {len(failed_files)} of {num_files} files were not saved.')
else:
    print(f'Successfully split into {num_files} files.')

# Step 5: Merge files (when needed)
# Example of merging the parts back into a single DataFrame:
# parts = [
#     pd.read_parquet(os.path.join(output_dir, f'{name}_split_file_{i+1}.parquet'))
#     for i in range(num_files)
# ]
# merged_df = pd.concat(parts, ignore_index=True)
# print('Successfully merged files back into a single DataFrame.')

In [None]:
# ============================================================
# 1. LOAD MAPPING DICTIONARY (Equipo -> Grupo)
# ============================================================

# Both sheets live in the same workbook, so they are requested in a single
# read_excel call (a list of sheet names returns a dict keyed by sheet name)
# instead of opening and parsing the file twice.
grupos_path = os.path.join(raw_data, "grupos_audit.xlsx")
grupos_sheets = pd.read_excel(
    grupos_path, sheet_name=["1LVL", "IT_TEAMS"], dtype=str
)
df_grupos_1lvl = grupos_sheets["1LVL"]
df_grupos_IT = grupos_sheets["IT_TEAMS"]

df_grupos = pd.concat([df_grupos_1lvl, df_grupos_IT], ignore_index=True)
# Keys and values go through clean_str_compact, the same helper applied to the
# incident group columns further down, so dictionary lookups compare like with like.
df_grupos[["Equipo", "Grupo"]] = df_grupos[["Equipo", "Grupo"]].map(clean_str_compact)

# NOTE(review): if an Equipo appears more than once, to_dict() keeps the Grupo of
# its last occurrence without warning — confirm the sheets hold unique keys.
map_dict = df_grupos.set_index("Equipo")["Grupo"].to_dict()

# ============================================================
# 2. LOAD AND PREPARE INCIDENTS
# ============================================================

# load=True reuses the cached pickle when it exists; set it to False to force a
# fresh read of the Excel export (which also rewrites the cache).
load = True
pickle_path = os.path.join(raw_data, "init_Data_INC.pkl")
try:
    log_separator("Data_INC.xlsx Source -> ServiceNow", level=2)
    logger.info("Loading and processing Data_INC.xlsx")

    if os.path.exists(pickle_path) and load:
        logger.info(f"Loading Data_INC from pickle: {pickle_path}")
        # NOTE(review): unpickling executes code stored in the file; only load
        # caches written by this notebook.
        df = pd.read_pickle(pickle_path)
    else:
        excel_path = os.path.join(raw_data, "Data_INC.xlsx")
        logger.info(f"Loading Data_INC from Excel: {excel_path}")
        df = pd.read_excel(excel_path)
        df.to_pickle(pickle_path)
except Exception as e:
    logger.error(f"Error loading Data_INC: {e}")
    # Re-raise: continuing would either fail later on a missing `df` or silently
    # reuse a stale `df` left in the kernel by an earlier cell. The unconditional
    # pickle re-read that used to follow this block was removed for the same
    # reason (it ignored `load` and masked this failure).
    raise

# Untouched copy of the loaded data, kept before any transformation below.
df_origen = df.copy()


def twk_df(df):
    """Prepare the raw incident frame for the steps that follow.

    The frame is passed, in order, through:
      1. ``clean_headers`` (header normalisation; exact rules live in that helper),
      2. ``set_df_column_order`` with the key columns listed below,
      3. a row filter keeping only rows whose ``CREATED`` year is after 2022
         (``CREATED`` must be datetime-like because the ``.dt`` accessor is used),
      4. ``sort_values_of_incidents``.

    Returns the resulting DataFrame.
    """
    leading_columns = [
        "NUMBER",
        "PROBLEM",
        "STATE",
        "LAST_ASSIGNMENT_GROUP",
        "ASSIGNMENT_GROUP",
        "CREATOR_GROUP",
    ]

    with_clean_headers = clean_headers(df)
    reordered = set_df_column_order(with_clean_headers, leading_columns)

    # An optional upper bound (CREATED < 2025-09-01) is intentionally left disabled.
    created_after_2022 = reordered["CREATED"].dt.year > 2022
    recent = reordered.loc[created_after_2022]

    return sort_values_of_incidents(recent)

# Report the frame's shape as loaded, then again after twk_df has been applied.
print(df.shape, "lectura")
df = twk_df(df)
print(df.shape, "filtros previos")


# Per-ticket bookkeeping (rows grouped by NUMBER):
#   _ORDER  0-based position of the row within its ticket, in current row order
#   _SIZE   number of rows belonging to the ticket
#   _LAST   True on the row whose position is the last one of its ticket
rows_by_number = df.groupby("NUMBER")["NUMBER"]
df["_ORDER"] = rows_by_number.cumcount()
df["_SIZE"] = rows_by_number.transform("size")
df["_LAST"] = df["_ORDER"].eq(df["_SIZE"] - 1)

# ============================================================
# 3. CLEAN AND MAP SERVICENOW COLUMNS
# ============================================================

# Source group columns and, position by position, the columns that receive the
# Grupo looked up for each of them.
group_cols = ["LAST_ASSIGNMENT_GROUP", "ASSIGNMENT_GROUP", "CREATOR_GROUP"]
group_in_dict = ["_Grupo_LAG", "_Grupo_ASG", "_Grupo_CRG"]

# String clean-up of the group columns (same helper used on the mapping keys).
df[group_cols] = df[group_cols].map(clean_str_compact)

# Record the rows without a company for auditing before the nulls are filled.
company_missing = df["COMPANY"].isnull()
casos_audit["INC_Company_null"] = df[company_missing]
casos_audit_descrip["INC_Company_null"] = "Casos sin empresa 'vacio'"

# Fill missing companies with a placeholder, then standardise every name.
company_filled = df["COMPANY"].fillna("_VACIO_")
df["COMPANY"] = company_filled.apply(standardize_company_name)

# Collapse the known Allianz Seguros spellings into a single label.
to_AZS = [
    "ALLIANZSEGUROS SA",
    "ALLIANZ SEGUROS SA",
    "ALLIANZ COMPANIA DE SEGUROS Y REASEGUROS SA",
]
df["COMPANY"] = df["COMPANY"].replace(to_AZS, "ALLIANZ SEGUROS SA")

# Companies retained by the company filter applied later in this cell.
keep = [
    "ALLIANZ SEGUROS SA",
    "ALLIANZ SPAIN EXTERNAL",
    "ALLIANZ TECHNOLOGY SL",
    "BBVA",
]

# Look every group up in map_dict; groups without an entry become "".
df[group_in_dict] = df[group_cols].map(lambda x: map_dict.get(x, ""))

# Element-wise tests over the three mapped columns, reduced per row below.
mapped_groups = df[group_in_dict]
has_mapping = mapped_groups.ne("")
not_first_level = mapped_groups.ne("1LVL-OTROS")
is_global = mapped_groups.eq("GLOB-OTROS")

# Group flags: *_any = at least one column passes, *_all = all three pass.
df["_Grupo_RES_any"] = has_mapping.any(axis=1)
df["_Grupo_RES_all"] = has_mapping.all(axis=1)

df["_Grupo_No1LVL_any"] = not_first_level.any(axis=1)
df["_Grupo_No1LVL_all"] = not_first_level.all(axis=1)

df["_Grupo_GLOB_any"] = is_global.any(axis=1)
df["_Grupo_GLOB_all"] = is_global.all(axis=1)

# ============================================================
# 4. LAST-VALUE RULES
# ============================================================

try:
    # A) Last group values per ticket: one boolean column per group column
    #    (prefixed "_IS_Last_") telling whether that value is a key of map_dict.
    # NOTE(review): GroupBy.last() returns the last non-null entry of each column,
    # which is not necessarily the value on the ticket's final row — confirm intended.
    last_values = df.groupby("NUMBER")[group_cols].last()
    last_mapped = last_values.map(lambda x: x in map_dict).add_prefix("_IS_Last_")
    # last_mapped is indexed by NUMBER; the merge broadcasts it to every row of the ticket.
    df = df.merge(last_mapped, on="NUMBER", how="left")

    # B) Last state / cause code per ticket.
    # Despite their names these flags are negations: each one is True when the
    # last value DIFFERS from the literal it is compared with.
    last_values = df.groupby("NUMBER")[["STATE", "CAUSE_CODE"]].last()
    df["_LAST_STATE_CLOSED"] = df["NUMBER"].map(last_values["STATE"] != "CLOSED")
    df["_LAST_CAUSE_CODE_WaD"] = df["NUMBER"].map(
        last_values["CAUSE_CODE"] != "Works as designed"
    )
    df["_LAST_CAUSE_CODE_Cancelled"] = df["NUMBER"].map(
        last_values["CAUSE_CODE"] != "Cancelled"
    )
    # NOTE(review): AND-ing two negations is True only when the ticket is neither
    # CLOSED nor carries the cause code, so any ticket whose last STATE is "CLOSED"
    # gets False here. If the goal is to drop only tickets that are CLOSED *and*
    # have the cause code, the combination should be `|` — confirm before changing.
    df["_WaD_Closed"] = df["_LAST_STATE_CLOSED"] & df["_LAST_CAUSE_CODE_WaD"]
    df["_Cancelled_Closed"] = (
        df["_LAST_STATE_CLOSED"] & df["_LAST_CAUSE_CODE_Cancelled"]
    )

    # C) Last GLOB flag / priority per ticket: _Glob_P4 is False for tickets whose
    #    last row-level _Grupo_GLOB_all is True and whose last priority is P4 or P3.
    last_values = df.groupby("NUMBER")[["_Grupo_GLOB_all", "CURRENT_PRIORITY"]].last()
    mask_excluir = last_values["_Grupo_GLOB_all"] & last_values[
        "CURRENT_PRIORITY"
    ].isin(["P4", "P3"])
    df["_Glob_P4"] = df["NUMBER"].map(~mask_excluir)

except Exception as e:
    print(f"Error occurred: {e}")
    # Re-raise: the filter list further down refers to the columns created above,
    # so carrying on after a failure would only hide the real cause.
    raise

# Aggregated "last value is mapped" flags across the three _IS_Last_* columns.
prefix = "_IS_Last_"
df["_Grupo_LAST_any"] = df.filter(like=prefix).any(axis=1)
df["_Grupo_LAST_all"] = df.filter(like=prefix).all(axis=1)

# ============================================================
# 5. SEQUENTIAL FILTERS
# ============================================================

# Row-level boolean flags consumed by apply_filters below.
# NOTE(review): presumably apply_filters keeps the rows where each flag is True,
# applying them in list order — confirm against its definition.

# True when the row has no parent incident (i.e. it is not a child ticket).
df["_IS_Parent"] = df["PARENT_INCIDENT"].fillna("").eq("")
# True when FCR is anything other than True (the name reads the opposite way).
df["_IS_FCR"] = df["FCR"].ne(True)
# True when the company is one of the retained companies.
df["_IS_COMPANY"] = df["COMPANY"].isin(keep)

filters = [
    # group-mapping flags
    "_Grupo_RES_all",
    "_Grupo_No1LVL_all",
    # last-value flags
    "_WaD_Closed",
    "_Cancelled_Closed",
    "_Glob_P4",
    # flags defined just above
    "_IS_Parent",
    "_IS_FCR",
    "_IS_COMPANY",
]
df = apply_filters(df, filters)

# ============================================================
# 6. FINAL ORDER
# ============================================================

# Final incident frame, sorted by the shared helper with "_SIZE" as its extra argument.
df_incidents = sort_values_of_incidents(df, "_SIZE")

# Distinct ticket identifiers present in the final frame.
incident_tickets = df_incidents["NUMBER"].unique()
problem_tickets = df_incidents["PROBLEM"].unique()

# Release the intermediate frames; df_incidents is the only frame kept from here on.
try:
    del df, df_grupos, df_grupos_IT, df_grupos_1lvl
except Exception as e:
    print(f"Error occurred: {e}")

# cancelled