In [105]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from boruta import BorutaPy
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from math import radians, sin, cos, sqrt, atan2
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder
import joblib

In [106]:
df_train = pd.read_csv('./datasets/input/fraudTrain.csv')
df_test = pd.read_csv('./datasets/input/fraudTest.csv')

In [107]:
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Utils

In [108]:
import os
import pandas as pd
from datetime import datetime
from math import radians, sin, cos, sqrt, atan2
import zipfile
from catboost import CatBoostClassifier
import streamlit as st
from sklearn.metrics import accuracy_score, classification_report
import tempfile

def fraud_pct_by_column(data, column_name, target_name, fraud_pct_col_name, rank_col_name):

    group_fraud_by_column = data.groupby(column_name).agg(
      total_sales=(target_name, 'count'),
      fraud_sales=(target_name, 'sum')
    )

    # Calcular el porcentaje de fraude para cada valor
    group_fraud_by_column[fraud_pct_col_name] = (group_fraud_by_column['fraud_sales'] / group_fraud_by_column['total_sales']) * 100
    group_fraud_by_column = group_fraud_by_column.reset_index()

    # Rank de porcentaje de fraude
    group_fraud_by_column[rank_col_name] = group_fraud_by_column[fraud_pct_col_name].rank(ascending=False)
    # Unirlo con el df original
    data = data.merge(group_fraud_by_column[[column_name, fraud_pct_col_name, rank_col_name, 'fraud_sales']], on=column_name, how='left')

    return data, group_fraud_by_column[[column_name, fraud_pct_col_name, rank_col_name, 'fraud_sales']]

# Se agruparan las profesiones para disminuir la dimensionalidad
def assign_sector(x):
    group_jobs = {
        "Engineering and Technology": ["engineer", "developer", "programmer", "technician", "architect", "systems", 
                                    "network", "administrator", "data scientist", "cybersecurity", "web developer", 
                                    "analyst", "database", "devops", "maintenance", "manufacturing", "site", 
                                    "structural", "materials", "biomedical", "environmental", "telecommunications"],
        
        "Healthcare and Medicine": ["doctor", "nurse", "therapist", "pharmacist", "health", "surgeon", "dentist", 
                                    "clinician", "physician", "optometrist", "radiologist", "paramedic", "midwife", 
                                    "veterinarian", "psychiatrist", "psychologist", "radiographer", "biochemist", 
                                    "cytogeneticist", "audiologist", "pathologist"],
        
        "Education and Training": ["teacher", "professor", "educator", "trainer", "lecturer", "scientist", "tutor", 
                                "principal", "instructor", "counselor", "academic", "researcher", "dean", 
                                "headmaster", "careers adviser", "museum education officer", "education administrator"],
        
        "Science and Environment": ["scientist", "environmental consultant", "ecologist", "geologist", "hydrologist", 
                                    "conservation officer", "horticulturist", "geophysicist", "soil scientist", 
                                    "agricultural consultant", "agricultural engineer", "oceanographer", 
                                    "fisheries officer"],
        
        "Art, Design, and Media": ["designer", "artist", "animator", "photographer", "film editor", "video editor", 
                                "television producer", "film producer", "radio producer", "curator"],
        
        "Finance": ["analyst", "accountant", "auditor", "banker", "financial", "investment", "controller", "broker", 
                    "consultant", "treasurer", "loan officer", "trader", "actuary", "economist", "portfolio", "credit"],
        
        "Marketing": ["manager", "executive", "specialist", "consultant", "advertising", "public relations", "strategist", 
                    "director", "coordinator", "brand", "SEO", "content", "digital", "market research", "social media", 
                    "copywriter"],
        
        "Manufacturing": ["operator", "mechanic", "assembler", "fabricator", "engineer", "technician", "welder", 
                        "planner", "quality", "machinist", "production", "inspector", "supervisor", "foreman", 
                        "toolmaker", "CNC"],
        
        "Retail": ["cashier", "salesperson", "store", "associate", "manager", "clerk", "shopkeeper", "merchandiser", 
                "assistant", "retail", "customer service", "sales", "inventory", "buyer", "stocker", "checkout"],
        
        "Legal": ["lawyer", "attorney", "paralegal", "judge", "legal", "solicitor", "notary", "clerk", "litigator", 
                "advocate", "barrister", "counsel", "magistrate", "prosecutor", "defense", "compliance"],
        
        "Hospitality": ["chef", "waiter", "bartender", "host", "manager", "receptionist", "housekeeper", "concierge", 
                        "caterer", "cook", "hotel", "tour guide", "event planner", "sous chef", "sommelier", "valet"],
        
        "Construction": ["builder", "carpenter", "electrician", "plumber", "architect", "project manager", "site manager", 
                        "surveyor", "foreman", "bricklayer", "roofer", "civil engineer", "construction", "contractor", 
                        "inspector", "draftsman"]
    }
    for key in group_jobs:
        for role in group_jobs[key]:
            if x.find(role) != -1:
                return key
    return "Other"


def datetime_split(data, datatime_col_name, day_col_name, month_col_name, year_col_name, hour_col_name, weekday_col_name):
    # Transformar la columna a datetime para el procesado
    data[datatime_col_name] = pd.to_datetime(data[datatime_col_name])

    # Dividir el día del mes, mes, año, hora y día de la semana en nuevas columnas
    data[day_col_name] = data[datatime_col_name].dt.day
    data[month_col_name] = data[datatime_col_name].dt.month
    data[year_col_name] = data[datatime_col_name].dt.year
    data[hour_col_name] = data[datatime_col_name].dt.hour
    data[weekday_col_name] = data[datatime_col_name].dt.weekday

    return data

def dob_to_age(data, dob_col_name, age_col_name):
    # Transformar la columna dob de object a datetime
    data[dob_col_name] = pd.to_datetime(data[dob_col_name])

    # Obtener la fecha actual
    actual_date = pd.to_datetime(datetime.now().date())

    # Convertir la fecha de nacimiento a edad [años] (fecha actual - fecha de nacimiento)/días promedio por año
    data[age_col_name] = ((actual_date - data[dob_col_name]).dt.days / 365.25).astype(int)

    return data

# Función para calcular la distancia
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radio de la tierra [km]
    lat1_rad, lon1_rad = radians(lat1), radians(lon1)
    lat2_rad, lon2_rad = radians(lat2), radians(lon2)
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = sin(dlat / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

def extract_zip_to_csv(uploaded_file, temp_dir):
    """
    Extrae un archivo zip, busca un archivo CSV dentro y lo convierte a csv.
    """
    zip_path = os.path.join(temp_dir, "temp.zip")

    # Guardar el archivo zip subido en el directorio temporal
    with open(zip_path, "wb") as f:
        f.write(uploaded_file.getvalue())

    # Descomprimir el archivo zip
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(temp_dir)

    # Buscar archivos CSV en el directorio temporal
    extracted_files = [f for f in os.listdir(temp_dir) if f.endswith('.csv')]

    return extracted_files


def catboost_model(features_scaled, target, model):
    predictions = model.predict(features_scaled)                   # Hacer predicciones

    # Mostrar las predicciones
    st.write("Mostrando las primeras 5 predicciones:")
    st.dataframe(pd.DataFrame(predictions, columns=['Predicción']).head())

    # Mostrar el objetivo real
    st.write("Mostrando las primeras 5 reales:")
    st.dataframe(target.head())

    # Evaluar el modelo
    accuracy = accuracy_score(target, predictions)
    report = classification_report(target, predictions)
    return predictions, accuracy, report

def extract_zip_to_model(zip_file, name_model):
    try:
        # Extraer el archivo .cbm dentro de un directorio temporal
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            with tempfile.TemporaryDirectory() as tmpdirname:
                # Filtrar y extraer solo el archivo de modelo necesario
                extracted_files = []
                for file_name in zip_ref.namelist():
                    if file_name.endswith(name_model):
                        zip_ref.extract(file_name, tmpdirname)
                        extracted_file_path = os.path.join(tmpdirname, file_name)
                        extracted_files.append(extracted_file_path)
                        break
                else:
                    raise FileNotFoundError(f'{name_model} no encontrado en el archivo ZIP.')

                # Imprimir la ruta del archivo extraído y los archivos en el directorio temporal
                print(f"Archivo extraído en: {extracted_files[0]}")
                print(f"Archivos en el directorio temporal: {os.listdir(tmpdirname)}")

                # Cargar el modelo CatBoost
                model = CatBoostClassifier()
                model.load_model(extracted_files[0])
                return model

    except Exception as e:
        print(f"Error al extraer o cargar el modelo: {e}")
        return None

# Preprocessing function

In [109]:
def preprocessing_data(data, create_data=False):
   # Añadir columnas de porcentaje de fraude y ranking para: vendedor, ciudad y estado
   data, group_fraud_by_merch = fraud_pct_by_column(data, 'merchant', 'is_fraud', 'fraud_merch_pct', 'fraud_merch_rank')
   data, group_fraud_by_city = fraud_pct_by_column(data, 'city', 'is_fraud', 'fraud_city_pct', 'fraud_city_rank')
   data, group_fraud_by_state = fraud_pct_by_column(data, 'state', 'is_fraud', 'fraud_state_pct', 'fraud_state_rank')

   # Aplicar la función de reemplazo de profesiones por sector (Para visualización) y realizar un encoded por la frecuencia (para el modelo)
   data['job_sector'] = data['job'].apply(assign_sector)
   job_freq = data['job'].value_counts(normalize=True)

   # Mapear el encoded
   data['job_encoded'] = data['job'].map(job_freq)

   # Dividir la columna de la fecha/hora en columnas separadas para: día del mes, mes, año, hora, día de la semana. 
   data = datetime_split(data, 'trans_date_trans_time', 'trans_day', 'trans_month', 'trans_year', 'trans_hour', 'trans_weekday')

   # Transformar la fecha de nacimiento en edad
   data = dob_to_age(data, 'dob', 'age')

   # Crear una nueva columna con la distancia entre el vendedor y el comprador
   data["distance_to_merch"] = data.apply(lambda row: haversine_distance(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

   # Inicializar el codificador OneHotEncoder
   encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')

   # Ajustar el codificador con los datos de entrenamiento
   data_ohe = encoder.fit_transform(data[['category', 'gender']])

   # Reconstruir el dataframe
   col_names = ['category_food_dining', 'category_gas_transport',
      'category_grocery_net', 'category_grocery_pos',
      'category_health_fitness', 'category_home', 'category_kids_pets',
      'category_misc_net', 'category_misc_pos', 'category_personal_care',
      'category_shopping_net', 'category_shopping_pos', 'category_travel',
      'gender_M']
   data_ohe = pd.DataFrame(data_ohe, columns=col_names)
   data_ohe = pd.concat([data, data_ohe], axis=1)
   data_all_rows = data_ohe.copy()

   # Eliminar columnas redundantes o con poca información para el modelo
   data_ohe.drop(columns=['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'unix_time', 'trans_num',
                           'merchant', 'city', 'state', 'job', 'job_sector', 'trans_date_trans_time', 
                           'dob', 'lat', 'long', 'merch_lat', 'merch_long', 'category', 'gender', 'fraud_sales'], axis=1, inplace=True)

   # Guardar los datos para el procesado de nuevos dataframes
   if create_data:
      group_fraud_by_merch.to_csv('streamlit_app/group_fraud_by_merch_test.csv')
      group_fraud_by_city.to_csv('streamlit_app/group_fraud_by_city_test.csv')
      group_fraud_by_state.to_csv('streamlit_app/group_fraud_by_state_test.csv')
      job_freq.to_csv('streamlit_app/job_freq.csv', index=True)
      joblib.dump(encoder, './streamlit_app/onehotencoder.pkl')

   return data_ohe, data_all_rows

In [110]:
data_clean, data_all_rows = preprocessing_data(df_combined, create_data=True)

In [None]:
.

In [131]:
top_5_fraud_merch = data_all_rows[['merchant', 'fraud_merch_pct', 'fraud_sales']].sort_values(by='fraud_merch_pct', ascending=False).drop_duplicates().head(10)
top_5_fraud_merch

Unnamed: 0,merchant,fraud_merch_pct,fraud_sales
967050,fraud_Kozey-Boehm,2.175489,324
567035,fraud_Kozey-Boehm,2.175489,278
417190,fraud_Kozey-Boehm,2.175489,273
1770616,fraud_Kozey-Boehm,2.175489,228
72231,fraud_Kozey-Boehm,2.175489,592
485683,fraud_Kozey-Boehm,2.175489,360
1014395,fraud_Kozey-Boehm,2.175489,730
1156727,fraud_Kozey-Boehm,2.175489,334
1542906,fraud_Kozey-Boehm,2.175489,222
348793,fraud_Kozey-Boehm,2.175489,402


In [122]:
top_5_fraud_city = data_all_rows[['city', 'fraud_city_pct', 'fraud_sales']].sort_values(by='fraud_city_pct', ascending=False).drop_duplicates()
top_5_fraud_city

Unnamed: 0,city,fraud_city_pct,fraud_sales
1088912,Las Vegas,100.0,47
701890,Oakton,100.0,273
155723,Coulee Dam,100.0,126
52111,Vacaville,100.0,402
1640154,Guthrie,100.0,200
...,...,...,...
1600717,Clay Center,0.0,360
1364784,Pittsburgh,0.0,572
154786,Lubbock,0.0,592
1625629,Tomahawk,0.0,228


In [123]:
top_5_fraud_city['city'].nunique()

906

In [125]:
top_5_fraud_state = data_all_rows[['state', 'fraud_state_pct', 'fraud_sales']].sort_values(by='fraud_state_pct', ascending=False).drop_duplicates().head()
top_5_fraud_state

Unnamed: 0,state,fraud_state_pct,fraud_sales
236643,DE,100.0,9
1084243,RI,2.013423,15
186302,AK,1.687479,50
1643271,OR,0.745986,197
265159,NH,0.673659,79
121810,VA,0.653798,273
1070650,TN,0.638221,159
216315,NE,0.627451,216
487792,MN,0.616292,280
1834756,NY,0.611293,730


In [127]:
top_5_fraud_state['state'].nunique()

51

In [120]:
# Guardar tops de fraude
top_5_fraud_merch.to_csv('streamlit_app/top_5_fraud_merch.csv', index=False)
top_5_fraud_city.to_csv('streamlit_app/top_5_fraud_city.csv', index=False)
top_5_fraud_state.to_csv('streamlit_app/top_5_fraud_state.csv', index=False)

In [78]:
data_all_rows[['city', 'fraud_state_pct']].sort_values('fraud_state_pct', ascending=False)['city'].unique()

array(['Georgetown', 'Providence', 'Huslia', 'Wales', 'Craig', 'Kaktovik',
       'Scotts Mills', 'Cascade Locks', 'Shedd', 'Westfir',
       'Powell Butte', 'Lowell', 'Athena', 'Stayton', 'Gardiner',
       'Portland', 'Jordan Valley', 'Bay City', 'Newberg', 'Sixes',
       'Eugene', 'Lake Oswego', 'Kent', 'La Grande', 'Conway', 'Jaffrey',
       'North Haverhill', 'Center Tuftonboro', 'Grantham', 'Belmont',
       'Acworth', 'Newton', 'Atlantic', 'Basye', 'Springfield',
       'Edinburg', 'Timberville', 'Burke', 'Catawba', 'Ruckersville',
       'Doe Hill', 'Damascus', 'Falls Church', 'Glade Spring', 'Lebanon',
       'Ford', 'Hopewell', 'Alexandria', 'Hampton', 'Drakes Branch',
       'Arlington', 'Etlan', 'Greenbush', 'Manquin', 'Mineral', 'Norfolk',
       'Oakton', 'Murfreesboro', 'Knoxville', 'Grand Junction', 'Oakland',
       'Bethel Springs', 'Slayden', 'Tiptonville', 'Clarksville',
       'Powell', 'Kingsport', 'Heiskell', 'Old Hickory', 'Apison',
       'Chattanooga', 'Moun

# Dividir los datos en características y objetivo

In [11]:
features_train, features_test, target_train, target_test = train_test_split(data_clean.drop('is_fraud', axis=1), 
                                                                            data_clean['is_fraud'], 
                                                                            test_size = 0.2, 
                                                                            random_state = 42)

# Escalar los datos

In [12]:
# Definir las columnas a escalar
cols_to_scale=['amt', 'zip', 'city_pop', 'fraud_merch_pct', 'fraud_merch_rank', 
                'fraud_city_pct', 'fraud_city_rank', 'fraud_state_pct', 'fraud_state_rank',
                'job_encoded', 'trans_day', 'trans_month', 'trans_year', 'trans_hour', 
                'trans_weekday', 'age', 'distance_to_merch']

# Inicializar el escalador
scaler = StandardScaler()

# Escalar las características de entrenamiento
features_train_scaled = features_train.copy()
features_train_scaled[cols_to_scale] = scaler.fit_transform(features_train_scaled[cols_to_scale])

# Escalar las características de prueba
features_test_scaled = features_test.copy()
features_test_scaled[cols_to_scale] = scaler.transform(features_test_scaled[cols_to_scale])

In [14]:
features_train_scaled.head().T

Unnamed: 0,1273644,601398,999645,1180310,1510522
amt,0.602463,-0.256872,-0.200368,-0.320166,-0.227717
zip,1.5074,1.58186,1.309647,-0.090132,1.036765
city_pop,-0.292407,-0.178167,-0.292898,-0.274686,-0.285519
fraud_merch_pct,1.249838,0.943758,-0.041875,-0.877896,1.515113
fraud_merch_rank,-1.125334,-0.985819,-0.696824,1.505522,-1.229971
fraud_city_pct,-0.171144,-0.229086,-0.02148,-0.055579,-0.138746
fraud_city_rank,1.228817,1.384853,-0.254698,-0.031123,1.047162
fraud_state_pct,0.265315,-0.091685,-0.526741,0.118677,-0.354614
fraud_state_rank,-1.122636,0.379041,1.722647,-0.648422,1.090362
job_encoded,-1.195184,-0.435743,0.840868,-0.687386,-0.430537


In [16]:
# Guardar el escalador para usarlo en el conjunto de prueba
joblib.dump(scaler, 'streamlit_app/scaler.pkl')

['streamlit_app/scaler.pkl']

# Balancear los datos

In [17]:
# Aplicar SMOTE en el conjunto de entrenamiento
smote = SMOTE(sampling_strategy='auto', random_state=42)
features_train_resampled, target_train_resampled = smote.fit_resample(features_train_scaled, target_train)

# Revisar la nueva distribución de clases en el conjunto de entrenamiento balanceado
print("Distribución en y_train_resampled:\n", target_train_resampled.value_counts())

Distribución en y_train_resampled:
 is_fraud
0    1474217
1    1474217
Name: count, dtype: int64


# Regresión logística

In [18]:
.

SyntaxError: invalid syntax (1933637684.py, line 1)

In [11]:
# # Entrenaer el modelo
# model_logistic_reg = LogisticRegression(max_iter=200)
# model_logistic_reg.fit(features_train_resampled.values, target_train_resampled)
# predict_logistic_reg = model_logistic_reg.predict(features_test_scaled.values)

# # Evaluar el modelo
# accuracy = accuracy_score(target_test, predict_logistic_reg)
# report = classification_report(target_test, predict_logistic_reg)
# print(f'Accuracy: {accuracy}')
# print('Classification Report:')
# print(report)

Accuracy: 0.8969955112165602
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95    368526
           1       0.04      0.83      0.08      1953

    accuracy                           0.90    370479
   macro avg       0.52      0.86      0.51    370479
weighted avg       0.99      0.90      0.94    370479

In [None]:
# Definir el modelo
model_catboost = CatBoostClassifier(task_type='GPU', verbose=0, random_state=42)

# Definir el rango de parámetros a buscar
param_grid = {
    'bagging_temperature': [0],
    'boosting_type': ['Plain'],  # Puedes intentar 'Plain' también si es necesario
    'border_count': [60],  # Puedes ajustar según tus datos
    'depth': [8],  # Aumentar si se necesita más capacidad del modelo   16 ES EL MEJOR
    'grow_policy': ['Depthwise'],  # Puedes probar otros si es necesario
    'iterations': [500],  # Ajustar según tus necesidades y tiempo
    'l2_leaf_reg': [7],  # Regularización
    'learning_rate': [0.2],  # Ajustar según el rendimiento del modelo
    'random_strength': [5]  # Ajustar si se necesita más aleatoriedad
}


# Definir la validación cruzada estratificada
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Configurar GridSearchCV
grid_search_catboost = GridSearchCV(estimator=model_catboost, param_grid=param_grid, 
                           cv=stratified_kfold, verbose=2, error_score='raise')

# Ajustar el modelo
grid_search_catboost.fit(features_train_resampled.values, target_train_resampled)

# Obtener el mejor modelo
best_model_catboost = grid_search_catboost.best_estimator_

# Hacer predicciones
predicts_catboost = best_model_catboost.predict(features_test_scaled.values)

# Evaluar el modelo
accuracy = accuracy_score(target_test, predicts_catboost)
report = classification_report(target_test, predicts_catboost)

print(f'Best Parameters: {grid_search_catboost.best_params_}')
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Best Parameters: {'bagging_temperature': 0, 'boosting_type': 'Plain', 'border_count': 60, 'depth': 4, 'grow_policy': 'Depthwise', 'iterations': 500, 'l2_leaf_reg': 7, 'learning_rate': 0.2, 'random_strength': 5}
Accuracy: 0.9987745594217217
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368526
           1       0.90      0.87      0.88      1953

    accuracy                           1.00    370479
   macro avg       0.95      0.93      0.94    370479
weighted avg       1.00      1.00      1.00    370479

Best Parameters: {'bagging_temperature': 0, 'boosting_type': 'Plain', 'border_count': 60, 'depth': 4, 'grow_policy': 'Depthwise', 'iterations': 100, 'l2_leaf_reg': 7, 'learning_rate': 0.2, 'random_strength': 5}
Accuracy: 0.9973790687191447
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368526
           1       0.70      0.87      0.78      1953

    accuracy                           1.00    370479
   macro avg       0.85      0.94      0.89    370479
weighted avg       1.00      1.00      1.00    370479

Best Parameters: {'bagging_temperature': 0, 'boosting_type': 'Plain', 'border_count': 60, 'depth': 6, 'grow_policy': 'Depthwise', 'iterations': 500, 'l2_leaf_reg': 7, 'learning_rate': 0.2, 'random_strength': 5}
Accuracy: 0.9990957652120633
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368526
           1       0.95      0.88      0.91      1953

    accuracy                           1.00    370479
   macro avg       0.97      0.94      0.96    370479
weighted avg       1.00      1.00      1.00    370479

In [17]:
# Guardar el modelo
best_model_catboost.save_model('streamlit_app/catboost_bestmodel.cbm')