In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pickle

In [2]:
# Expand the 'Timestamp' column into numeric date-part features.
def transformar_fecha(df):
    """Replace the 'Timestamp' column with Year/Month/Day/Hour/Minute columns.

    The input frame is left untouched: the result is built on a new
    DataFrame, so the function can be applied repeatedly to the same data
    (as a pipeline does on fit and on every predict call) without side
    effects on the caller's frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Timestamp' column. Values that cannot be parsed
        as dates become NaT, which yields NaN in every derived column.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Timestamp' and with the columns 'Year',
        'Month', 'Day', 'Hour' and 'Minute' set from the parsed timestamps.

    Raises
    ------
    KeyError
        If `df` has no 'Timestamp' column.
    """
    # errors='coerce' turns unparseable entries into NaT instead of raising.
    timestamps = pd.to_datetime(df['Timestamp'], errors='coerce')
    # drop() returns a new frame and assign() returns another one, so the
    # caller's DataFrame is never written to.
    return df.drop(columns=['Timestamp']).assign(
        Year=timestamps.dt.year,
        Month=timestamps.dt.month,
        Day=timestamps.dt.day,
        Hour=timestamps.dt.hour,
        Minute=timestamps.dt.minute,
    )


In [3]:
# Columns routed to the one-hot encoder by the preprocessor.
categorical_features = [
    'Account2',
    'Receiving Currency',
    'Payment Currency',
    'Payment Format',
    'Account4',
]

# Date-part columns produced by transformar_fecha from 'Timestamp';
# these are routed to the numeric imputer.
numeric_features = [
    'Year',
    'Month',
    'Day',
    'Hour',
    'Minute',
]

In [4]:
# Column-wise preprocessing:
#   * 'cat' one-hot encodes the categorical columns; handle_unknown='ignore'
#     keeps transform() from failing on categories not seen during fit.
#   * 'num' fills missing date parts (e.g. from unparseable timestamps)
#     with the column mean.
# No `remainder` is passed, so ColumnTransformer's default applies to any
# column that is not listed in either feature list.
cat_encoder = OneHotEncoder(handle_unknown='ignore')
num_imputer = SimpleImputer(strategy='mean')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_encoder, categorical_features),
        ('num', num_imputer, numeric_features),
    ]
)

In [5]:
# End-to-end pipeline: date expansion -> column preprocessing -> classifier.
date_expander = FunctionTransformer(transformar_fecha)  # custom timestamp-to-date-parts step
forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)

pipeline = Pipeline(
    steps=[
        ('fecha_transform', date_expander),
        ('preprocessor', preprocessor),
        ('classifier', forest),
    ]
)

In [6]:
# Load the exported transactions table.
# NOTE(review): the path is absolute under /content/drive, so this presumably
# expects Google Drive to be mounted (Colab) before the cell runs - confirm.
df = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/Thesis/export.csv')

In [7]:
# Under-sample the negative class to at most 1.5x the number of positives,
# then shuffle the combined rows.
df_positivos = df[df['Is Laundering'] == 1]
df_negativos = df[df['Is Laundering'] == 0]
n_positivos = len(df_positivos)
# Cap the request at the number of negatives actually available:
# DataFrame.sample without replacement raises ValueError when n is larger
# than the population.
n_negativos = min(int(1.5 * n_positivos), len(df_negativos))
df_negativos_reducidos = df_negativos.sample(n=n_negativos, random_state=42)
# frac=1 returns every row in random order; the index is rebuilt so it no
# longer carries the row labels of the original frame.
df_balanceado = (
    pd.concat([df_positivos, df_negativos_reducidos])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [8]:
# Separate the features from the target label.
X = df_balanceado.drop(columns='Is Laundering')
y = df_balanceado['Is Laundering']

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio
# the same in the training and test splits, so the evaluation is not skewed
# by an unlucky random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit every pipeline step on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split and print the confusion matrix followed by
# the per-class precision/recall report.
# NOTE(review): X_test is drawn from the under-sampled frame, so these
# metrics describe that resampled class ratio, not the raw data's.
y_pred = pipeline.predict(X_test)
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))

# Persist the whole fitted pipeline (preprocessing + classifier) as a pickle file.
pipeline_path = '/content/drive/My Drive/Thesis/random_forest_pipeline.pkl'
with open(pipeline_path, mode='wb') as pkl_out:
    pickle.dump(pipeline, pkl_out)
