In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Cargar las variables de entorno desde el archivo .env
load_dotenv()

# Acceder a la string de conexion desde el archivo .env
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

# Definir el string de la conexión
connection_string = f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"

# Crear el SQLAlchemy engine
engine = create_engine(connection_string)

# Crear y manejar una conexión manual
connection = engine.connect()

# Cargar el dataset en un dataframe de Pandas
train = pd.read_sql("SELECT * FROM train", connection)
test = pd.read_sql("SELECT * FROM test", connection)

# Cerrar la conexión
connection.close()
print("PostgreSQL connection is closed.")

# Separate the features and target
X_train = train.drop(columns=['fraud_bool'])
y_train = train['fraud_bool']
X_test = test.drop(columns=['fraud_bool'])
y_test = test['fraud_bool']

PostgreSQL connection is closed.


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Crear el modelo
rf_model = RandomForestClassifier(random_state=42)

# Entrenar el modelo
rf_model.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred_rf = rf_model.predict(X_test)

# Evaluar el modelo
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    296691
           1       0.10      0.04      0.05      3309

    accuracy                           0.99    300000
   macro avg       0.54      0.52      0.52    300000
weighted avg       0.98      0.99      0.98    300000

Random Forest Confusion Matrix:
 [[295617   1074]
 [  3189    120]]


In [3]:
from xgboost import XGBClassifier

# Crear el modelo
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Entrenar el modelo
xgb_model.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred_xgb = xgb_model.predict(X_test)

# Evaluar el modelo
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99    296691
           1       0.09      0.10      0.09      3309

    accuracy                           0.98    300000
   macro avg       0.54      0.54      0.54    300000
weighted avg       0.98      0.98      0.98    300000

XGBoost Confusion Matrix:
 [[293324   3367]
 [  2978    331]]


In [4]:
from sklearn.linear_model import LogisticRegression

# Crear el modelo
logreg_model = LogisticRegression(random_state=42, max_iter=1000)

# Entrenar el modelo
logreg_model.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred_logreg = logreg_model.predict(X_test)

# Evaluar el modelo
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.70      0.82    296691
           1       0.02      0.63      0.05      3309

    accuracy                           0.70    300000
   macro avg       0.51      0.67      0.43    300000
weighted avg       0.98      0.70      0.82    300000

Logistic Regression Confusion Matrix:
 [[208770  87921]
 [  1208   2101]]
