### Prueba técnica:
Problema 1: AutoML para Score de Crédito Basado en el Paper “AutoML: A Survey of the State-of-the-Art”

### Pasos clave
- Importing Libraries and DataSet
- Data Analysis
- Feature Engineering
- Model building using ML models
- Model building and prediction using Auto ML

### Importing Libraries

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Modelling
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

### Reading the DataSet

In [2]:
# Load the credit dataset from its project-relative CSV file.
credit_csv = "data/credit_data.csv"
df = pd.read_csv(credit_csv)

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
# Count missing values per column.
df.isna().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [5]:
# Summarize the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          30000 non-null  int64
 1   LIMIT_BAL                   30000 non-null  int64
 2   SEX                         30000 non-null  int64
 3   EDUCATION                   30000 non-null  int64
 4   MARRIAGE                    30000 non-null  int64
 5   AGE                         30000 non-null  int64
 6   PAY_0                       30000 non-null  int64
 7   PAY_2                       30000 non-null  int64
 8   PAY_3                       30000 non-null  int64
 9   PAY_4                       30000 non-null  int64
 10  PAY_5                       30000 non-null  int64
 11  PAY_6                       30000 non-null  int64
 12  BILL_AMT1                   30000 non-null  int64
 13  BILL_AMT2                   30000 non-null  int64
 14  BILL_A

In [6]:
# Show the column labels of the loaded frame.
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [7]:
# Number of rows in the dataset.
df.shape[0]

30000

#### Limpieza de datos


In [8]:
# Print the distinct codes found in each categorical column, one block per column.
for nombre_columna in ('EDUCATION', 'MARRIAGE', 'SEX'):
    valores_unicos = df[nombre_columna].unique()
    print(valores_unicos)
    print('-------------------------')

[2 1 3 5 4 6 0]
-------------------------
[1 2 3 0]
-------------------------
[2 1]
-------------------------


### Ingeniería de Variables

##### Variables del historial de pago (PAY_0 a PAY_6):



Estas transformaciones convierten las 6 columnas (PAY_0 a PAY_6) en nuevas características más informativas, como:

Promedio de retrasos (avg_delay).

Máximo retraso (max_delay).

Meses con retrasos graves (months_significant_delay).

Variabilidad de pagos (std_delay).

Estas nuevas características son útiles porque ayudan a los modelos a comprender el comportamiento histórico del cliente de manera más eficiente.

In [9]:
# Repayment-status columns (the dataset has PAY_0 and PAY_2..PAY_6, no PAY_1).
variables = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
historial_pagos = df[variables]

# Row-wise summaries of the six repayment-status values; each becomes a new column.
resumenes = (
    ('promedio_retraso', historial_pagos.mean(axis=1)),
    ('maximo_retraso', historial_pagos.max(axis=1)),
    ('minimo_retraso', historial_pagos.min(axis=1)),
    # Number of the six columns whose value is strictly greater than 0.
    ('cantidad_meses_con_retraso', historial_pagos.gt(0).sum(axis=1)),
)
for nombre_resumen, valores in resumenes:
    df[nombre_resumen] = valores

##### Variables de facturas (BILL_AMT*) y pagos (PAY_AMT*):

Calcular:

Relación entre pago y factura (PAY_AMT* / BILL_AMT*).


In [10]:
# Payment and bill columns, paired position by position (PAY_AMTk with BILL_AMTk).
pagos = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
facturas = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']

# Names of every feature created below, in creation order.
new_columns = []
for mes, (col_pago, col_factura) in enumerate(zip(pagos, facturas), start=1):
    # Descriptive loop names are used instead of `x` / `y`, which other cells
    # use for the feature matrix and the target.
    col_relacion = f'relacion_pago_factura_{mes}'
    col_razon = f'razon_deuda_{mes}'

    # A zero bill would yield +/-inf (payment != 0) or NaN (0 / 0). Masking the
    # zero denominator makes both cases NaN, which fillna() can handle; inf
    # values would survive fillna() and break downstream estimators.
    df[col_relacion] = df[col_pago] / df[col_factura].replace(0, np.nan)

    # Share of the credit limit taken up by the bill.
    df[col_razon] = df[col_factura] / df["LIMIT_BAL"]

    new_columns.extend([col_relacion, col_razon])

In [11]:
# Spot-check two payment/bill ratios next to the columns they were derived from.
# Only the last expression of a cell is rendered automatically, so the first
# frame is shown explicitly with display() instead of being silently discarded.
display(df[["PAY_AMT1", "BILL_AMT1", "relacion_pago_factura_1"]])
df[["PAY_AMT4", "BILL_AMT4", "relacion_pago_factura_4"]]

Unnamed: 0,PAY_AMT4,BILL_AMT4,relacion_pago_factura_4
0,0,0,
1,1000,3272,0.305623
2,1000,14331,0.069779
3,1100,28314,0.038850
4,9000,20940,0.429799
...,...,...,...
29995,3047,88004,0.034623
29996,129,8979,0.014367
29997,4200,20878,0.201169
29998,1926,52774,0.036495


In [12]:
# Replace missing AND infinite values with 0.
# fillna() alone leaves +/-inf untouched, and the payment/bill ratios can be
# infinite when a bill amount is 0, so infinities are first converted to NaN.
# The frame is rebound rather than mutated with inplace=True.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)


In [13]:
# List the names of the ratio features collected in `new_columns`.
print(new_columns)

['relacion_pago_factura_1', 'razon_deuda_1', 'relacion_pago_factura_2', 'razon_deuda_2', 'relacion_pago_factura_3', 'razon_deuda_3', 'relacion_pago_factura_4', 'razon_deuda_4', 'relacion_pago_factura_5', 'razon_deuda_5', 'relacion_pago_factura_6', 'razon_deuda_6']


### Preprocesamiento de datos

#### Standard Scaler

Escalado de características

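Razón: Las variables numéricas (LIMIT_BAL, AGE, BILL_AMT*, PAY_AMT*) tienen rangos muy diferentes, lo que puede afectar el rendimiento de los modelos sensibles a la escala, como Ridge, Lasso, K-Neighbors o SVR. Los modelos basados en árboles (Random Forest, XGBoost) no dependen de la escala de las variables.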

Técnicas adecuadas:

Estandarización (StandardScaler): Escalar las variables para que tengan media 0 y desviación estándar 1.
Normalización (MinMaxScaler): Escalar las variables a un rango [0, 1], útil para métodos basados en distancias como K-Neighbors; XGBoost, al estar basado en árboles, no lo requiere.

In [9]:
# NOTE(review): bare list literal — it is only displayed, never assigned or read
# by any cell visible here. Presumably a reminder of the categorical columns to
# encode; confirm and either assign it to a name or remove the cell.
["MARRIAGE", "EDUCATION"]

#### One Hot Encoding

In [21]:
# Create Column Transformer with 3 types of transformers
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),    
    ]
)

In [22]:
# Fit the column transformer and replace X with its transformed output.
# NOTE(review): X is rebound to the transformed data, so re-running this cell
# feeds already-transformed data back into the transformer — re-create X first.
X = preprocessor.fit_transform(X)

In [None]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

## Model Building using ML Algorithms


### We will be using the following models:
- XGB Regressor
- Support Vector Regressor

## XGB Regressor

### We will use Grid Search CV for finding the best parameters

In [None]:
from xgboost import XGBRegressor

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter candidates: 3 x 4 x 5 = 60 combinations in total.
param_grid = dict(
    n_estimators=[2000, 4000, 6000],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01, 0.1, 0.2, 0.3, 0.5],
)

In [None]:
# Grid search over `param_grid` for a seeded XGBoost regressor, scored by R².
xgb_base = XGBRegressor(random_state=42)
final = GridSearchCV(estimator=xgb_base, param_grid=param_grid, scoring='r2')

In [90]:
# Run the grid search on the training split.
# X1_train / y1_train are not defined anywhere in this notebook; the split
# cell produces X_train / y_train.
final.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination found by the grid search.
final.best_params_

In [None]:
# Hard-coded record of the selected hyper-parameters.
Best_param = dict(learning_rate=0.1, max_depth=6, n_estimators=2000)

In [None]:
# Final XGBoost regressor configured with the selected hyper-parameters.
xgb = XGBRegressor(
    n_estimators=2000,
    max_depth=6,
    learning_rate=0.1,
)

In [None]:
# Train the final model on the training split.
# The split cell names the target `y_train` (lower-case); `Y_train` is undefined.
xgb.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features.
pred= xgb.predict(X_test)

In [None]:
from sklearn import metrics

In [None]:
# Score the XGBoost predictions against the held-out targets.
# scikit-learn metrics take (y_true, y_pred) in that order; R² in particular
# is not symmetric, so swapping the arguments changes the result.
# The split cell names the target `y_test`; `Y_test` is undefined.
mae = metrics.mean_absolute_error(y_test, pred)
print(f'MAE: {mae:.5f}')

# RMSE is computed as sqrt(MSE): the `squared=False` flag is deprecated and
# removed in recent scikit-learn releases.
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
print(f'RMSE: {rmse:.5f}')

# Stored as `r2` so the `r2_score` function used by evaluate_model is not
# overwritten by a float.
r2 = metrics.r2_score(y_test, pred)
print(f'r2 score: {r2:.5f}')

# --------------------------------------------------------

In [48]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted values, aligned with ``true``.

    Returns:
        A 4-tuple ``(mae, mse, rmse, r2)``.
    """
    squared_error = mean_squared_error(true, predicted)
    resultados = (
        mean_absolute_error(true, predicted),
        squared_error,
        np.sqrt(squared_error),
        r2_score(true, predicted),
    )
    return resultados

In [None]:
# Candidate regressors, all with library-default hyper-parameters.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
# Parallel lists: model name and its test-set R², in training order.
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset.
    # evaluate_model returns (mae, mse, rmse, r2); unpacking only three names
    # raises ValueError, so the unused MSE is bound to `_`.
    model_train_mae, _, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, _, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

In [None]:
import pandas as pd

# Mapping of display name -> estimator instance.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

# Short labels for each model, used on plot axes.
model_abbreviations = {
    "Linear Regression": "LR",
    "Lasso": "Lasso",
    "Ridge": "Ridge",
    "K-Neighbors Regressor": "KNR",
    "Decision Tree": "DT",
    "Random Forest Regressor": "RFR",
    "XGBRegressor": "XGB",
    "CatBoosting Regressor": "CBR",
    "AdaBoost Regressor": "ABR"
}


# One record per (model, split, metric); turned into a DataFrame in one go.
model_data = []

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    new_name = model_abbreviations[model_name]
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluate_model returns (mae, mse, rmse, r2); unpacking only three names
    # raises ValueError, so the unused MSE is bound to `_`.
    model_train_mae, _, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, _, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    # Building the records from one nested mapping keeps each split label next
    # to its own metrics (the test R² row used to be labelled 'Train').
    resultados = {
        'Train': {'MAE': model_train_mae, 'RMSE': model_train_rmse, 'R2': model_train_r2},
        'Test': {'MAE': model_test_mae, 'RMSE': model_test_rmse, 'R2': model_test_r2},
    }
    for conjunto, metricas in resultados.items():
        for metrica, valor in metricas.items():
            model_data.append({
                'Modelo': model_name,
                'Abrev': new_name,
                'Conjunto': conjunto,
                'Métrica': metrica,
                'Rendimiento': valor,
            })

# Build the long-format results table. Note that `models` is rebound here from
# the dict of estimators to this DataFrame; later cells read it as a DataFrame.
models = pd.DataFrame(model_data).sort_values(["Modelo", "Métrica"])


In [None]:
# Display the long-format results table (`models` was rebound from the dict of
# estimators to this DataFrame in the previous cell).
models

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the MAE rows. (Sorting by 'Métrica' afterwards had no effect, since
# every remaining row holds the same value in that column.)
mae_df = models[models['Métrica'] == 'MAE']

# One line per split (hue='Conjunto'), one point per model abbreviation.
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(x='Abrev', y='Rendimiento', hue='Conjunto', marker='o', data=mae_df, palette='viridis', ax=ax)

# Title and axis labels so the figure can be read on its own.
ax.set_title('Comparación del MAE entre conjuntos de entrenamiento y prueba')
ax.set_xlabel('Modelo')
ax.set_ylabel('MAE')
plt.show()


In [None]:
# Pair each model name with its test R² and rank them from best to worst.
pares_modelo_r2 = list(zip(model_list, r2_list))
ranking_r2 = pd.DataFrame(pares_modelo_r2, columns=['Model Name', 'R2_test'])
ranking_r2.sort_values(by='R2_test', ascending=False)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Stand-alone linear-regression example on synthetic data (y = 4 + 3x + noise).
# Every variable carries a `_demo` suffix so this cell does not overwrite the
# X / y / X_train / X_test / y_train / y_test / model names that the
# credit-data cells use.
np.random.seed(0)
X_demo = 2 * np.random.rand(100, 1)
y_demo = 4 + 3 * X_demo + np.random.randn(100, 1)
X_train_demo, X_test_demo, y_train_demo, y_test_demo = train_test_split(
    X_demo, y_demo, test_size=0.2, random_state=0
)

model_demo = LinearRegression()
model_demo.fit(X_train_demo, y_train_demo)

y_pred_demo = model_demo.predict(X_test_demo)

mse_demo = mean_squared_error(y_test_demo, y_pred_demo)
print(f"Mean Squared Error: {mse_demo}")

# The target is 2-D (100, 1), so intercept_ is 1-D and coef_ is 2-D.
print(f"Intercepto: {model_demo.intercept_[0]}")
print(f"Coeficiente: {model_demo.coef_[0][0]}")

plt.scatter(X_test_demo, y_test_demo, color='blue', label='Datos Reales')
plt.plot(X_test_demo, y_pred_demo, color='red', linewidth=2, label='Línea de Regresión')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Regresión Lineal')
plt.legend()
plt.show()

In [None]:
# NOTE(review): absolute, machine-specific path (it embeds a local user name)
# that points into a different project directory and is not read by any other
# cell visible here — consider removing it or using a project-relative path.
path = "file:/c:/Users/AngelClavellina/Documents/MLOps/TrafficViolationsCDMX/artifacts/train.csv"
