# {{ cookiecutter.project_name }}

### **Project description:**

# 1. Environment Preparation

In [None]:
# Importing necessary packages. Import Regressors or Classifiers

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import janitor
import missingno
import pyreadr
import upsetplot
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error # Regression models
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, log_loss, accuracy_score # Classifier Models
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.svm import SVR, SVC

### Personalized functions

In [None]:
%load_ext autoreload
%autoreload

In [None]:
# Star-import everything exported by the project's `utils.paths` module.
# NOTE(review): the imported names are not visible from this notebook, so any
# helper used later without an explicit import most likely comes from here.
from {{ cookiecutter.project_module_name }}.utils.paths import *
# Run the project's `pandas-missing-extension` notebook from this session.
# Presumably it adds missing-value helpers for pandas — TODO confirm.
%run ../{{ cookiecutter.project_module_name }}/utils/pandas-missing-extension.ipynb

### Personalizing plots

In [None]:
%matplotlib inline

sns.set_theme(
    style="whitegrid",
    rc = {"figure.figsize": (10, 10)},
    context='notebook'
    )

# 2. Loading data

# 3. Exploratory Data Analysis

### 3.1 Dataset info

In [None]:
# Print the column / dtype / non-null summary of the dataset.
# NOTE(review): `df` is not defined in any visible cell; presumably it is
# created in section 2 (Loading data) — confirm before running.
df.info()

### 3.2 Dataset statistical measures

In [None]:
# Show descriptive statistics for `df` as the cell output.
# NOTE(review): relies on `df` existing from the data-loading step, which is
# not shown in this notebook.
df.describe()

# 4. Missing values treatment

# 5. Feature Engineering

# 6. Data split

### 6.1 Predictor and objective variables split

In [None]:
# Separate the predictors (every column except 'target') from the objective
# variable. `data_processed` must come from the earlier preparation sections.
y = data_processed['target']
X = data_processed.drop(columns='target')

### 6.2 Training and test data split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 7. Scaling variables

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 8. Initial Model Evaluation

### 8.1 Model definition

In [None]:
# Shared seed so every stochastic estimator below is reproducible.
_SEED = 42

# Candidate regressors, keyed by the label printed in the evaluation report.
# Keep only the dictionary that matches the problem type.
models_regression = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=_SEED),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=_SEED),
    'XGBoost Regressor': XGBRegressor(random_state=_SEED),
    'LightGBM Regressor': LGBMRegressor(random_state=_SEED),
    'SVR': SVR(),
}

# Candidate classifiers, keyed the same way.
models_classification = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest Classifier': RandomForestClassifier(random_state=_SEED),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=_SEED),
    'XGBoost Classifier': XGBClassifier(random_state=_SEED),
    'LightGBM Classifier': LGBMClassifier(random_state=_SEED),
    'SVC': SVC(),
}

### 8.2 Training and evaluation of Regression Models

In [None]:
# Fit every candidate regressor on the scaled training data and print its
# error metrics on the held-out split.
for name, model in models_regression.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # MAPE divides by the true values, so it is not finite if y_test has zeros.
    relative_error = (y_test - y_pred) / y_test
    mape = np.mean(np.abs(relative_error)) * 100

    print(
        f'{name} Evaluation:',
        f'MSE: {mse}',
        f'MAE: {mae}',
        f'RMSE: {rmse}',
        f'R^2: {r2}',
        f'MAPE: {mape}\n',
        sep='\n',
    )

### 8.3 Training and evaluation of Classifier Models

In [None]:
# Fit every candidate classifier on the scaled training data and print its
# metrics on the held-out split.
for name, model in models_classification.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Probability of the second class column, when the model can provide it.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_prob = None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Probability-based metrics are reported as 'N/A' without probabilities.
    if y_prob is None:
        auc = 'N/A'
        log_loss_value = 'N/A'
    else:
        auc = roc_auc_score(y_test, y_prob)
        log_loss_value = log_loss(y_test, y_prob)

    print(
        f'{name} Evaluation:',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'F1-Score: {f1}',
        f'AUC-ROC: {auc}',
        f'Log-Loss: {log_loss_value}\n',
        sep='\n',
    )

# 9. Hyperparams optimization for the best model

### 9.1 Optimizing Regressor

In [None]:
# Exhaustive 5-fold search over the random-forest regressor's hyperparameters,
# ranked by negative mean squared error.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# Keep the best-scoring estimator of the search for the following sections.
best_regressor = grid_search.best_estimator_

### 9.2 Optimizing Classifier

In [None]:
# Exhaustive 5-fold search over the random-forest classifier's hyperparameters,
# ranked by accuracy.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# Keep the best-scoring estimator of the search for the following sections.
best_classifier = grid_search.best_estimator_

# 10. Feature selection

### 10.1 Feature Importance based method

In [None]:
# Read `feature_importances_` from whichever tuned model exists in the
# namespace; the regressor takes precedence when both are defined.
if 'best_regressor' in locals():
    feature_importances = best_regressor.feature_importances_
elif 'best_classifier' in locals():
    feature_importances = best_classifier.feature_importances_
else:
    # Fail here with an actionable message instead of an opaque NameError on
    # `feature_importances` a few lines below.
    raise NameError(
        "Neither 'best_regressor' nor 'best_classifier' is defined; "
        "run one of the section 9 optimization cells first."
    )

# Pair each training column with its importance, most important first.
# A chained sort (no inplace=True) keeps this cell idempotent on re-run.
feature_names = X_train.columns
feature_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)

### 10.2 Importance Threshold based selection

In [None]:
threshold = 0.01  # Importance threshold

# Keep only the features whose importance is strictly above the threshold.
important_mask = feature_df['importance'] > threshold
selected_features = feature_df.loc[important_mask, 'feature'].tolist()
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Rescaling selected data
# NOTE: this refits the shared `scaler` on the reduced column set, so after
# this cell it no longer matches the full feature matrix.
X_train_selected_scaled = scaler.fit_transform(X_train_selected)
X_test_selected_scaled = scaler.transform(X_test_selected)

# 11. Retraining and evaluation of the model with selected features

In [None]:
# Refit the tuned model on the selected features and report hold-out metrics.
# Only one branch runs: the regressor takes precedence when both tuned models
# exist in the namespace.
if 'best_regressor' in locals():
    best_regressor.fit(X_train_selected_scaled, y_train)
    y_pred = best_regressor.predict(X_test_selected_scaled)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    # MAPE divides by the true values, so it is not finite if y_test has zeros.
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    # Label the report with the model evaluated here, not with the `name`
    # variable left over from the model-comparison loops.
    model_label = type(best_regressor).__name__
    print(f'{model_label} (selected features) Evaluation:')
    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')
    print(f'R^2: {r2}')
    print(f'MAPE: {mape}\n')

elif 'best_classifier' in locals():
    best_classifier.fit(X_train_selected_scaled, y_train)
    y_pred = best_classifier.predict(X_test_selected_scaled)
    # Recompute probabilities from this refitted model; reusing `y_prob` from
    # the comparison loop would score a different model's predictions.
    y_prob = (
        best_classifier.predict_proba(X_test_selected_scaled)[:, 1]
        if hasattr(best_classifier, "predict_proba") else None
    )

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else 'N/A'
    log_loss_value = log_loss(y_test, y_prob) if y_prob is not None else 'N/A'

    # Label the report with the model evaluated here, not with the `name`
    # variable left over from the model-comparison loops.
    model_label = type(best_classifier).__name__
    print(f'{model_label} (selected features) Evaluation:')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')
    print(f'AUC-ROC: {auc}')
    print(f'Log-Loss: {log_loss_value}\n')

# 12. Feature Engineering iteration

### 12.1 Review and create new features

### 12.2 Divide and preprocess again

In [None]:
# Rebuild predictors / target from the (re-engineered) `data_processed` frame
# and repeat the same seeded 80/20 split used earlier in the notebook.
y = data_processed['target']
X = data_processed.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 12.3 Rescaling again

In [None]:
# Refit the existing `scaler` on the new training split, then transform both
# splits with the freshly learned statistics.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 12.4 Retraining and evaluating model with new features

In [None]:
# Refit the tuned model on the re-engineered, rescaled features and report
# hold-out metrics. Only one branch runs: the regressor takes precedence when
# both tuned models exist in the namespace.
if 'best_regressor' in locals():
    best_regressor.fit(X_train_scaled, y_train)
    y_pred = best_regressor.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    # MAPE divides by the true values, so it is not finite if y_test has zeros.
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    # Label the report with the model evaluated here, not with the `name`
    # variable left over from the model-comparison loops.
    model_label = type(best_regressor).__name__
    print(f'{model_label} (new features) Evaluation:')
    print(f'MSE: {mse}')
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')
    print(f'R^2: {r2}')
    print(f'MAPE: {mape}\n')

elif 'best_classifier' in locals():
    best_classifier.fit(X_train_scaled, y_train)
    y_pred = best_classifier.predict(X_test_scaled)
    # Recompute probabilities from this refitted model; reusing `y_prob` from
    # an earlier cell would score a different model's predictions.
    y_prob = (
        best_classifier.predict_proba(X_test_scaled)[:, 1]
        if hasattr(best_classifier, "predict_proba") else None
    )

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else 'N/A'
    log_loss_value = log_loss(y_test, y_prob) if y_prob is not None else 'N/A'

    # Label the report with the model evaluated here, not with the `name`
    # variable left over from the model-comparison loops.
    model_label = type(best_classifier).__name__
    print(f'{model_label} (new features) Evaluation:')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')
    print(f'AUC-ROC: {auc}')
    print(f'Log-Loss: {log_loss_value}\n')