In [90]:
# Loading Libs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [92]:
# loading Datas
df_train = pd.read_csv('train_data.csv')
df_valid = pd.read_csv('valid_data.csv')

In [94]:
y = df_train['target']
x = df_train.drop(columns=[df_train.columns[0],'target'])
x.fillna(0, inplace=True)

In [96]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [98]:
def model_evaluation(model, x_test, y_test, model_name):
    y_pred = model.predict(x_test)
    y_pred = [1 if pred >= 0.5 else 0 for pred in y_pred]
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    auc_roc = metrics.roc_auc_score(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    
    return pd.DataFrame([precision, recall, f1, auc_roc, accuracy], index=['Precision', 'Recall', 'F1' ,'AUC_ROC', 'Accuracy'], columns=[model_name])

In [100]:
#Decision Tree Regressor Model with gridsearch
tree_regressor = DecisionTreeRegressor(max_depth=15, min_samples_split=5, min_samples_leaf=2).fit(x_train, y_train)
print(model_evaluation(tree_regressor, x_train, y_train, 'Decision Tree Regressor Train'))
print(model_evaluation(tree_regressor, x_test, y_test, 'Decision Tree Regressor Test'))

           Decision Tree Regressor Train
Precision                       0.855857
Recall                          0.707048
F1                              0.774369
AUC_ROC                         0.839895
Accuracy                        0.923252
           Decision Tree Regressor Test
Precision                      0.540832
Recall                         0.443602
F1                             0.487415
AUC_ROC                        0.677812
Accuracy                       0.823322


In [101]:
#Decision Tree Regressor Model with gridsearch
param_grid = {
    'max_depth': [12, 15, 20],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [2, 5, 10]
}

# Use DecisionTreeClassifier instead of DecisionTreeRegressor
tree_classifier = DecisionTreeClassifier(
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2
)

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=tree_classifier, param_grid=param_grid,
                           scoring='f1', cv=5, verbose=1, return_train_score=True)

# Fit to the training data
grid_search.fit(x_train, y_train)

# Get the best model with optimal hyperparameters
best_tree_classifier = grid_search.best_estimator_

# Evaluate the model on train and test sets
print(model_evaluation(best_tree_classifier, x_train, y_train, 'Decision Tree Classifier Train'))
print(model_evaluation(best_tree_classifier, x_test, y_test, 'Decision Tree Classifier Test'))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
           Decision Tree Classifier Train
Precision                        0.850852
Recall                           0.776707
F1                               0.812091
AUC_ROC                          0.872771
Accuracy                         0.933048
           Decision Tree Classifier Test
Precision                       0.557092
Recall                          0.496367
F1                              0.524979
AUC_ROC                         0.702091
Accuracy                        0.829903


In [102]:
#Decision Tree Classifier Model
tree_classifier = DecisionTreeClassifier().fit(x_train, y_train)
print(model_evaluation(tree_classifier, x_train, y_train, 'Decision Tree Classifier Train'))
print(model_evaluation(tree_classifier, x_test, y_test, 'Decision Tree Classifier Test'))

           Decision Tree Classifier Train
Precision                        0.998335
Recall                           0.990639
F1                               0.994472
AUC_ROC                          0.995130
Accuracy                         0.997949
           Decision Tree Classifier Test
Precision                       0.561221
Recall                          0.580727
F1                              0.570807
AUC_ROC                         0.737334
Accuracy                        0.834630


In [103]:
#Decision Tree Classifier Model with gridsearch
param_grid = {
    'max_depth': [12, 15, 20], 
    'min_samples_split': [10, 20], 
    'min_samples_leaf': [2, 5, 10]
}

tree_classifier = DecisionTreeClassifier(
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',  # or use a specific dictionary like {0:1, 1:2} for class imbalance
    random_state=42
)

grid_search = GridSearchCV(estimator=tree_classifier, param_grid=param_grid, 
                           scoring='f1', cv=5, verbose=1, return_train_score=True)

# Fit to the training data
grid_search.fit(x_train, y_train)

# Get the best model with optimal hyperparameters
best_tree_classifier = grid_search.best_estimator_

# Step 2: Evaluate the tuned model on the test set
test_eval = model_evaluation(best_tree_classifier, x_test, y_test, 'Tuned Decision Tree Classifier Test')

# Evaluate the tuned model on the training set (for comparison)
train_eval = model_evaluation(best_tree_classifier, x_train, y_train, 'Tuned Decision Tree Classifier Train')

# Display results
print(test_eval)
print(train_eval)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
           Tuned Decision Tree Classifier Test
Precision                             0.466900
Recall                                0.675197
F1                                    0.552054
AUC_ROC                               0.747555
Accuracy                              0.792509
           Tuned Decision Tree Classifier Train
Precision                              0.688145
Recall                                 0.990914
F1                                     0.812232
AUC_ROC                                0.944061
Accuracy                               0.914662


In [105]:
#Random Forest Classifier Model
model1 = RandomForestClassifier().fit(x_train, y_train)
print(model_evaluation(model1, x_train, y_train, 'Random Forest Classifier Train'))
print(model_evaluation(model1, x_test, y_test, 'Random Forest Classifier Test'))

           Random Forest Classifier Train
Precision                        0.995995
Recall                           0.992841
F1                               0.994416
AUC_ROC                          0.995964
Accuracy                         0.997923
           Random Forest Classifier Test
Precision                       0.898030
Recall                          0.489731
F1                              0.633817
AUC_ROC                         0.738371
Accuracy                        0.892844


In [106]:
#Gradient Boosting Classifier Model
model2 = GradientBoostingClassifier(max_depth=15, min_samples_split=5, min_samples_leaf=2).fit(x_train, y_train)
print(model_evaluation(model2, x_train, y_train, 'Gradient Boosting Classifier Train'))
print(model_evaluation(model2, x_test, y_test, 'Gradient Boosting Classifier Test'))

           Gradient Boosting Classifier Train
Precision                            0.996270
Recall                               0.992704
F1                                   0.994484
AUC_ROC                              0.995926
Accuracy                             0.997949
           Gradient Boosting Classifier Test
Precision                           0.864012
Recall                              0.527962
F1                                  0.655423
AUC_ROC                             0.754276
Accuracy                            0.894879


In [107]:
#Random Forest Regressor Model
Random_Forest_Regressor = RandomForestRegressor().fit(x_train, y_train)
print(model_evaluation(Random_Forest_Regressor, x_train, y_train, 'Random Forest Regressor Train'))
print(model_evaluation(Random_Forest_Regressor, x_test, y_test, 'Random Forest Regressor Test'))

           Random Forest Regressor Train
Precision                       0.995995
Recall                          0.992841
F1                              0.994416
AUC_ROC                         0.995964
Accuracy                        0.997923
           Random Forest Regressor Test
Precision                      0.864371
Recall                         0.523539
F1                             0.652105
AUC_ROC                        0.752175
Accuracy                       0.894220


In [108]:
#Gradient Boosting Regressor Model
Gradient_Boosting_Regressor = GradientBoostingRegressor(max_depth=15, min_samples_split=5, min_samples_leaf=2).fit(x_train, y_train)
print(model_evaluation(Gradient_Boosting_Regressor, x_train, y_train, 'Gradient Boosting Regressor Train'))
print(model_evaluation(Gradient_Boosting_Regressor, x_test, y_test, 'Gradient Boosting Regressor Test'))

           Gradient Boosting Regressor Train
Precision                           0.996680
Recall                              0.992015
F1                                  0.994342
AUC_ROC                             0.995630
Accuracy                            0.997897
           Gradient Boosting Regressor Test
Precision                          0.802548
Recall                             0.557346
F1                                 0.657841
AUC_ROC                            0.762657
Accuracy                           0.890212


In [109]:
#Logistic Regression with gridsearch
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Define the parameter grid for Logistic Regression
param_grid_lr = {
    'penalty': ['l2'],  # L2 regularization (ridge)
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength
    'solver': ['saga'],  # saga is better for larger datasets, supports L2 regularization
    'max_iter': [1000]  # Increase the max iterations for convergence
}

# Initialize and run GridSearchCV with Logistic Regression
logistic_reg = GridSearchCV(
    estimator=LogisticRegression(random_state=1234, n_jobs=-1),
    param_grid=param_grid_lr,
    verbose=1,
    cv=10,
    n_jobs=-1
).fit(x_train_scaled, y_train)

# Evaluate the performance on both the train and test datasets
print(model_evaluation(logistic_reg, x_train_scaled, y_train, 'Logistic Regression Train'))
print(model_evaluation(logistic_reg, x_test_scaled, y_test, 'Logistic Regression Test'))

Fitting 10 folds for each of 5 candidates, totalling 50 fits
           Logistic Regression Train
Precision                   0.567500
Recall                      0.125000
F1                          0.204874
AUC_ROC                     0.551597
Accuracy                    0.819273
           Logistic Regression Test
Precision                  0.582712
Recall                     0.123539
F1                         0.203858
AUC_ROC                    0.551436
Accuracy                   0.817279


In [110]:
#Linear Regression Model
linear_reg = LinearRegression().fit(x_train_scaled, y_train)
print(model_evaluation(linear_reg, x_train_scaled, y_train, 'Linear Regression Train'))
print(model_evaluation(linear_reg, x_test_scaled, y_test, 'Linear Regression Test'))

           Linear Regression Train
Precision                 0.575221
Recall                    0.062638
F1                        0.112973
AUC_ROC                   0.526025
Accuracy                  0.816785
           Linear Regression Test
Precision                0.611801
Recall                   0.062243
F1                       0.112991
AUC_ROC                  0.526509
Accuracy                 0.814946


In [111]:
# Let's use Polynomial Features to make our result better
poly = PolynomialFeatures(degree=2)

# Create a pipeline that first creates polynomial features then applies Linear Regression
poly_linear_reg = make_pipeline(poly, LinearRegression()).fit(x_train_scaled, y_train)
print(model_evaluation(poly_linear_reg, x_train_scaled, y_train, 'Linear Regression Train'))
print(model_evaluation(poly_linear_reg, x_test_scaled, y_test, 'Linear Regression Test'))

           Linear Regression Train
Precision                 0.663246
Recall                    0.195760
F1                        0.302296
AUC_ROC                   0.586504
Accuracy                  0.831684
           Linear Regression Test
Precision                0.657503
Recall                   0.181359
F1                       0.284299
AUC_ROC                  0.579645
Accuracy                 0.827091


In [112]:
#One of the best model that showed most accuracy is Random Forest Regressor 
y1 = df_valid['target']
x1 = df_valid.drop(columns=[df_valid.columns[0],'target'])
x1.fillna(0, inplace=True) 

In [113]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.3, random_state=42)

In [114]:
def model_evaluation(model1, x1_test, y1_test, model_name1):
    y1_pred = model1.predict(x1_test)
    y1_pred = [1 if pred >= 0.5 else 0 for pred in y1_pred]
    precision = metrics.precision_score(y1_test, y1_pred)
    recall = metrics.recall_score(y1_test, y1_pred)
    f1 = metrics.f1_score(y1_test, y1_pred)
    auc_roc = metrics.roc_auc_score(y1_test, y1_pred)
    accuracy = metrics.accuracy_score(y1_test, y1_pred)
    
    return pd.DataFrame([precision, recall, f1, auc_roc, accuracy], index=['Precision', 'Recall', 'F1' ,'AUC_ROC', 'Accuracy'], columns=[model_name1])

In [136]:
Random_Forest_Regressor_Valid = RandomForestRegressor().fit(x1_train, y1_train)
print(model_evaluation(Random_Forest_Regressor_Valid, x1_train, y1_train, 'Random Forest Regressor Valid Train'))
print(model_evaluation(Random_Forest_Regressor_Valid, x1_test, y1_test, 'Random Forest Regressor Valid Test'))

           Random Forest Regressor Valid Train
Precision                             0.993697
Recall                                0.990576
F1                                    0.992134
AUC_ROC                               0.994935
Accuracy                              0.998415
           Random Forest Regressor Valid Test
Precision                            0.848214
Recall                               0.479798
F1                                   0.612903
AUC_ROC                              0.735254
Accuracy                             0.940828


In [138]:
# Second model is Gradient Boosting Regressor
Gradient_Boosting_Regressor = GradientBoostingRegressor(max_depth=15, min_samples_split=5, min_samples_leaf=2).fit(x1_train, y1_train)
print(model_evaluation(Gradient_Boosting_Regressor, x1_train, y1_train, 'Gradient Boosting Regressor Train'))
print(model_evaluation(Gradient_Boosting_Regressor, x1_test, y1_test, 'Gradient Boosting Regressor Test'))

           Gradient Boosting Regressor Train
Precision                           0.993697
Recall                              0.990576
F1                                  0.992134
AUC_ROC                             0.994935
Accuracy                            0.998415
           Gradient Boosting Regressor Test
Precision                          0.693333
Recall                             0.525253
F1                                 0.597701
AUC_ROC                            0.750058
Accuracy                           0.930966


In [139]:
# Third is Random Forest Classifier
Random_Forest_Classifier = RandomForestClassifier().fit(x1_train, y1_train)
print(model_evaluation(Random_Forest_Classifier, x1_train, y1_train, 'Random Forest Classifier Train'))
print(model_evaluation(Random_Forest_Classifier, x1_test, y1_test, 'Random Forest Classifier Test'))

           Random Forest Classifier Train
Precision                        0.992662
Recall                           0.991623
F1                               0.992142
AUC_ROC                          0.995400
Accuracy                         0.998415
           Random Forest Classifier Test
Precision                       0.888889
Recall                          0.464646
F1                              0.610282
AUC_ROC                         0.729181
Accuracy                        0.942061


In [140]:
# Fourth is Gradient Boosting Classifier
Gradient_Boosting_Classifier = GradientBoostingClassifier(max_depth=15, min_samples_split=5, min_samples_leaf=2).fit(x1_train, y1_train)
print(model_evaluation(Gradient_Boosting_Classifier, x1_train, y1_train, 'Gradient Boosting Classifier Train'))
print(model_evaluation(Gradient_Boosting_Classifier, x1_test, y1_test, 'Gradient Boosting Classifier Test'))

           Gradient Boosting Classifier Train
Precision                            0.993697
Recall                               0.990576
F1                                   0.992134
AUC_ROC                              0.994935
Accuracy                             0.998415
           Gradient Boosting Classifier Test
Precision                           0.843478
Recall                              0.489899
F1                                  0.619808
AUC_ROC                             0.740031
Accuracy                            0.941321
