In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import optuna
import shap
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
import joblib

In [3]:
# Read the four CSV files: training features, training targets, test features
# and test targets. The paths are consumed in order, so each one is bound to
# the name in the same position on the left-hand side.
X_train, Y_train, X_test, Y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"D:\Train_60\Train_60\Train_60\X_Train_Data_Input.csv",
        r"D:\Train_60\Train_60\Train_60\Y_Train_Data_Target.csv",
        r"D:\Test_20\Test_20\Test_20\X_Test_Data_Input.csv",
        r"D:\Test_20\Test_20\Test_20\Y_Test_Data_Target.csv",
    )
)


In [4]:
# Strip non-feature columns so only model inputs / labels remain.
# - 'ID' is removed from every frame.
# - 'target' is removed from both feature frames; errors='ignore' turns that
#   into a no-op for a frame that does not carry the column, so train and
#   test features are treated identically.
# errors='ignore' also keeps this cell safe to re-run: the frames are rebound
# under the same names, and a second run would otherwise raise KeyError on
# the columns that are already gone.
non_feature_columns = ['ID', 'target']
X_train = X_train.drop(columns=non_feature_columns, errors='ignore')
X_test = X_test.drop(columns=non_feature_columns, errors='ignore')
Y_train = Y_train.drop(columns=['ID'], errors='ignore')
Y_test = Y_test.drop(columns=['ID'], errors='ignore')


In [5]:
# Median imputation: the fill values are learned from the training features
# only, then applied to both the training and the test features.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train)
X_train_imputed, X_test_imputed = imputer.transform(X_train), imputer.transform(X_test)

In [6]:
# The imputer output is wrapped back into DataFrames here, re-attaching the
# column labels of the frame each result was computed from.
X_train_imputed, X_test_imputed = (
    pd.DataFrame(imputed_values, columns=source_frame.columns)
    for imputed_values, source_frame in (
        (X_train_imputed, X_train),
        (X_test_imputed, X_test),
    )
)

In [7]:
# Standardize the features: the scaler is fitted on the imputed training
# features and the same fitted scaler is then applied to train and test.
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [8]:
# The scaler output is wrapped back into DataFrames here, re-attaching the
# column labels of the original feature frames.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaled_values, columns=source_frame.columns)
    for scaled_values, source_frame in (
        (X_train_scaled, X_train),
        (X_test_scaled, X_test),
    )
)

In [9]:
# Variance inflation factor (VIF) of every scaled training feature,
# collected into a two-column table (feature name, VIF) and printed.
scaled_matrix = X_train_scaled.values
vif_data = pd.DataFrame({
    "feature": X_train_scaled.columns,
    "VIF": [
        variance_inflation_factor(scaled_matrix, col_idx)
        for col_idx in range(scaled_matrix.shape[1])
    ],
})
print(vif_data)

     feature       VIF
0    Column0  1.054533
1    Column1  1.488827
2    Column2  1.103574
3    Column3  4.862719
4    Column4  5.178206
5    Column5  1.000114
6    Column6  1.165029
7    Column7  1.024339
8    Column8  1.261413
9    Column9  1.026054
10  Column10  4.106833
11  Column11  4.590756
12  Column12  3.894457
13  Column13  4.374858
14  Column14  1.000211
15  Column15  1.000613
16  Column16  1.020774
17  Column17  1.092544
18  Column18  1.249391
19  Column19  1.157325
20  Column20  1.140316
21  Column21  1.098497


In [10]:
# Column4 is removed from the scaled train and test features because of its
# high VIF.
columns_to_drop = ['Column4']

X_train_reduced = X_train_scaled.drop(columns=columns_to_drop)
X_test_reduced = X_test_scaled.drop(columns=columns_to_drop)

# VIF is computed again on the reduced training features to check whether
# removing the column lowered the multicollinearity.
reduced_matrix = X_train_reduced.values
vif_data_reduced = pd.DataFrame({
    "feature": X_train_reduced.columns,
    "VIF": [
        variance_inflation_factor(reduced_matrix, col_idx)
        for col_idx in range(reduced_matrix.shape[1])
    ],
})
print(vif_data_reduced)

     feature       VIF
0    Column0  1.054531
1    Column1  1.397204
2    Column2  1.102766
3    Column3  1.099390
4    Column5  1.000113
5    Column6  1.164084
6    Column7  1.024330
7    Column8  1.260642
8    Column9  1.026039
9   Column10  4.106157
10  Column11  4.590508
11  Column12  3.889872
12  Column13  4.373771
13  Column14  1.000211
14  Column15  1.000597
15  Column16  1.020620
16  Column17  1.091214
17  Column18  1.242985
18  Column19  1.157106
19  Column20  1.139414
20  Column21  1.098446


In [11]:
# Hold out 20% of the training rows as a validation set (fixed seed of 42).
# NOTE: X_train_reduced and Y_train are rebound to the remaining 80%, so
# running this cell a second time on its own splits the already-split data.
split_parts = train_test_split(
    X_train_reduced,
    Y_train,
    test_size=0.2,
    random_state=42,
)
X_train_reduced, X_val, Y_train, Y_val = split_parts

In [12]:
# Optuna for hyperparameter tuning
# One seed drives the forest inside each trial, the Optuna sampler and the
# final model, so a re-run proposes the same trials and each trial's score is
# measured on the same kind of model that is fitted at the end.
TUNING_SEED = 42


def objective(trial):
    """Score one Optuna trial of a random forest.

    Samples the forest hyperparameters from ``trial`` and returns the mean
    of the 5-fold ``cross_val_score`` results on ``X_train_reduced`` /
    ``Y_train`` (the estimator's default score; higher is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'max_depth': trial.suggest_int('max_depth', 10, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }

    # Seeded so that a trial's score depends only on its hyperparameters,
    # not on the forest's random bootstrap / feature sampling.
    rf_model = RandomForestClassifier(**params, random_state=TUNING_SEED)

    # Y_train is a DataFrame; flatten it to 1-D labels, exactly as the final
    # fit below does, instead of handing scikit-learn a column vector.
    labels = Y_train.values.ravel()

    return cross_val_score(rf_model, X_train_reduced, labels, cv=5, n_jobs=-1).mean()


# A seeded TPE sampler makes the sequence of proposed trials repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=TUNING_SEED),
)
study.optimize(objective, n_trials=5)

# Refit a forest with the best hyperparameters on the whole training split.
best_params = study.best_params
final_rf_model = RandomForestClassifier(**best_params, random_state=TUNING_SEED)
final_rf_model.fit(X_train_reduced, Y_train.values.ravel())

[I 2024-10-11 15:47:08,642] A new study created in memory with name: no-name-73ba9d1f-50d5-4fcc-ab35-701b86ba3ed5
[I 2024-10-11 15:50:34,281] Trial 0 finished with value: 0.9765135187684205 and parameters: {'n_estimators': 171, 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9765135187684205.
[I 2024-10-11 15:52:53,081] Trial 1 finished with value: 0.976895619637947 and parameters: {'n_estimators': 117, 'max_depth': 20, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.976895619637947.


In [None]:
# Make predictions
# Predicted labels for the validation split, from the tuned random forest.
y_val_pred = final_rf_model.predict(X_val)

In [None]:
# Score the validation predictions: accuracy, confusion matrix and the
# classification report, each printed under its own heading.
accuracy = accuracy_score(Y_val, y_val_pred)
conf_matrix = confusion_matrix(Y_val, y_val_pred)
class_report = classification_report(Y_val, y_val_pred)

print(f"Accuracy: {accuracy:.2f}")
for heading, body in (
    ("Confusion Matrix:", conf_matrix),
    ("Classification Report:", class_report),
):
    print(heading)
    print(body)

In [None]:
# Cross-validation scores
# 5-fold cross-validation of the tuned model on the training split.
# Y_train is a DataFrame; it is flattened to 1-D labels (the same way the
# model was fitted) rather than passed to scikit-learn as a column vector.
cv_scores = cross_val_score(final_rf_model, X_train_reduced, Y_train.values.ravel(), cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

In [None]:
# ROC-AUC on the validation split. The score fed to roc_auc_score is column
# index 1 of predict_proba, i.e. the probability of the second class.
val_class_proba = final_rf_model.predict_proba(X_val)
y_proba = val_class_proba[:, 1]
roc_auc = roc_auc_score(Y_val, y_proba)
print(f"ROC-AUC Score: {roc_auc:.2f}")

In [None]:
# Plot the ROC curve (true positive rate against false positive rate) for
# the validation probabilities, using the axes object directly.
fpr, tpr, _ = roc_curve(Y_val, y_proba)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label="ROC Curve")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
plt.show()

In [None]:
# Degree-2 interaction features (interaction_only=True, no bias column).
# The expansion is fitted on the scaled training features and then applied
# to both the scaled training and scaled test features.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly.fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

In [None]:
# PCA for dimensionality reduction
# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share of explained variance. The projection is learned from
# the scaled training features and then reused, unchanged, for the test set.
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# SHAP values for feature importance analysis
# The explainer is built from the tuned forest with the training split as
# background data, then evaluated on the validation split.
explainer = shap.Explainer(final_rf_model, X_train_reduced)
shap_values = explainer(X_val)

# For a classifier the explanation can come back with a trailing per-class
# axis, i.e. (samples, features, classes), while the summary plot is meant
# for a 2-D (samples, features) matrix. When that axis is present, keep the
# class at index 1 — the same class whose probability is used for ROC-AUC.
if len(shap_values.shape) == 3:
    shap_values = shap_values[:, :, 1]

shap.summary_plot(shap_values, X_val)

In [None]:
# Partial dependence plots for the tuned forest, drawn inside a single
# 12x6 bounding axes.
fig, ax = plt.subplots(figsize=(12, 6))
pdp_features = [0, 1]  # positional indices of the features to plot
PartialDependenceDisplay.from_estimator(
    final_rf_model,
    X_train_reduced,
    pdp_features,
    ax=ax,
)

In [None]:
# Pipeline creation
# Imputation, scaling and the tuned random forest chained into one estimator
# that can be fitted directly on the un-imputed, un-scaled feature frame.
# NOTE: the pipeline sees every column of X_train, whereas best_params were
# tuned on features with columns_to_drop removed.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(**best_params, random_state=42))
])

# Y_train was rebound to the training portion by the train/validation split,
# while X_train still holds every row. Pick the feature rows whose index
# labels match the labels' index so both have the same length and order.
# (Assumes X_train and Y_train share the default row index from read_csv.)
X_train_fit = X_train.loc[Y_train.index]

pipeline.fit(X_train_fit, Y_train.values.ravel())

In [None]:
# Persist the fitted random forest with joblib. Only the forest itself is
# written; the imputer and scaler objects are not part of this file.
model_path = 'final_rf_model.pkl'
joblib.dump(final_rf_model, model_path)