In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the loan dataset from the working directory and preview the first rows.
df = pd.read_csv("LoanData.csv")
df.head()

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

# Data Understanding

In [None]:
# Give the terse 'ed' column a descriptive name; later cells refer to 'education_level'.
df = df.rename(columns = {'ed':'education_level'})


In [None]:
# Confirm the column names after the rename.
df.columns

In [None]:
# Report the smallest and the largest value of each predictor column
# (two printed lines per column: minimum first, then maximum).
columns = ['age', 'education_level', 'employ', 'address',
           'income', 'debtinc', 'creddebt', 'othdebt']

for name in columns:
    values = df[name]
    print(name, ':', values.min())
    print(name, ':', values.max())

In [None]:
# Distinct values present in 'age'.
df['age'].unique()

In [None]:
# Distinct values present in 'education_level'.
df['education_level'].unique()

In [None]:
# Number of rows per education level.
df['education_level'].value_counts()

In [None]:
# Distinct values present in 'employ'.
df['employ'].unique()

In [None]:
# Distinct values present in 'address'.
df['address'].unique()

In [None]:
# Distinct raw values of the target column 'default' (checked before it is cleaned further down).
df['default'].unique()

In [None]:
# Number of rows per raw target value.
df['default'].value_counts()

In [None]:
# Column groups used throughout the EDA section.
# The count-like columns are also part of the `continous` group, in the same
# positions as in the raw column order.
discrete_count = ['education_level', 'employ', 'address']
discrete_categorical = ['default']
continous = ['age', *discrete_count, 'income', 'debtinc', 'creddebt', 'othdebt']

# EDA

### For continuous variables

In [None]:

# Summary statistics for the columns listed in `continous`.
df[continous].describe()

In [None]:

# Skewness of each column listed in `continous`.
df[continous].skew()

In [None]:
# Suppress every Python warning from here on (this also hides deprecation notices from the libraries used below).
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Histogram with KDE overlay for each numeric column, laid out on a 2x3 grid.
# The figure size is set globally through rcParams, so it also applies to
# figures created by later cells until it is changed again.
cols = ['age', 'income', 'debtinc', 'creddebt', 'othdebt']
plt.rcParams['figure.figsize'] = (18, 10)

for position, column in enumerate(cols, start=1):
    plt.subplot(2, 3, position)
    sns.histplot(df[column], kde=True)

plt.suptitle("Univariate Analysis on Numerical Columns")
plt.show()

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for the `continous` columns.
sns.pairplot(df[continous])
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the `continous` columns.
sns.heatmap(df[continous].corr(), annot = True)
plt.show()

In [None]:
# The same correlation matrix as a table.
df[continous].corr()

In [None]:
# One boxplot per numeric column on a 2x3 grid, to eyeball outliers.
# No figure size is set here; whatever rcParams currently holds is used.
outcols = ['age', 'income', 'debtinc', 'creddebt', 'othdebt']

for position, column in enumerate(outcols, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(df[column])
    plt.title(column)

plt.suptitle("Outliers in the Data")
plt.show()

### For discrete variables

In [None]:
# Summary of the target column (the only entry in `discrete_categorical`).
df[discrete_categorical].describe()

In [None]:
# Histogram with KDE overlay for each count-like column, side by side.
# Resets the global figure size to a wide, short canvas.
outcols = ['education_level', 'employ', 'address']
plt.rcParams['figure.figsize'] = (18, 5)

for position, column in enumerate(outcols, start=1):
    plt.subplot(1, 3, position)
    sns.histplot(df[column], kde=True)
    plt.title(column)

plt.suptitle("Discrete variable analysis")
plt.show()

# Data Preparation

In [None]:
# Map the string spellings '1', '0', "'0'" and ':0' of the target to the integers 1 / 0.
# Values not listed in the mapping are left unchanged.
df['default'] = df['default'].replace({'1':1, '0':0, "'0'":0, ':0':0})

In [None]:
# Re-check the class counts after the mapping above.
df['default'].value_counts()

In [None]:
#df['age_group'] = pd.cut(df['age'], bins=[0, 12, 18, 35, 60, 137], labels=['child', 'teen', 'young_adult', 'adult', 'senior'])
#df.drop(columns = 'age', inplace = True)
#df.head()

In [None]:
# Percentage of missing values in each column.
df.isnull().sum()/len(df)*100

In [None]:
# IterativeImputer is still experimental in scikit-learn; the enable_* import must come first.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Fill missing 'income' and 'age'. Each column is imputed on its own: the same
# imputer object is re-fitted for each single-column call.
# NOTE(review): with only one input column the imputer has no other features to
# model from, so this presumably reduces to its initial (mean) fill — confirm.
imputer = IterativeImputer(max_iter=10, random_state=0)
df[['income']] = imputer.fit_transform(df[['income']])
df[['age']] = imputer.fit_transform(df[['age']])

In [None]:
# Level counts before the missing values are filled.
df['education_level'].value_counts()

In [None]:
# Fill missing education levels with the most frequent level.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['education_level'] selection: the in-place form
# operates on an intermediate object and, under pandas copy-on-write, leaves
# `df` unchanged.
mode_value = df['education_level'].mode()[0]
df['education_level'] = df['education_level'].fillna(mode_value)


In [None]:
# Fold education level 5 into level 1.
# Assigned back explicitly: replace(inplace=True) on a column selection is a
# chained in-place update that does not reach `df` under pandas copy-on-write.
df['education_level'] = df['education_level'].replace(5.0, 1.0)

In [None]:
#df['education_level'] = df['education_level'].astype('category')
#df['education_level'] = df['education_level'].cat.rename_categories({1: 'High School', 2: 'Undergraduate', 3: 'Graduate', 4: 'Postgraduate'})


In [None]:
# Derived features: combined debt, debt relative to income, and employment
# length relative to age.
total_debt = df['creddebt'] + df['othdebt']
df['total_debt'] = total_debt
df['debt_to_income_ratio'] = total_debt / df['income']
df['employ_to_age_ratio'] = df['employ'] / df['age']

In [None]:
# Second-order features built from four base columns:
#   * one '<a>_x_<b>' product column for every unordered pair (a listed before b)
#   * one '<a>_squared' column per base column
selected_features = ['employ', 'debtinc', 'debt_to_income_ratio', 'employ_to_age_ratio']

n_selected = len(selected_features)
for left in range(n_selected):
    for right in range(left + 1, n_selected):
        first, second = selected_features[left], selected_features[right]
        df[f'{first}_x_{second}'] = df[first] * df[second]

# Polynomial (degree 2) terms
for name in selected_features:
    df[f'{name}_squared'] = df[name].pow(2)


In [None]:
# Mean and median debt-to-income ratio within each `employ` value.
# The result is indexed by `employ` and carries two descriptively named columns.
agg_stats = df.groupby('employ')['debt_to_income_ratio'].agg(
    mean_debt_to_income_by_employ='mean',
    median_debt_to_income_by_employ='median',
)


In [None]:
# Attach the per-`employ` statistics to every row (left join keeps all rows of df).
# Not idempotent: running this cell a second time merges the same two columns
# again, which pandas disambiguates with _x / _y suffixes.
df = df.merge(agg_stats, on='employ', how='left')

In [None]:
# Display the frame with all engineered columns.
df

In [None]:
#df['age_group'] = df['age_group'].fillna(df['age_group'].mode()[0])

In [None]:


#df = pd.get_dummies(df, columns=['education_level'], drop_first=True)

In [None]:
#df = pd.get_dummies(df, columns=['age_group'], drop_first=True)

In [None]:
# Column names after feature engineering.
df.columns

In [None]:
# Boxplots of the base and ratio features on a 4x3 grid.
# Columns missing from df get a blanked-out panel instead of a plot.
outcols = ['age', 'education_level', 'employ', 'address', 'income', 'debtinc',
           'creddebt', 'othdebt', 'total_debt', 'debt_to_income_ratio',
           'employ_to_age_ratio']

fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(20, 30))
axes = axes.flatten()

for ax, column in zip(axes, outcols):
    if column not in df.columns:
        ax.axis('off')
        continue
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    ax.set_xlabel('')

plt.tight_layout()
plt.show()

In [None]:
#continous_variables = ['age', 'employ', 'address', 'income', 'debtinc',
#           'creddebt', 'othdebt', 'total_debt', 'debt_to_income_ratio',
#           'employ_to_age_ratio', 'employ_x_debtinc',
#           'employ_x_debt_to_income_ratio', 'employ_x_employ_to_age_ratio',
#           'debtinc_x_debt_to_income_ratio', 'debtinc_x_employ_to_age_ratio',
#           'debt_to_income_ratio_x_employ_to_age_ratio', 'employ_squared',
#           'debtinc_squared', 'debt_to_income_ratio_squared',
#           'employ_to_age_ratio_squared', 'mean_debt_to_income_by_employ',
#           'median_debt_to_income_by_employ']

In [None]:
# Skewness of every column before the power transform below.
df.skew()

In [None]:
from scipy import stats

# Columns to be made more symmetric with a Yeo-Johnson power transform.
columns_to_transform = ['age', 'employ', 'address', 'income', 'debtinc',
                        'creddebt', 'othdebt', 'total_debt', 'debt_to_income_ratio',
                        'employ_to_age_ratio']

# Per column: the fitted lambda and the skewness measured after transforming.
transform_info = {}
for name in columns_to_transform:
    transformed, fitted_lambda = stats.yeojohnson(df[name])
    df[name] = transformed  # overwrite the column with its transformed values
    transform_info[name] = {'lambda': fitted_lambda, 'skewness': df[name].skew()}

In [None]:
# Skewness of every column after the power transform.
df.skew()

In [None]:
# Column names available at this point.
df.columns

In [None]:
def cap_outliers(series, lower_percentile=1, upper_percentile=99):
    """Clip a series to the values found at two of its own percentiles.

    Parameters
    ----------
    series : pandas.Series
        Values to cap.
    lower_percentile, upper_percentile : number
        Percentiles on a 0-100 scale; values below the lower one or above the
        upper one are replaced by the corresponding percentile value.

    Returns
    -------
    pandas.Series
        The capped values, with the same length and index as the input.
    """
    bounds = series.quantile([lower_percentile / 100, upper_percentile / 100])
    return series.clip(lower=bounds.iloc[0], upper=bounds.iloc[1])

# Feature columns whose extreme values are capped at their 1st / 99th percentile.
# The target 'default' is deliberately NOT in this list: clipping a label is
# meaningless and, for a sufficiently rare class, would overwrite it.
num_cols = ['age','employ', 'address', 'income', 'debtinc',
       'creddebt', 'othdebt', 'total_debt', 'debt_to_income_ratio',
       'employ_to_age_ratio']


# Cap outliers in the numerical feature columns
for col in num_cols:
    df[col] = cap_outliers(df[col])


# Visualize the final features: distribution on the left, boxplot on the right.
# The titles name the steps that were actually applied to these columns
# (Yeo-Johnson transform, then capping); no scaling has happened yet.
for col in num_cols:
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col} after Yeo-Johnson Transformation and Capping')

    plt.subplot(1, 2, 2)
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot of {col} after Yeo-Johnson Transformation and Capping')
    plt.show()


# Show the first few rows to check the transformations
df.head()

In [None]:
# Only 'education_level' is cast to an integer code.
# 'age', 'employ' and 'address' were replaced by their Yeo-Johnson-transformed
# values earlier in the notebook; casting those to int would truncate the
# fractional part of the transformed values and collapse many distinct values
# onto a few integers, so they are kept as floats.
df['education_level'] = df['education_level'].astype(int)

In [None]:
# Separate the predictors (every column except the target) from the target 'default'.
X = df.drop('default', axis = 1)
y = df['default']

In [None]:
# Display the target series.
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state fixes the shuffle.
# NOTE(review): the split is not stratified on y — confirm this is intended,
# given that the classes are rebalanced later.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

In [None]:
# Shapes of the four split parts, one line each.
split_parts = (
    ("X_train_selected shape:", X_train),
    ("X_test_selected shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
)
for label, part in split_parts:
    print(label, part.shape)

# Relative class frequencies of the target in each split.
class_splits = (
    ("\nDistribution of classes in y_train:", y_train),
    ("\nDistribution of classes in y_test:", y_test),
)
for heading, target in class_splits:
    print(heading)
    print(target.value_counts(normalize=True))


In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training split only; X_test / y_test are not passed to SMOTE.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Relative class frequencies in the training target after resampling.
print(y_train_resampled.value_counts(normalize=True))

# Train Test Split

In [None]:
# from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# Standardise the features. The scaler is fitted on the resampled training
# features only, and the same fitted transform is then applied to the test features.
sc = StandardScaler()
#mm = MinMaxScaler()
#rs = RobustScaler()
X_train_scaled = sc.fit_transform(X_train_resampled)
X_test_scaled = sc.transform(X_test)

# Logistic Regression

In [None]:
#from sklearn.decomposition import PCA
#pca = PCA(n_components=0.96)

#X_train_pca = pca.fit_transform(X_train_scaled)
#X_test_pca = pca.transform(X_test_scaled)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Logistic regression with library-default settings, trained on the scaled,
# resampled training data.
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train_resampled)

# Class predictions for both splits, plus the positive-class probability on
# the test split (used for the ROC curve further down).
ypred_train = lr.predict(X_train_scaled)
lr_ypred_test = lr.predict(X_test_scaled)
lr_y_prob_test = lr.predict_proba(X_test_scaled)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
lr_train_acc = accuracy_score(y_train_resampled, ypred_train)
print("Train accuracy:", lr_train_acc)
lr_test_acc = accuracy_score(y_test, lr_ypred_test)
print("Test Accuracy:", lr_test_acc)
lr_cv_scores = cross_val_score(lr, X_train_scaled, y_train_resampled, cv=5)
print("cross_val_score:", lr_cv_scores.mean())

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix

# Per-class precision / recall / F1 of the logistic regression on the test split.
print(classification_report(y_test, lr_ypred_test))

In [None]:
# Confusion matrix of the logistic regression test predictions.
cm = confusion_matrix(y_test,lr_ypred_test)
cm

In [None]:
# Same confusion matrix as a heatmap: true class on the y axis, predicted class on the x axis.
cm = confusion_matrix(y_test, lr_ypred_test)
plt.figure(figsize = (4,2))
sns.heatmap(cm, annot=True, fmt='d', cmap = 'Blues')
plt.xlabel('predicted')
plt.ylabel('True')
plt.title('confusion matrix')
plt.show()

In [None]:
# Class counts in the test split, for reading the confusion matrix against.
y_test.value_counts()

In [None]:
from sklearn.metrics import roc_curve,auc,RocCurveDisplay
# ROC curve and area under it for the logistic regression, from the
# positive-class probabilities on the test split.
fpr, tpr, thresholds = roc_curve(y_test,lr_y_prob_test)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()
plt.show()

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with library-default settings, trained on the scaled,
# resampled training data.
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train_resampled)

# Predictions for both splits and positive-class probabilities on the test split.
ypred_train = knn.predict(X_train_scaled)
knn_ypred_test = knn.predict(X_test_scaled)
knn_y_prob_test = knn.predict_proba(X_test_scaled)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
knn_train_acc = accuracy_score(y_train_resampled, ypred_train)
print("Train accuracy:", knn_train_acc)
knn_test_acc = accuracy_score(y_test, knn_ypred_test)
print("Test Accuracy:", knn_test_acc)

knn_cv_scores = cross_val_score(knn, X_train_scaled, y_train_resampled, cv=5)
print("cross_val_score:", knn_cv_scores.mean())

In [None]:
# Hyperparameter tuning
# Try every n_neighbors from 1 to 49 with 5-fold cross-validation on the
# scaled, resampled training data and keep the value with the best accuracy.
estimator = KNeighborsClassifier()
param_grid = {'n_neighbors':list(range(1,50))}

from sklearn.model_selection import GridSearchCV
cv_classifier = GridSearchCV(estimator,param_grid,cv=5,scoring='accuracy')

cv_classifier.fit(X_train_scaled,y_train_resampled)
cv_classifier.best_params_

### Finalizing the KNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Final KNN model with the neighbour count fixed explicitly.
N_NEIGHBORS = 5
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn.fit(X_train_scaled, y_train_resampled)

# Predictions for both splits and positive-class probabilities on the test split.
ypred_train = knn.predict(X_train_scaled)
knn_ypred_test = knn.predict(X_test_scaled)
knn_y_prob_test = knn.predict_proba(X_test_scaled)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
knn_train_acc = accuracy_score(y_train_resampled, ypred_train)
print("Train accuracy:", knn_train_acc)
knn_test_acc = accuracy_score(y_test, knn_ypred_test)
print("Test Accuracy:", knn_test_acc)

knn_cv_scores = cross_val_score(knn, X_train_scaled, y_train_resampled, cv=5)
print("cross_val_score:", knn_cv_scores.mean())

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
# Per-class precision / recall / F1 of the KNN model on the test split.
print(classification_report(y_test, knn_ypred_test))

In [None]:
# Confusion matrix of the KNN test predictions.
cm = confusion_matrix(y_test, knn_ypred_test)
cm


In [None]:
# Same confusion matrix as a heatmap: true class on the y axis, predicted class on the x axis.
cm = confusion_matrix(y_test, knn_ypred_test)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot = True, fmt = 'd', cmap = "Blues")
plt.xlabel('predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.metrics import roc_curve,auc,RocCurveDisplay
# ROC curve and area under it for the KNN model, from the positive-class
# probabilities on the test split.
fpr, tpr, thresholds = roc_curve(y_test,knn_y_prob_test)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()
plt.show()

In [None]:
# Class counts in the test split, for reading the confusion matrix against.
y_test.value_counts()


# SVC

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default hyperparameters. probability=True is
# needed so that predict_proba is available below.
svm = SVC(probability=True)
svm.fit(X_train_scaled, y_train_resampled)

# Predictions for both splits and positive-class probabilities on the test split.
ypred_train = svm.predict(X_train_scaled)
svc_ypred_test = svm.predict(X_test_scaled)
svc_y_prob_test = svm.predict_proba(X_test_scaled)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
svc_train_acc = accuracy_score(y_train_resampled, ypred_train)
print("Train accuracy:", svc_train_acc)
svc_test_acc = accuracy_score(y_test, svc_ypred_test)
print("Test Accuracy:", svc_test_acc)

svc_cv_scores = cross_val_score(svm, X_train_scaled, y_train_resampled, cv=5)
print("cross_val_score:", svc_cv_scores.mean())

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

# Grid search over the main SVC hyperparameters with 5-fold cross-validation
# on the scaled, resampled training data.
svc = SVC()

# Parameter grid (key hyperparameters to tune)
param_grid = {
    'C': [0.5, 1, 5],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1],   # small explicit gammas for the RBF kernel
}

# Grid search with cross-validation, selecting by accuracy
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_resampled)

# Best model & results
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Evaluate on the test set. The model was fitted on standardised features, so
# it must be given the standardised test features (X_test_scaled); passing the
# raw X_test would put the inputs on a different scale than the training data.
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


### Finalizing the SVM model

In [None]:
from sklearn.svm import SVC

# Final SVM: RBF kernel with hand-picked C and gamma. probability=True keeps
# predict_proba available for the ROC curve.
svm = SVC(kernel='rbf', C=0.8, gamma=0.1, probability=True)
svm.fit(X_train_scaled, y_train_resampled)

# Predictions for both splits and positive-class probabilities on the test split.
ypred_train = svm.predict(X_train_scaled)
svc_ypred_test = svm.predict(X_test_scaled)
svc_y_prob_test = svm.predict_proba(X_test_scaled)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
svc_train_acc = accuracy_score(y_train_resampled, ypred_train)
print("Train accuracy:", svc_train_acc)
svc_test_acc = accuracy_score(y_test, svc_ypred_test)
print("Test Accuracy:", svc_test_acc)

svc_cv_scores = cross_val_score(svm, X_train_scaled, y_train_resampled, cv=5)
print("cross_val_score:", svc_cv_scores.mean())

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
# Per-class precision / recall / F1 of the final SVM on the test split.
print(classification_report(y_test, svc_ypred_test))

In [None]:
# Confusion matrix of the SVM test predictions.
cm = confusion_matrix(y_test, svc_ypred_test)
cm

In [None]:
# Same confusion matrix as a heatmap: true class on the y axis, predicted class on the x axis.
cm = confusion_matrix(y_test, svc_ypred_test)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot = True, fmt = 'd', cmap = "Blues")
plt.xlabel('predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.metrics import roc_curve,auc,RocCurveDisplay
# ROC curve and area under it for the SVM, from the positive-class
# probabilities on the test split.
fpr, tpr, thresholds = roc_curve(y_test,svc_y_prob_test)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()
plt.show()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Unconstrained decision tree (library defaults) on the scaled, resampled
# training data.
model = DecisionTreeClassifier()
model.fit(X_train_scaled, y_train_resampled)

# Predictions for both splits and positive-class probabilities on the test split.
ypred_train = model.predict(X_train_scaled)
ypred_test = model.predict(X_test_scaled)
dt_y_prob_test = model.predict_proba(X_test_scaled)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
dt_train_acc = accuracy_score(ypred_train, y_train_resampled)
print("Train Accuracy:", dt_train_acc)
dt_test_acc = accuracy_score(ypred_test, y_test)
print("Test Accuracy:", dt_test_acc)
dt_cv_scores = cross_val_score(model, X_train_scaled, y_train_resampled, cv=5)
print("Cross_val_score:", dt_cv_scores.mean())

In [None]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer


def objective(trial):
    """Score one decision-tree configuration sampled by Optuna.

    Samples tree depth, split/leaf minimums, split criterion, feature
    subsampling and splitter strategy from `trial`, and returns the mean
    5-fold cross-validated accuracy on the scaled, resampled training data.
    """
    search_space = dict(
        max_depth=trial.suggest_int('max_depth', 2, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        splitter=trial.suggest_categorical('splitter', ['best', 'random']),
    )

    candidate = DecisionTreeClassifier(**search_space)
    fold_scores = cross_val_score(
        candidate,
        X_train_scaled,
        y_train_resampled,
        cv=5,
        scoring=make_scorer(accuracy_score),
    )
    return fold_scores.mean()


# Maximise cross-validated accuracy over 100 sampled configurations.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_score = study.best_value
print("Best parameters:", best_params)
print("Best accuracy:", best_score)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Final decision tree with fixed, hand-entered hyperparameters.
dt_params = {
    'max_depth': 8,
    'min_samples_split': 4,
    'min_samples_leaf': 1,
    'criterion': 'entropy',
    'max_features': None,
    'splitter': 'best',
}
dt = DecisionTreeClassifier(**dt_params)
dt.fit(X_train_scaled, y_train_resampled)

# Predictions for both splits and positive-class probabilities on the test split.
ypred_train = dt.predict(X_train_scaled)
ypred_test = dt.predict(X_test_scaled)
dt_y_prob_test = dt.predict_proba(X_test_scaled)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
dt_train_acc = accuracy_score(ypred_train, y_train_resampled)
print("Train Accuracy:", dt_train_acc)
dt_test_acc = accuracy_score(ypred_test, y_test)
print("Test Accuracy:", dt_test_acc)
dt_cv_scores = cross_val_score(dt, X_train_scaled, y_train_resampled, cv=5)
print("Cross_val_score:", dt_cv_scores.mean())

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Random forest with library defaults. This baseline is trained and evaluated
# on the unscaled features (resampled train split, raw test split).
rf = RandomForestClassifier()
rf.fit(X_train_resampled, y_train_resampled)

# Predictions for both splits and positive-class probabilities on the test split.
ypred_train = rf.predict(X_train_resampled)
ypred_test = rf.predict(X_test)
rf_y_prob_test = rf.predict_proba(X_test)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
rf_train_acc = accuracy_score(ypred_train, y_train_resampled)
print("Train Accuracy:", rf_train_acc)
rf_test_acc = accuracy_score(ypred_test, y_test)
print("Test Accuracy:", rf_test_acc)

rf_cv_scores = cross_val_score(rf, X_train_resampled, y_train_resampled, cv=5)
print("Cross_val_score:", rf_cv_scores.mean())

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

def objective(trial):
    """Score one random-forest configuration sampled by Optuna.

    Returns the mean 5-fold cross-validated accuracy on the (unscaled)
    resampled training data. The study reports this value as "Best accuracy",
    so no complexity penalty is subtracted from it.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'ccp_alpha': trial.suggest_float('ccp_alpha', 1e-5, 1e-1, log=True),  # Cost complexity pruning
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100)       # Limit leaf nodes
    }
    # max_samples is only sampled when bootstrapping is on; otherwise it is
    # passed as None. suggest_float replaces the deprecated suggest_uniform
    # and samples the same uniform range.
    if params['bootstrap']:
        params['max_samples'] = trial.suggest_float('max_samples', 0.6, 0.95)
    else:
        params['max_samples'] = None

    model = RandomForestClassifier(**params, random_state=42)
    scorer = make_scorer(accuracy_score)
    score = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring=scorer).mean()
    # The original had a complexity-penalty computation after this return; it
    # could never execute and has been removed.
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
best_score = study.best_value
print("Best parameters:", best_params)
print("Best accuracy:", best_score)

# ... (rest of the code as before)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Exhaustive grid search for the random forest on the scaled, resampled
# training data, followed by an evaluation of the best model on the test split.
rf = RandomForestClassifier(random_state=42)  # fixed seed for reproducibility

# Parameter grid (regularised leaves/splits, varied depth and tree count)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 15],
    'min_samples_split': [20, 30, 50],
    'min_samples_leaf': [10, 15, 20],
    # 'auto' was dropped from this list: for classifiers it was an alias of
    # 'sqrt', and it is no longer accepted by current scikit-learn releases.
    'max_features': ['sqrt', 0.75],
    'criterion': ['gini', 'entropy'],
}

# Grid search with 5-fold cross-validation, selecting by accuracy
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_resampled)

# Best model and its predictions on the scaled test features
best_rf = grid_search.best_estimator_

y_pred_test = best_rf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Output. The report is built from this model's own predictions (y_pred_test);
# the name `y_pred` belongs to a different model's cell.
print("Test Accuracy:", test_accuracy)
print("Best parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred_test))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Final random forest with fixed, hand-entered hyperparameters, trained on the
# scaled, resampled training data.
rf_params = {
    'n_estimators': 241,
    'max_depth': 10,
    'min_samples_split': 4,
    'min_samples_leaf': 1,
    'max_features': 'log2',
    'bootstrap': True,
    'criterion': 'entropy',
    'ccp_alpha': 0.0010399525940246316,
    'max_leaf_nodes': 87,
    'max_samples': 0.79545993040379,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_scaled, y_train_resampled)

# Predictions for both splits and positive-class probabilities on the test split.
ypred_train = rf.predict(X_train_scaled)
ypred_test = rf.predict(X_test_scaled)
rf_y_prob_test = rf.predict_proba(X_test_scaled)[:, 1]

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
rf_train_acc = accuracy_score(ypred_train, y_train_resampled)
print("Train Accuracy:", rf_train_acc)
rf_test_acc = accuracy_score(ypred_test, y_test)
print("Test Accuracy:", rf_test_acc)

rf_cv_scores = cross_val_score(rf, X_train_scaled, y_train_resampled, cv=5)
print("Cross_val_score:", rf_cv_scores.mean())

# Adaboost classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with fixed hyperparameters, trained on the unscaled resampled
# training features.
abmodel = AdaBoostClassifier(n_estimators= 401, learning_rate= 0.9930119742426293)
abmodel.fit(X_train_resampled,y_train_resampled)

# Predict on features in the same (unscaled) representation the model was
# trained on. Using X_test_scaled here would hand standardised values to a
# model whose split thresholds were learnt on raw values.
ypred_train = abmodel.predict(X_train_resampled)
ypred_test = abmodel.predict(X_test)

from sklearn.metrics import accuracy_score
print("Train Accuracy:",accuracy_score(y_train_resampled,ypred_train))
print("Test Accuracty:",accuracy_score(y_test,ypred_test))

# Mean 5-fold CV accuracy on the training data
from sklearn.model_selection import cross_val_score
print("cross val score:",cross_val_score(abmodel,X_train_resampled,y_train_resampled,cv=5).mean())

In [None]:
import optuna
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier  # base estimator for AdaBoost
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

# Requires X_train_scaled and y_train_resampled from the preparation cells.

def objective(trial):
    """Score one AdaBoost configuration sampled by Optuna.

    Samples the hyperparameters of the decision-tree base estimator and of the
    booster itself, and returns the mean 5-fold cross-validated accuracy on
    the scaled, resampled training data.
    """
    base_estimator = DecisionTreeClassifier(
        max_depth=trial.suggest_int('max_depth', 1, 8),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        ccp_alpha= trial.suggest_float('ccp_alpha', 1e-5, 1.0, log=True),
        max_leaf_nodes= trial.suggest_int('max_leaf_nodes', 2, 20),
    )

    # AdaBoost parameters
    # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
    # confirm it is still accepted by the installed version.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']),
    }

    # The base estimator is passed positionally on purpose: it is the first
    # constructor argument, so the call does not depend on its keyword name.
    model = AdaBoostClassifier(base_estimator, **params, random_state=42)
    scorer = make_scorer(accuracy_score)
    # X_train_scaled was built from the SMOTE-resampled rows, so the matching
    # labels are y_train_resampled; y_train has a different number of rows.
    score = cross_val_score(model, X_train_scaled, y_train_resampled, cv=5, scoring=scorer).mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
best_score = study.best_value
print("Best parameters:", best_params)
print("Best accuracy:", best_score)


### GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Gradient boosting with library defaults. This baseline uses the original
# training split (X_train / y_train): neither resampled nor scaled.
gbmodel = GradientBoostingClassifier()
gbmodel.fit(X_train, y_train)

ypred_train = gbmodel.predict(X_train)
ypred_test = gbmodel.predict(X_test)

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
gb_train_acc = accuracy_score(y_train, ypred_train)
print("Train Accuracy:", gb_train_acc)
gb_test_acc = accuracy_score(y_test, ypred_test)
print("Test Accuracty:", gb_test_acc)

gb_cv_scores = cross_val_score(gbmodel, X_train, y_train, cv=5)
print("cross val score:", gb_cv_scores.mean())

### Hyperparameter tuning 

In [None]:

# Everything this cell uses is imported here: RandomizedSearchCV and
# roc_auc_score are not imported by any other cell in the notebook.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# Candidate values for the randomised search
param_distributions = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 4],
    'subsample': [0.7, 0.8, 0.9],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 5, 10],
    # 'auto' is no longer accepted by current scikit-learn; None (use all
    # features) is offered as the alternative to 'sqrt'.
    'max_features': ['sqrt', None],
    'n_iter_no_change': [10, 20],
    'tol': [1e-4]
}
# Gradient boosting classifier with early stopping on a 10% validation fraction
gb_clf = GradientBoostingClassifier(random_state=42, validation_fraction=0.1, n_iter_no_change=10, tol=1e-4)
# Randomised search: 20 sampled configurations, 5-fold CV, selected by ROC AUC
random_search = RandomizedSearchCV(
    gb_clf, param_distributions=param_distributions, n_iter=20, cv=5, scoring='roc_auc', random_state=42
)
# Fit on the scaled, resampled training data
random_search.fit(X_train_scaled, y_train_resampled)
# best_score_ is the mean cross-validated ROC AUC (the search's scoring metric)
print("Best parameters found: ", random_search.best_params_)
print("Best ROC AUC found: ", random_search.best_score_)
# Evaluate on the scaled test features, matching the representation used for fitting
y_pred = random_search.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, random_search.predict_proba(X_test_scaled)[:, 1])

print("Best hyperparameters:", random_search.best_estimator_.get_params())
print("Accuracy:", accuracy)
print("ROC AUC:", roc_auc)


# Cross-validated accuracy of the selected configuration on the training data
scores = cross_val_score(random_search.best_estimator_, X_train_scaled, y_train_resampled, cv=5, scoring='accuracy')
print("Cross-validation scores:", scores)
print("Mean CV score:", scores.mean())


### Finalizing the Model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Final gradient boosting model. Every constructor argument is spelled out,
# using fixed, hand-entered values.
gb_params = dict(
    ccp_alpha=0.0,
    criterion='friedman_mse',
    init=None,
    learning_rate=0.1,
    loss='log_loss',
    max_depth=4,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=5,
    min_samples_split=15,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_iter_no_change=20,
    random_state=42,
    subsample=0.7,
    tol=0.0001,
    validation_fraction=0.1,
    verbose=0,
    warm_start=False,
)
gbmodel = GradientBoostingClassifier(**gb_params)
gbmodel.fit(X_train_scaled, y_train_resampled)

ypred_train = gbmodel.predict(X_train_scaled)
ypred_test = gbmodel.predict(X_test_scaled)

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
gb_train_acc = accuracy_score(y_train_resampled, ypred_train)
print("Train Accuracy:", gb_train_acc)
gb_test_acc = accuracy_score(y_test, ypred_test)
print("Test Accuracty:", gb_test_acc)

gb_cv_scores = cross_val_score(gbmodel, X_train_scaled, y_train_resampled, cv=5)
print("cross val score:", gb_cv_scores.mean())

### XGBClassifier

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# XGBoost with library defaults on the unscaled features
# (resampled train split, raw test split).
xgbmodel = XGBClassifier()
xgbmodel.fit(X_train_resampled, y_train_resampled)

ypred_train = xgbmodel.predict(X_train_resampled)
ypred_test = xgbmodel.predict(X_test)

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
xgb_train_acc = accuracy_score(y_train_resampled, ypred_train)
print("Train Accuracy:", xgb_train_acc)
xgb_test_acc = accuracy_score(y_test, ypred_test)
print("Test Accuracty:", xgb_test_acc)

xgb_cv_scores = cross_val_score(xgbmodel, X_train_resampled, y_train_resampled, cv=5)
print("cross val score:", xgb_cv_scores.mean())

### CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# CatBoost with library defaults on the unscaled features
# (resampled train split, raw test split).
cbbmodel = CatBoostClassifier()
cbbmodel.fit(X_train_resampled, y_train_resampled)

ypred_train = cbbmodel.predict(X_train_resampled)
ypred_test = cbbmodel.predict(X_test)

# Accuracy on each split, then mean 5-fold CV accuracy on the training data.
cb_train_acc = accuracy_score(y_train_resampled, ypred_train)
print("Train Accuracy:", cb_train_acc)
cb_test_acc = accuracy_score(y_test, ypred_test)
print("Test Accuracty:", cb_test_acc)

cb_cv_scores = cross_val_score(cbbmodel, X_train_resampled, y_train_resampled, cv=5)
print("cross val score:", cb_cv_scores.mean())

In [None]:
# Re-print train/test accuracy from the current ypred_train / ypred_test
# (set by the CatBoost cell when the notebook is run in order).
print("Train Accuracy:",accuracy_score(y_train_resampled,ypred_train))
print("Test Accuracty:",accuracy_score(y_test,ypred_test))

# CNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# 1. Reshape the features to (samples, features, 1) for Conv1D.
# X_train / X_test are DataFrames at this point and DataFrames have no
# .reshape, so they are converted to NumPy arrays first. The conversion is
# safe to re-run: an already 3-D array keeps its shape.
X_train = np.asarray(X_train, dtype="float32")
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = np.asarray(X_test, dtype="float32")
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# 2. Define the CNN: one convolution + pooling stage, then a small dense head.
model = keras.Sequential([
    Conv1D(filters=128, kernel_size=5, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),

    Dense(32, activation='relu'),
    Dropout(0.3002),

    Dense(1, activation='sigmoid')  # single sigmoid unit for binary classification
])

# 3. Compile and train the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001922),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

model.fit(X_train, y_train, epochs=50, batch_size=32)

# 4. Evaluate the model (values come back in the order of the compiled metrics)
loss, accuracy, precision, recall = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')

# 5. Generate predictions and the confusion matrix.
# predict() returns one probability per row in a column; flatten it so the
# thresholded labels are 1-D like y_test.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

# 6. Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not Default', 'Default'],
            yticklabels=['Not Default', 'Default'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


### Hyperparameter Tuning

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow import keras
from kerastuner.tuners import RandomSearch 

# Function to build CNN model with tunable hyperparameters
def build_model(hp):
    """Build and compile a 1-D CNN whose hyperparameters are drawn from `hp`.

    Tunable values: number of convolution filters, kernel size, width of the
    dense layer, dropout rate and Adam learning rate. The input length is
    taken from the second dimension of the notebook-level `X_train`.

    Returns the compiled model (binary cross-entropy loss, accuracy metric).
    """
    layers = keras.layers
    network = keras.Sequential([
        layers.Conv1D(
            filters=hp.Int('filters', min_value=32, max_value=512, step=32),
            kernel_size=hp.Int('kernel_size', min_value=3, max_value=8, step=1),
            activation='relu',
            input_shape=(X_train.shape[1], 1),
        ),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(
            units=hp.Int('units', min_value=32, max_value=128, step=32),
            activation='relu',
        ),
        layers.Dropout(rate=hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)),
        layers.Dense(1, activation='sigmoid'),
    ])

    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Conv1D needs 3-D input (samples, features, 1). Build correctly shaped copies
# of the tuning and validation features here so the search does not depend on
# what other cells did to X_train / X_test.
X_tune = np.asarray(X_train_resampled, dtype="float32")
X_tune = X_tune.reshape(X_tune.shape[0], X_tune.shape[1], 1)
X_val = np.asarray(X_test, dtype="float32")
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

# Create the tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',  # metric to optimize
    max_trials=20,             # number of hyperparameter combinations to try
    executions_per_trial=3,    # models trained per combination (to reduce noise)
    directory='my_dir',
    project_name='loan_cnn_tuning',  # the comma after this argument was missing (SyntaxError)
    overwrite=True,
)

# Early stopping on the validation loss
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Start the search.
# NOTE(review): the test split doubles as the validation set here, so the
# selected hyperparameters have seen the test data — confirm this is acceptable.
tuner.search(X_tune, y_train_resampled, epochs=10, validation_data=(X_val, y_test), callbacks=[stop_early])

# Get the best model and its hyperparameters
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best Hyperparameters:", best_hyperparameters.values)

# ... (Evaluate the best model on test data as before)


# Final model with CNN

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Conv1D input must be (samples, features, 1). Work on dedicated 3-D copies so
# that X_train / X_test are not modified by this cell, and so the cell runs
# whether they currently hold DataFrames or arrays (2-D or 3-D).
X_train_cnn = np.asarray(X_train, dtype="float32")
X_train_cnn = X_train_cnn.reshape(X_train_cnn.shape[0], X_train_cnn.shape[1], 1)
X_test_cnn = np.asarray(X_test, dtype="float32")
X_test_cnn = X_test_cnn.reshape(X_test_cnn.shape[0], X_test_cnn.shape[1], 1)

# Build the CNN with fixed hyperparameters: three convolution stages
# ('same' padding keeps the sequence length within each convolution).
model = Sequential()
model.add(Conv1D(filters=384, kernel_size=5, activation='relu', padding='same', input_shape=(X_train_cnn.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=256, kernel_size=7, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'))

# Flatten before the dense layers
model.add(Flatten())

# Two dense layers, each followed by its own dropout rate
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.23003726126036944))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.48827949663688364))

# Output layer: single sigmoid unit for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.00018285893202451146),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train the model and keep the training history
history = model.fit(
    X_train_cnn,
    y_train,
    epochs=20,
    batch_size=1024,
    validation_data=(X_test_cnn, y_test)
)

# Evaluate on the training and test sets with the same 3-D inputs the model
# was trained on (no reshape back to 2-D: the model's input is 3-D).
train_loss, train_acc = model.evaluate(X_train_cnn, y_train, verbose=0)
test_loss, test_acc = model.evaluate(X_test_cnn, y_test, verbose=0)
print(f"Train Loss: {train_loss}, Train Accuracy: {train_acc}")
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")


# Predict on the test set; flatten the (n, 1) probabilities so the labels are
# 1-D like y_test.
y_pred_prob = model.predict(X_test_cnn)
y_pred = (y_pred_prob > 0.5).astype("int32").ravel()

# Classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
