<a href="https://colab.research.google.com/github/SamTianshiMeng/Credit-data/blob/main/My_Project_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GROUP PROJECT ITM 618**



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
from google.colab import files

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the training and test splits from CSV.
# NOTE(review): both paths point at the filesystem root, not under the
# '/content/drive' mount point — confirm the files really live there.
train_set = pd.read_csv("/trainset.csv")
test_set = pd.read_csv("/testset.csv")

**Data Exploration**

In [None]:
# Overview of the first instances of the train and test sets.
# display() (provided by the notebook kernel) renders each frame as a rich
# table, which stays readable for wide frames where print() wraps the columns.
display(train_set.head())
display(test_set.head())

In [None]:
# Summary statistics of the numerical attributes for both splits.
# A notebook cell only renders its last bare expression, so each table is
# displayed explicitly; otherwise the training-set summary is silently dropped.
display(train_set.describe())
display(test_set.describe())

In [None]:
# Show the data type pandas inferred for every column of the training set.
train_set.dtypes

In [None]:
# Pairwise correlation among the numeric features only
# (numeric_only=True leaves the non-numeric columns out).
correlation_matrix = train_set.corr(numeric_only = True)
print(correlation_matrix)

In [None]:
# Age distribution of the training set.
# The axes are labelled so the figure can be read on its own when skimming.
ax = train_set['age'].hist(bins=50)
ax.set(title='Age distribution (training set)', xlabel='age', ylabel='count')
plt.show()

In [None]:
# Number of training instances per value of the target column 'Subscribed'
# (subscribed vs. not subscribed).
train_set['Subscribed'].value_counts()

In [None]:
# Bar chart of the target distribution in the training set.
sns.countplot(x='Subscribed', data=train_set)
plt.show()

In [None]:
# Box plot of age, split by the value of the target 'Subscribed'.
sns.boxplot(x='Subscribed', y='age', data=train_set)
plt.show()

**Data Preprocessing**

In [None]:
# Count the missing entries in every column of the training set.
missing_mask = train_set.isna()
missing_mask.sum()

One-hot encoding for categorical variables

In [None]:
# Nominal columns that are expanded into one indicator column per category.
categorical_vars = ['job', 'marital', 'education', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
train_set_encoded = pd.get_dummies(train_set, columns=categorical_vars)
test_set_encoded = pd.get_dummies(test_set, columns=categorical_vars)

# The two splits are encoded independently, so a category that occurs in only
# one of them produces a column the other lacks. Align the test frame to the
# training schema: indicator columns missing from the test split are added as
# all-zero, columns unseen in training are dropped, and the column order is
# made identical so a model fitted on the training columns can score the test set.
test_set_encoded = test_set_encoded.reindex(columns=train_set_encoded.columns, fill_value=0)

Removing outliers

In [None]:
# Numerical columns, i.e. int64/float64 columns that are not part of the encoding.
numerical_cols = [col for col in train_set_encoded.columns if col not in categorical_vars and train_set_encoded[col].dtype in ['int64', 'float64']]

# Inter-quartile range of every numerical column.
Q1 = train_set_encoded[numerical_cols].quantile(0.25)
Q3 = train_set_encoded[numerical_cols].quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier when ANY numerical value lies outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# NOTE(review): a column whose IQR is 0 flags every value different from its
# quartile as an outlier — check how many rows survive this filter.
outlier_rows = ((train_set_encoded[numerical_cols] < (Q1 - 1.5 * IQR)) | (train_set_encoded[numerical_cols] > (Q3 + 1.5 * IQR))).any(axis=1)

# .copy() makes the result an independent frame; without it, later column
# assignments on train_set_filtered operate on a slice of train_set_encoded
# and raise pandas' SettingWithCopyWarning.
train_set_filtered = train_set_encoded[~outlier_rows].copy()

Min-max Normalization

In [None]:
# Min-max scale the numerical columns.
# The scaler is fitted on the (outlier-filtered) training data only and the
# same fitted scaler is then applied to the test data.
# NOTE(review): both frames are overwritten in place, so re-running this cell
# without re-running the previous ones scales already-scaled values.
scaler = MinMaxScaler()
train_set_filtered[numerical_cols] = scaler.fit_transform(train_set_filtered[numerical_cols])
test_set_encoded[numerical_cols] = scaler.transform(test_set_encoded[numerical_cols])

**Building the models**

Decision Tree

In [None]:
# Decision tree classifier with default hyper-parameters.
# random_state is fixed so that tie-breaking between equally good splits, and
# therefore the fitted tree and every metric derived from it, is reproducible
# across runs.
dt_model = DecisionTreeClassifier(random_state=42)

# Split the train and test sets into features and target.
x_train = train_set_filtered.drop('Subscribed', axis=1) # train: features only
y_train = train_set_filtered['Subscribed'] # train: target only
x_test = test_set_encoded.drop('Subscribed', axis=1) # test: features only
y_test = test_set_encoded['Subscribed'] # test: target only

# Train the model on the training split.
dt_model.fit(x_train, y_train)

# Use the trained model to make predictions on the test split.
dt_y_pred = dt_model.predict(x_test)

# Tree visualization.
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
# NOTE(review): the tree is fully grown, so the plot may be too dense to read
# at this figure size.
plt.figure(figsize=(12,8))
plot_tree(dt_model, filled=True, feature_names=list(x_train.columns), class_names=['No', 'Yes'], rounded=True)
plt.show()

K-nearest neighbour

In [None]:
# k-NN classifier, with k initially set to 7.
knn_model = KNeighborsClassifier(n_neighbors=7)
# Fit k-NN on the training features and target.
knn_model.fit(x_train, y_train)
# Predict the test-set labels.
knn_y_pred = knn_model.predict(x_test)


As part of k-NN, we can also perform cross-validation to choose the optimal value of k.

In [None]:
# Candidate values of k: 5 through 28.
neighbor_values = range(5, 29)

# Mean 10-fold cross-validated accuracy for each candidate, in the same order
# as neighbor_values.
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=candidate_k),
        x_train,
        y_train,
        cv=10,
        scoring='accuracy',
    ).mean()
    for candidate_k in neighbor_values
]

# Pick the candidate with the highest mean accuracy (the first one on ties).
best_score = max(cv_scores)
best_position = cv_scores.index(best_score)
optimal_k = neighbor_values[best_position]
print(f"The optimal number of neighbors is {optimal_k}")

**Evaluate the models**

Evaluating the decision tree

In [None]:
# Score the fitted decision tree on the test split.
dt_y_pred = dt_model.predict(x_test)

# Accuracy, its complement (error rate) and the per-class report.
dt_accuracy = accuracy_score(y_test, dt_y_pred)
dt_classification_report = classification_report(y_test, dt_y_pred)
dt_error_rate = 1 - dt_accuracy

# One line per item, same text as printing each item separately.
print(
    "Decision Tree Model Test Results:",
    f"Accuracy: {dt_accuracy}",
    f"Error Rate: {dt_error_rate}",
    "Classification Report:",
    dt_classification_report,
    sep="\n",
)

In [None]:
# ROC analysis for the decision tree.

# Encode the target as 1 for 'yes' and 0 otherwise.
# A comparison is used instead of Series.replace: replace's implicit
# object -> int downcast is deprecated (pandas 2.2), after which the 0/1
# values would remain in an object column that scikit-learn metrics reject.
y_test_binary = (y_test == 'yes').astype(int)

# Predicted probability of the positive class for each test instance.
# The column is looked up through classes_ rather than assumed to be index 1.
positive_col = list(dt_model.classes_).index('yes')
dt_y_pred_proba = dt_model.predict_proba(x_test)[:, positive_col]

# ROC curve
fpr, tpr, _ = roc_curve(y_test_binary, dt_y_pred_proba)

# AUC
roc_auc = roc_auc_score(y_test_binary, dt_y_pred_proba)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='brown', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='green', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - TREE')
plt.legend(loc="lower right")
plt.show()

Evaluating the k-nn

In [None]:
# Score the fitted k-NN model on the test split.
knn_y_pred = knn_model.predict(x_test)

# Accuracy, its complement (error rate) and the per-class report.
knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_classification_report = classification_report(y_test, knn_y_pred)
knn_error_rate = 1 - knn_accuracy

# One line per item, same text as printing each item separately.
print(
    "K-Nearest Neighbors Model Test Results:",
    f"Accuracy: {knn_accuracy}",
    f"Error Rate: {knn_error_rate}",
    "Classification Report:",
    knn_classification_report,
    sep="\n",
)

In [None]:
# ROC analysis for k-NN.

# Probability scores of the positive class ('yes'), the one we are most
# interested in predicting. The column is looked up through classes_ rather
# than assumed to be index 1.
positive_col = list(knn_model.classes_).index('yes')
knn_y_prob = knn_model.predict_proba(x_test)[:, positive_col]

# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, knn_y_prob, pos_label='yes')

# AUC
# Computed with roc_auc_score, which this notebook already imports; the
# previous call to auc() raised NameError because auc is never imported.
# The boolean target marks 'yes' as the positive class, matching pos_label.
roc_auc = roc_auc_score(y_test == 'yes', knn_y_prob)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkblue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - KNN')
plt.legend(loc="lower right")
plt.show()