In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [9]:
random_state = 42
cv_number = 3

In [10]:
train = pd.read_csv('data/train_data_features_selected_upsampled.csv')
test = pd.read_csv('data/test_data_features_selected.csv')

train_y = train["Credit_Score"]
train_x = train.drop(["Credit_Score"], axis=1)

test_y = test["Credit_Score"]
test_x = test.drop(["Credit_Score"], axis=1)

In [11]:
test_x_without_nan = test_x.fillna(0)

standard_scaled_train_x = StandardScaler().fit_transform(train_x)
standard_scaled_test_x = StandardScaler().fit_transform(test_x_without_nan)

minmax_scaled_train_x = MinMaxScaler().fit_transform(train_x)
minmax_scaled_test_x = MinMaxScaler().fit_transform(test_x_without_nan)

In [12]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report

bagging = BaggingClassifier(n_estimators=100, random_state=random_state)

bagging.fit(standard_scaled_train_x, train_y)
predicted_y = bagging.predict(standard_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.50      0.66      0.57      3527
        Poor       0.41      0.05      0.08      5874
    Standard       0.59      0.81      0.68     10599

    accuracy                           0.56     20000
   macro avg       0.50      0.50      0.44     20000
weighted avg       0.52      0.56      0.49     20000


In [13]:
bagging.fit(minmax_scaled_train_x, train_y)
predicted_y = bagging.predict(minmax_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.46      0.76      0.57      3527
        Poor       0.43      0.03      0.06      5874
    Standard       0.59      0.77      0.67     10599

    accuracy                           0.55     20000
   macro avg       0.49      0.52      0.43     20000
weighted avg       0.52      0.55      0.47     20000


In [14]:
random_forest = RandomForestClassifier(criterion='gini', max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=random_state)

random_forest.fit(standard_scaled_train_x, train_y)
predicted_y = random_forest.predict(standard_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.51      0.72      0.59      3527
        Poor       0.53      0.03      0.07      5874
    Standard       0.59      0.82      0.69     10599

    accuracy                           0.57     20000
   macro avg       0.54      0.52      0.45     20000
weighted avg       0.56      0.57      0.49     20000


In [15]:
random_forest.fit(minmax_scaled_train_x, train_y)
predicted_y = random_forest.predict(minmax_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.44      0.83      0.58      3527
        Poor       0.54      0.01      0.02      5874
    Standard       0.60      0.75      0.67     10599

    accuracy                           0.55     20000
   macro avg       0.53      0.53      0.42     20000
weighted avg       0.55      0.55      0.46     20000


In [16]:
knn_model = KNeighborsClassifier(algorithm='auto', n_neighbors=2, p=1, weights='distance')

knn_model.fit(standard_scaled_train_x, train_y)
predicted_y = knn_model.predict(standard_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.50      0.49      0.49      3527
        Poor       0.62      0.47      0.53      5874
    Standard       0.65      0.74      0.70     10599

    accuracy                           0.62     20000
   macro avg       0.59      0.57      0.57     20000
weighted avg       0.62      0.62      0.61     20000


In [17]:
knn_model.fit(minmax_scaled_train_x, train_y)
predicted_y = knn_model.predict(minmax_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.42      0.52      0.47      3527
        Poor       0.54      0.32      0.40      5874
    Standard       0.62      0.70      0.66     10599

    accuracy                           0.56     20000
   macro avg       0.53      0.52      0.51     20000
weighted avg       0.56      0.56      0.55     20000


In [18]:
d_tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=None, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, random_state=random_state, splitter='best')

d_tree_model.fit(minmax_scaled_train_x, train_y)
predicted_y = d_tree_model.predict(minmax_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.29      0.65      0.40      3527
        Poor       0.55      0.16      0.25      5874
    Standard       0.58      0.56      0.57     10599

    accuracy                           0.46     20000
   macro avg       0.47      0.46      0.41     20000
weighted avg       0.52      0.46      0.45     20000


In [19]:
d_tree_model.fit(standard_scaled_train_x, train_y)
predicted_y = d_tree_model.predict(standard_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.40      0.46      0.43      3527
        Poor       0.34      0.13      0.19      5874
    Standard       0.56      0.73      0.64     10599

    accuracy                           0.51     20000
   macro avg       0.44      0.44      0.42     20000
weighted avg       0.47      0.51      0.47     20000


In [20]:
from sklearn.svm import SVC

svc = SVC(C=50, random_state=random_state)

svc.fit(standard_scaled_train_x, train_y)
predicted_y = svc.predict(standard_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.48      0.51      0.50      3527
        Poor       0.57      0.38      0.46      5874
    Standard       0.63      0.74      0.68     10599

    accuracy                           0.59     20000
   macro avg       0.56      0.54      0.55     20000
weighted avg       0.59      0.59      0.58     20000


In [21]:
svc.fit(minmax_scaled_train_x, train_y)
predicted_y = svc.predict(minmax_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.44      0.77      0.56      3527
        Poor       0.56      0.24      0.34      5874
    Standard       0.63      0.67      0.65     10599

    accuracy                           0.56     20000
   macro avg       0.54      0.56      0.52     20000
weighted avg       0.58      0.56      0.54     20000


In [22]:
g_boosting = GradientBoostingClassifier(learning_rate=1.0, max_features='sqrt', random_state=random_state)

g_boosting.fit(standard_scaled_train_x, train_y)
predicted_y = g_boosting.predict(standard_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.45      0.74      0.56      3527
        Poor       0.60      0.21      0.31      5874
    Standard       0.62      0.70      0.66     10599

    accuracy                           0.57     20000
   macro avg       0.56      0.55      0.51     20000
weighted avg       0.58      0.57      0.54     20000


In [23]:
g_boosting.fit(minmax_scaled_train_x, train_y)
predicted_y = g_boosting.predict(minmax_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.31      0.88      0.46      3527
        Poor       0.52      0.15      0.23      5874
    Standard       0.60      0.47      0.53     10599

    accuracy                           0.45     20000
   macro avg       0.48      0.50      0.41     20000
weighted avg       0.52      0.45      0.43     20000


In [24]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression(C=1, penalty='none', random_state=random_state)

logistic_regression.fit(minmax_scaled_train_x, train_y)
predicted_y = logistic_regression.predict(minmax_scaled_test_x)

print(classification_report(test_y, predicted_y))



              precision    recall  f1-score   support

        Good       0.30      0.90      0.45      3527
        Poor       0.67      0.01      0.01      5874
    Standard       0.56      0.50      0.53     10599

    accuracy                           0.43     20000
   macro avg       0.51      0.47      0.33     20000
weighted avg       0.55      0.43      0.36     20000


In [25]:
logistic_regression = LogisticRegression(C=1, penalty='l2', random_state=random_state)

logistic_regression.fit(standard_scaled_train_x, train_y)
predicted_y = logistic_regression.predict(standard_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.47      0.78      0.58      3527
        Poor       0.58      0.34      0.43      5874
    Standard       0.66      0.67      0.67     10599

    accuracy                           0.59     20000
   macro avg       0.57      0.60      0.56     20000
weighted avg       0.61      0.59      0.58     20000


In [26]:
from sklearn.naive_bayes import MultinomialNB

nb_clf = MultinomialNB(alpha=2.0, fit_prior=True)

nb_clf.fit(minmax_scaled_train_x, train_y)
predicted_y = nb_clf.predict(minmax_scaled_test_x)

print(classification_report(test_y, predicted_y))

              precision    recall  f1-score   support

        Good       0.42      0.86      0.56      3527
        Poor       0.58      0.50      0.54      5874
    Standard       0.70      0.51      0.59     10599

    accuracy                           0.57     20000
   macro avg       0.57      0.62      0.56     20000
weighted avg       0.62      0.57      0.57     20000
