In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import (
                                BaggingClassifier,
                                ExtraTreesClassifier,
                                RandomForestClassifier,
                                StackingClassifier,
                                HistGradientBoostingClassifier
                            )
from xgboost import XGBClassifier

import pandas as pd
import warnings

warnings.filterwarnings("ignore")

df = pd.read_csv("v2_train.csv")



# Choose 100 sample different credit score.
score1 = df[df["Credit_Score"] == 1].sample(n=100, random_state=42)
score2 = df[df["Credit_Score"] == 2].sample(n=100, random_state=42)
score3 = df[df["Credit_Score"] == 3].sample(n=100, random_state=42)

# Concat score 1 2 3 df
data = pd.concat([score1, score2, score3], ignore_index=False)

# Delete the selected samples from the original dataframe
df_train = df.drop(data.index,axis=0).reset_index(drop=True)

data['Credit_Score'].tail(100)



30668    3
29935    3
80909    3
66605    3
58347    3
        ..
95010    3
18262    3
4786     3
91285    3
41786    3
Name: Credit_Score, Length: 100, dtype: int64

In [6]:
df.columns

Index(['Age', 'Occupation', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance', 'Credit_Score'],
      dtype='object')

### Apply oversampling


In [7]:
# define dataset
X, y = df.drop("Credit_Score",axis=1).values , df["Credit_Score"] 

# y.value_counts(normalize=True)
# rus = SMOTE(sampling_strategy='auto')
# X_data_rus, y_data_rus = rus.fit_resample(X, y)
# y_data_rus.value_counts(normalize=True)

### Split Dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Handling Numerical

In [9]:
# scalar = PowerTransformer(method='yeo-johnson', standardize=True).fit(X_train)
# X_train = scalar.transform(X_train)
# X_test = scalar.transform(X_test)

## Modeling and Evaluation

Model Building

In [10]:
bagging = BaggingClassifier()
extraTrees = ExtraTreesClassifier(max_depth=10)
randomForest = RandomForestClassifier(random_state=42)
histGradientBoosting = HistGradientBoostingClassifier()
XGB = XGBClassifier()

model = StackingClassifier(
    estimators=[
    ('bagging', bagging),
    ('extraTress', extraTrees),
    ('randomforest', randomForest),
    ('histGradientBoosting', histGradientBoosting),
    ('XGB', XGB)
], n_jobs=-1)

In [11]:
model.fit(X_train, y_train)

In [12]:
print("Test Score:",model.score(X_test, y_test))

Test Score: 0.79005


In [13]:
# import pickle
# with open('model.pkl', 'wb') as f:
#     pickle.dump(model, f)

In [14]:
data = data.drop('Credit_Score', axis=1)


predictions = model.predict(data[:100])

# Print the predictions
print(predictions)

predictions = model.predict(data[100:200])

# Print the predictions
print(predictions)

predictions = model.predict(data[200:300])

# Print the predictions
print(predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2]
[3 3 3 2 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 2 3 3 3 3 3 3 3
 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 2 3 3 2 2 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
