pip install imblearn scikit-learn pandas openpyxl

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [2]:
data = pd.read_excel("Credit_card.xlsx")
data.head(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
5,6,50000,1,1,2,37,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
6,7,500000,1,1,2,29,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
7,8,100000,2,2,2,23,0,-1,-1,0,...,221,-159,567,380,601,0,581,1687,1542,0
8,9,140000,2,3,1,28,0,0,2,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
9,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0


In [3]:
train, test = train_test_split(data, test_size=0.3, shuffle=True)
train.shape, test.shape

((21000, 25), (9000, 25))

In [4]:
x_train = train.drop(["default payment next month"], axis=1)
y_train = train["default payment next month"]
x_test = test.drop(["default payment next month"], axis=1)
y_test = test["default payment next month"]

####  <span style='color:White'> Decision Tree without hyperparameter tuning </span>

In [5]:
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(x_train, y_train)

In [6]:
predictions = tree.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))

Accuracy: 0.8185555555555556
F1 measure: 0.46087817761637506


####  <span style='color:White'> Decision Tree with hyperparameter tuning </span>

In [7]:
estimator = DecisionTreeClassifier()

hyperparam_space = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [5, 7, 9],
}

grid = GridSearchCV(estimator, hyperparam_space, cv=5, scoring="roc_auc", n_jobs=-1)

grid.fit(x_train, y_train)
predictions = grid.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))
print("ROC AUC:", grid.best_score_)
print("Best hyperparameters:", grid.best_params_)

Accuracy: 0.817
F1 measure: 0.44489383215369055
ROC AUC: 0.7527023306424416
Best hyperparameters: {'criterion': 'log_loss', 'max_depth': 7}


####  <span style='color:White'> Random Forest without hyperparameter tuning </span>

In [8]:
forest = RandomForestClassifier(n_estimators=10, max_depth=5)
forest.fit(x_train, y_train)

In [9]:
predictions = forest.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))

Accuracy: 0.8074444444444444
F1 measure: 0.38828097423226265


####  <span style='color:White'> Random Forest with hyperparameter tuning </span>

In [10]:
estimator = RandomForestClassifier()

hyperparam_space = {
    "n_estimators": [10, 20, 40],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [5, 7, 9],
}

grid = GridSearchCV(estimator, hyperparam_space, cv=5, scoring="roc_auc", n_jobs=-1)

grid.fit(x_train, y_train)
predictions = grid.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))
print("ROC AUC:", grid.best_score_)
print("Best hyperparameters:", grid.best_params_)

Accuracy: 0.8162222222222222
F1 measure: 0.4591236102027469
ROC AUC: 0.7823911809185566
Best hyperparameters: {'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 40}


###  <span style='color:White'>Under-sampling</span>

In [11]:
rus = RandomUnderSampler()
x_train_res, y_train_res = rus.fit_resample(x_train, y_train)
x_train_res.shape, y_train_res.shape

((9280, 24), (9280,))

####  <span style='color:White'>Decision tree with under-sampling</span>

In [12]:
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(x_train_res, y_train_res)

In [13]:
predictions = tree.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))

Accuracy: 0.7733333333333333
F1 measure: 0.5128939828080229


####  <span style='color:White'>Random forest with under-sampling</span>

In [14]:
forest = RandomForestClassifier(n_estimators=10, max_depth=5)
forest.fit(x_train_res, y_train_res)

In [15]:
predictions = forest.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))

Accuracy: 0.7591111111111111
F1 measure: 0.5253940455341506


###  <span style='color:White'>Over-sampling</span>

In [16]:
ros = RandomOverSampler()
x_train_res, y_train_res = ros.fit_resample(x_train, y_train)
x_train_res.shape, y_train_res.shape

((32720, 24), (32720,))

####  <span style='color:White'>Decision tree with over-sampling</span>

In [17]:
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(x_train_res, y_train_res)

In [18]:
predictions = tree.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))

Accuracy: 0.776
F1 measure: 0.5123367198838896


####  <span style='color:White'>Random forest with over-sampling</span>

In [19]:
forest = RandomForestClassifier(n_estimators=10, max_depth=5)
forest.fit(x_train_res, y_train_res)

In [20]:
predictions = forest.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))

Accuracy: 0.7735555555555556
F1 measure: 0.5251630941286115


###  <span style='color:White'>SMOTE</span>

In [21]:
smote = SMOTE()
x_train_res, y_train_res = smote.fit_resample(x_train, y_train)
x_train_res.shape, y_train_res.shape

((32720, 24), (32720,))

####  <span style='color:White'>Decision tree with SMOTE</span>

In [22]:
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(x_train_res, y_train_res)

In [23]:
predictions = tree.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))

Accuracy: 0.757
F1 measure: 0.4766690595836324


####  <span style='color:White'>Random forest with SMOTE</span>

In [24]:
forest = RandomForestClassifier(n_estimators=10, max_depth=5)
forest.fit(x_train_res, y_train_res)

In [25]:
predictions = forest.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("F1 measure:", f1_score(y_test, predictions))

Accuracy: 0.7591111111111111
F1 measure: 0.5029802842732692
