In [86]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier


In [87]:
df = pd.read_csv("African_crises_dataset.csv")

In [88]:
df.head()

Unnamed: 0,country_number,country_code,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.14914,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.05168,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis


In [89]:
df['inflation_crises'].unique()

array([0, 1])

In [90]:
df = df.drop(['country_number'], axis=1)

In [91]:
df = df.drop(['country_code'], axis=1)

In [92]:
le = LabelEncoder()

In [93]:
df['country'] = le.fit_transform(df['country'])

In [94]:
df['banking_crisis'] = le.fit_transform(df['banking_crisis'])

In [95]:
df['banking_crisis'].value_counts(normalize=True)

banking_crisis
1    0.911237
0    0.088763
Name: proportion, dtype: float64

In [96]:
X = df.drop(['banking_crisis'], axis=1)

In [97]:
y = df['banking_crisis'] 

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [99]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [100]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

In [101]:
rfc = RandomForestClassifier()

In [102]:
pipeline = Pipeline([
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier())
])

In [103]:
cv_scores = cross_val_score(pipeline, X_train_smote, y_train_smote, cv=cv, n_jobs=5)

In [104]:
params = {
    'rfc__n_estimators': range(25, 100, 25),
    'rfc__max_depth': range(10, 50, 10)
}

In [105]:
cross_val = cross_val_score(pipeline, X_train_smote, y_train_smote, cv=cv, n_jobs=1)
print(cross_val)

[0.97411003 0.97411003 0.99350649 0.99350649 0.99025974]


In [106]:
model = GridSearchCV(
    pipeline,
    param_grid=params,
    cv=cv,
    n_jobs=1,
    verbose=1
)

In [107]:
model.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [108]:
model.score(X_train_smote, y_train_smote)

1.0

In [109]:
print(accuracy_score(y_test, model.predict(X_test)))

0.9764150943396226


In [110]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.83      0.86        18
           1       0.98      0.99      0.99       194

    accuracy                           0.98       212
   macro avg       0.93      0.91      0.92       212
weighted avg       0.98      0.98      0.98       212



In [111]:
pipeline1 = Pipeline([
    ('ss', StandardScaler()),
    ('rfc', DecisionTreeClassifier())
])

In [112]:
params1 = {
    'rfc__max_depth': range(10, 50, 10),
    'rfc__min_samples_split': [2, 5, 10],
    'rfc__min_samples_leaf': [1, 2, 5]
}

In [113]:
model1 = GridSearchCV(
    pipeline1,
    param_grid=params1,
    cv=cv,
    n_jobs=1,
    verbose=1
)

In [114]:
model1.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [115]:
model1.score(X_train_smote, y_train_smote)

0.9902723735408561

In [116]:
print(classification_report(y_test, model1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84        18
           1       0.99      0.98      0.98       194

    accuracy                           0.97       212
   macro avg       0.89      0.93      0.91       212
weighted avg       0.97      0.97      0.97       212



In [117]:
from xgboost import XGBClassifier

In [118]:
xgb = XGBRegressor()

In [119]:
cv_score2 = cross_val_score(xgb, X_train_smote, y_train_smote, cv=cv, n_jobs=5)

In [132]:
pipeline2 = Pipeline([
    ('ss', StandardScaler()),
    ('xgb', XGBClassifier())
])

In [133]:
params2 = {
    "xgb__n_estimators": range(10, 50, 10),
    'xgb__max_depth': range(10, 40, 10)
}

In [134]:
model2 = GridSearchCV(
    pipeline2,
    param_grid = params2,
    cv=cv,
    n_jobs=5,
    verbose=1
)

In [135]:
model2.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [138]:
model2.score(X_train, y_train)

0.9988193624557261

In [137]:
print(classification_report(y_test, model2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86        18
           1       0.99      0.98      0.99       194

    accuracy                           0.98       212
   macro avg       0.92      0.94      0.93       212
weighted avg       0.98      0.98      0.98       212

