In [None]:
!pip install ydata-profiling

In [4]:
import pandas as pd
import seaborn as sns
from ydata_profiling import ProfileReport

In [5]:
df = pd.read_csv('African_crises_dataset.csv')
df

Unnamed: 0,country_number,country_code,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.149140,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.051680,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054,70,ZWE,Zimbabwe,2009,1,354.800000,1,1,0.0,-7.670000,1,1,0,crisis
1055,70,ZWE,Zimbabwe,2010,0,378.200000,1,1,0.0,3.217000,1,0,0,no_crisis
1056,70,ZWE,Zimbabwe,2011,0,361.900000,1,1,0.0,4.920000,1,0,0,no_crisis
1057,70,ZWE,Zimbabwe,2012,0,361.900000,1,1,0.0,3.720000,1,0,0,no_crisis


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_number                   1059 non-null   int64  
 1   country_code                     1059 non-null   object 
 2   country                          1059 non-null   object 
 3   year                             1059 non-null   int64  
 4   systemic_crisis                  1059 non-null   int64  
 5   exch_usd                         1059 non-null   float64
 6   domestic_debt_in_default         1059 non-null   int64  
 7   sovereign_external_debt_default  1059 non-null   int64  
 8   gdp_weighted_default             1059 non-null   float64
 9   inflation_annual_cpi             1059 non-null   float64
 10  independence                     1059 non-null   int64  
 11  currency_crises                  1059 non-null   int64  
 12  inflation_crises    

In [7]:
df.isnull().sum()

country_number                     0
country_code                       0
country                            0
year                               0
systemic_crisis                    0
exch_usd                           0
domestic_debt_in_default           0
sovereign_external_debt_default    0
gdp_weighted_default               0
inflation_annual_cpi               0
independence                       0
currency_crises                    0
inflation_crises                   0
banking_crisis                     0
dtype: int64

In [8]:
df.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
country_number,1059.0,,,,35.613787,23.692402,1.0,15.0,38.0,56.0,70.0
country_code,1059.0,13.0,EGY,155.0,,,,,,,
country,1059.0,13.0,Egypt,155.0,,,,,,,
year,1059.0,,,,1967.767705,33.530632,1860.0,1951.0,1973.0,1994.0,2014.0
systemic_crisis,1059.0,,,,0.077432,0.267401,0.0,0.0,0.0,0.0,1.0
exch_usd,1059.0,,,,43.140831,111.47538,0.0,0.19535,0.8684,8.46275,744.306139
domestic_debt_in_default,1059.0,,,,0.03966,0.195251,0.0,0.0,0.0,0.0,1.0
sovereign_external_debt_default,1059.0,,,,0.152975,0.360133,0.0,0.0,0.0,0.0,1.0
gdp_weighted_default,1059.0,,,,0.006402,0.043572,0.0,0.0,0.0,0.0,0.4
inflation_annual_cpi,1059.0,,,,20848.892444,675727.429176,-28.502137,2.086162,5.76233,11.644048,21989695.22


In [9]:
df.duplicated().sum()

0

In [11]:
profile = ProfileReport(df, title='African Crises Report', explorative=True)

In [None]:
profile.to_file('african_crises_report.html')

In [13]:
# Select all categorical columns and count the occurrences of unique values in each
# Making easier to analyze the distribution of categories in the dataset.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

# Apply value_counts() to each categorical column and display the results
value_counts = {col: df[col].value_counts() for col in categorical_columns}

# Display the value counts for each categorical column
for col, counts in value_counts.items():
    print(f"Value counts for column {col}:\n{counts}\n")


Value counts for column country_code:
country_code
EGY    155
ZAF    114
ZWE     90
DZA     85
AGO     77
MAR     75
TUN     75
ZMB     72
MUS     68
KEN     67
CIV     63
NGA     60
CAF     58
Name: count, dtype: int64

Value counts for column country:
country
Egypt                       155
South Africa                114
Zimbabwe                     90
Algeria                      85
Angola                       77
Morocco                      75
Tunisia                      75
Zambia                       72
Mauritius                    68
Kenya                        67
Ivory Coast                  63
Nigeria                      60
Central African Republic     58
Name: count, dtype: int64

Value counts for column banking_crisis:
banking_crisis
no_crisis    965
crisis        94
Name: count, dtype: int64



In [17]:
# Encoding categorical values

from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder for binary column 
le = LabelEncoder()

# Fit and transform the 'country' column
df['banking_crisis'] = le.fit_transform(df['banking_crisis'])

# Using freuency encoding for object type column with multiple unique values

# Calculate frequency of each country
frequency = df['country'].value_counts()

# Map frequency to each country
df['country_encoded'] = df['country'].map(frequency)



In [36]:
df.head()

Unnamed: 0,country_number,country_code,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis,country_encoded
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,0,85
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.14914,0,0,0,1,85
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,1,85
3,1,DZA,Algeria,1873,0,0.05168,0,0,0.0,11.203897,0,0,0,1,85
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,1,85


In [20]:
l_df = df.drop(['country','country_code'], axis=1)

In [23]:
l_df.corr()['systemic_crisis'].apply(abs).sort_values()

gdp_weighted_default               0.005274
country_number                     0.010991
inflation_annual_cpi               0.106452
currency_crises                    0.112751
domestic_debt_in_default           0.122158
country_encoded                    0.122828
independence                       0.147083
inflation_crises                   0.172562
year                               0.197450
exch_usd                           0.202687
sovereign_external_debt_default    0.249850
banking_crisis                     0.853702
systemic_crisis                    1.000000
Name: systemic_crisis, dtype: float64

In [None]:
l_df.drop(['gdp_weighted_default','country_number'], axis=1, inplace=True)
      

In [28]:
l_df

Unnamed: 0,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis,country_encoded
0,1870,1,0.052264,0,0,3.441456,0,0,0,0,85
1,1871,0,0.052798,0,0,14.149140,0,0,0,1,85
2,1872,0,0.052274,0,0,-3.718593,0,0,0,1,85
3,1873,0,0.051680,0,0,11.203897,0,0,0,1,85
4,1874,0,0.051308,0,0,-3.848561,0,0,0,1,85
...,...,...,...,...,...,...,...,...,...,...,...
1054,2009,1,354.800000,1,1,-7.670000,1,1,0,0,90
1055,2010,0,378.200000,1,1,3.217000,1,0,0,1,90
1056,2011,0,361.900000,1,1,4.920000,1,0,0,1,90
1057,2012,0,361.900000,1,1,3.720000,1,0,0,1,90


In [30]:
features = l_df.drop(['systemic_crisis'], axis=1)
features

Unnamed: 0,year,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis,country_encoded
0,1870,0.052264,0,0,3.441456,0,0,0,0,85
1,1871,0.052798,0,0,14.149140,0,0,0,1,85
2,1872,0.052274,0,0,-3.718593,0,0,0,1,85
3,1873,0.051680,0,0,11.203897,0,0,0,1,85
4,1874,0.051308,0,0,-3.848561,0,0,0,1,85
...,...,...,...,...,...,...,...,...,...,...
1054,2009,354.800000,1,1,-7.670000,1,1,0,0,90
1055,2010,378.200000,1,1,3.217000,1,0,0,1,90
1056,2011,361.900000,1,1,4.920000,1,0,0,1,90
1057,2012,361.900000,1,1,3.720000,1,0,0,1,90


In [31]:
#Initializing Scaler for input features
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Fit and transform the data
scaler = StandardScaler()
s_feat = scaler.fit_transform(features)

In [None]:
pip install xgboost

In [35]:
#Import required libraries
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

#Set a random seed for reproducibility
import numpy as np
np.random.seed(42)


In [37]:
#Loading dataset 
X = s_feat
y = df['systemic_crisis']  # Target column

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [39]:
#Initialize Classifiers
log_reg = LogisticRegression()
random_forest = RandomForestClassifier()
xgb = XGBClassifier()
svm = SVC(kernel='linear')  


In [40]:
#Implementing Cross-validation

# Define cross-validation folds
cv_folds = 5

# Logistic Regression
log_reg_scores = cross_val_score(log_reg, X_train, y_train, cv=cv_folds, scoring='accuracy')

# Random Forest
rf_scores = cross_val_score(random_forest, X_train, y_train, cv=cv_folds, scoring='accuracy')

# XGBoost
xgb_scores = cross_val_score(xgb, X_train, y_train, cv=cv_folds, scoring='accuracy')

# Support Vector Machine
svm_scores = cross_val_score(svm, X_train, y_train, cv=cv_folds, scoring='accuracy')


In [41]:
print("Logistic Regression Accuracy:", log_reg_scores.mean())
print("Random Forest Accuracy:", rf_scores.mean())
print("XGBoost Accuracy:", xgb_scores.mean())
print("SVM Accuracy:", svm_scores.mean())


Logistic Regression Accuracy: 0.9716602854159415
Random Forest Accuracy: 0.9870170553428472
XGBoost Accuracy: 0.98820048729551
SVM Accuracy: 0.9740341106856943


In [42]:
# Using SVM 'rbf' kennel(used for non-linear boundary decision)
svm = SVC(kernel='rbf')
svm_rbf_scores = cross_val_score(svm, X_train, y_train, cv=cv_folds, scoring='accuracy')
print("SVM rbf Accuracy:", svm_scores.mean())

SVM rbf Accuracy: 0.9740341106856943


In [62]:
# Using RadomForest and XGBoost to build model

#XGBoost
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Train XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict and evaluate XGBoost on the test set
y_test_pred_xgb = xgb_model.predict(X_test)
y_test_pred_proba_xgb = xgb_model.predict_proba(X_test)

accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
precision_xgb = precision_score(y_test, y_test_pred_xgb)
recall_xgb = recall_score(y_test, y_test_pred_xgb)
f1_xgb = f1_score(y_test, y_test_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_test_pred_proba_xgb[:, 1])
conf_matrix_xgb = confusion_matrix(y_test, y_test_pred_xgb)

print("XGBoost Classifier Metrics on Test Set:")
print(f"Accuracy: {accuracy_xgb}")
print(f"Precision: {precision_xgb}")
print(f"Recall: {recall_xgb}")
print(f"F1 Score: {f1_xgb}")
print(f"ROC-AUC Score: {roc_auc_xgb}")
print(f"Confusion Matrix:\n{conf_matrix_xgb}")




XGBoost Classifier Metrics on Test Set:
Accuracy: 0.9952830188679245
Precision: 1.0
Recall: 0.9411764705882353
F1 Score: 0.9696969696969697
ROC-AUC Score: 0.9990950226244344
Confusion Matrix:
[[195   0]
 [  1  16]]


In [57]:
#Random Forest 

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate Random Forest on the test set
y_test_pred_rf = rf_model.predict(X_test)
y_test_pred_pb_rf = rf_model.predict_proba(X_test)

accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
precision_rf = precision_score(y_test, y_test_pred_rf)
recall_rf = recall_score(y_test, y_test_pred_rf)
f1_rf = f1_score(y_test, y_test_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_test_pred_pb_rf[:, 1])
conf_matrix_rf = confusion_matrix(y_test, y_test_pred_rf)

print("\nRandom Forest Classifier Metrics on Test Set:")
print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1 Score: {f1_rf}")
print(f"ROC-AUC Score: {roc_auc_rf}")
print(f"Confusion Matrix:\n{conf_matrix_rf}")


Random Forest Classifier Metrics on Test Set:
Accuracy: 0.9952830188679245
Precision: 1.0
Recall: 0.9411764705882353
F1 Score: 0.9696969696969697
ROC-AUC Score: 1.0
Confusion Matrix:
[[195   0]
 [  1  16]]
