In [29]:
# Import relevant libraries
import pandas as pd
import warnings 
from ydata_profiling import ProfileReport
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

In [32]:
warnings.filterwarnings("ignore")

In [2]:
# Specify file path
file_path="./African_crises_dataset.csv"

df = pd.read_csv(file_path)
df.head()

Unnamed: 0,country_number,country_code,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.14914,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.05168,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_number                   1059 non-null   int64  
 1   country_code                     1059 non-null   object 
 2   country                          1059 non-null   object 
 3   year                             1059 non-null   int64  
 4   systemic_crisis                  1059 non-null   int64  
 5   exch_usd                         1059 non-null   float64
 6   domestic_debt_in_default         1059 non-null   int64  
 7   sovereign_external_debt_default  1059 non-null   int64  
 8   gdp_weighted_default             1059 non-null   float64
 9   inflation_annual_cpi             1059 non-null   float64
 10  independence                     1059 non-null   int64  
 11  currency_crises                  1059 non-null   int64  
 12  inflation_crises    

In [4]:
df.isnull().sum()

country_number                     0
country_code                       0
country                            0
year                               0
systemic_crisis                    0
exch_usd                           0
domestic_debt_in_default           0
sovereign_external_debt_default    0
gdp_weighted_default               0
inflation_annual_cpi               0
independence                       0
currency_crises                    0
inflation_crises                   0
banking_crisis                     0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,country_number,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises
count,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0
mean,35.613787,1967.767705,0.077432,43.140831,0.03966,0.152975,0.006402,20848.89,0.776204,0.1322,0.129367
std,23.692402,33.530632,0.267401,111.47538,0.195251,0.360133,0.043572,675727.4,0.416984,0.349847,0.335765
min,1.0,1860.0,0.0,0.0,0.0,0.0,0.0,-28.50214,0.0,0.0,0.0
25%,15.0,1951.0,0.0,0.19535,0.0,0.0,0.0,2.086162,1.0,0.0,0.0
50%,38.0,1973.0,0.0,0.8684,0.0,0.0,0.0,5.76233,1.0,0.0,0.0
75%,56.0,1994.0,0.0,8.46275,0.0,0.0,0.0,11.64405,1.0,0.0,0.0
max,70.0,2014.0,1.0,744.306139,1.0,1.0,0.4,21989700.0,1.0,2.0,1.0


In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.columns

Index(['country_number', 'country_code', 'country', 'year', 'systemic_crisis',
       'exch_usd', 'domestic_debt_in_default',
       'sovereign_external_debt_default', 'gdp_weighted_default',
       'inflation_annual_cpi', 'independence', 'currency_crises',
       'inflation_crises', 'banking_crisis'],
      dtype='object')

In [None]:
profile_report = ProfileReport(df, title="African Crisis Data exploration", explorative=True)
profile_path = "African_Crisis.html"
profile_report.to_file(profile_path)
print(f"\n Profile report saved to: {profile_path}")

In [11]:
# Handling outliers

Q1 = df.quantile(0.25 , numeric_only= True)
Q3 = df.quantile(0.75 , numeric_only= True)
IQR = Q3 - Q1

# Define upper and lower bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_filtered, lower_bound = df.align(lower_bound, axis=1, copy=False)
df_filtered, upper_bound = df.align(upper_bound, axis=1, copy=False)

# Remove outliers
df_cleaned = df_filtered[~((df_filtered < lower_bound) | (df_filtered > upper_bound)).any(axis=1)]
df_cleaned

Unnamed: 0,banking_crisis,country,country_code,country_number,currency_crises,domestic_debt_in_default,exch_usd,gdp_weighted_default,independence,inflation_annual_cpi,inflation_crises,sovereign_external_debt_default,systemic_crisis,year
38,no_crisis,Algeria,DZA,1,0,0,4.937060e+00,0.0,1,6.599988,0,0,0,1968
39,no_crisis,Algeria,DZA,1,0,0,4.937060e+00,0.0,1,2.626632,0,0,0,1969
40,no_crisis,Algeria,DZA,1,0,0,4.937060e+00,0.0,1,3.656321,0,0,0,1970
41,no_crisis,Algeria,DZA,1,0,0,4.644000e+00,0.0,1,6.172816,0,0,0,1971
42,no_crisis,Algeria,DZA,1,0,0,4.556000e+00,0.0,1,4.734229,0,0,0,1972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,no_crisis,Zambia,ZMB,69,0,0,6.385600e+00,0.0,1,7.811000,0,0,0,2014
1026,no_crisis,Zimbabwe,ZWE,70,0,0,7.230000e-27,0.0,1,13.200000,0,0,0,1981
1030,no_crisis,Zimbabwe,ZWE,70,0,0,1.630000e-26,0.0,1,8.300000,0,0,0,1985
1031,no_crisis,Zimbabwe,ZWE,70,0,0,1.680000e-26,0.0,1,14.500000,0,0,0,1986


In [12]:
# Using one hot encoder to encode categorical values
df= pd.get_dummies(df, columns=['country', 'country_code', 'banking_crisis'], drop_first=True)
df.shape

(1059, 36)

In [13]:
# Splitting the data to features and target
target = df['systemic_crisis']
features = df.drop(columns='systemic_crisis')
X = features
y = target

X_train,X_test,y_train,y_test = train_test_split(X ,y ,test_size = 0.2,random_state = 42 ,stratify=y)

In [15]:
# Initialize and train Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Making predictions using the model
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
# Measuring the accuracy of a model
cm = confusion_matrix(y_test,y_pred)
cm

array([[194,   2],
       [  3,  13]])

In [24]:
# checking the cuuracy of the model
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9764150943396226

In [33]:
# Using cross validation k-5 folds
cv_scores = cross_val_score(model,X,y, cv=10)
cv_scores

array([0.97169811, 0.93396226, 1.        , 0.93396226, 0.99056604,
       0.95283019, 0.99056604, 0.97169811, 1.        , 0.95238095])

In [None]:
# retraining the model using user
# Create new features or transform existing ones to better capture the underlying patterns in the data:
# Cluster common data points
# Normalize or standardize features to ensure they're on the same scale