In [226]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import log_loss 
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, f1_score

In [227]:
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [228]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9618 entries, 0 to 9617
Data columns (total 39 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   ID                                                                9618 non-null   object 
 1   country                                                           9618 non-null   object 
 2   owner_age                                                         9618 non-null   float64
 3   attitude_stable_business_environment                              9616 non-null   object 
 4   attitude_worried_shutdown                                         9616 non-null   object 
 5   compliance_income_tax                                             9614 non-null   object 
 6   perception_insurance_doesnt_cover_losses                          9613 non-null   object 
 7   perception_cannot_afford_insuranc

In [247]:
from scipy.stats import ttest_ind

male_age = train_data[train_data['owner_sex'] == 'Male']['business_age_total_months']
female_age = train_data[train_data['owner_sex'] == 'Female']['business_age_total_months']

ttest_ind(male_age, female_age, equal_var=False)


Ttest_indResult(statistic=7.542304960284633, pvalue=5.2063898977973154e-14)

In [246]:
train_data.groupby('owner_sex')['business_age_total_months'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
owner_sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,4303.0,77.102487,84.873927,0.0,24.0,48.0,97.0,627.0
Male,3371.0,92.550875,92.189271,0.0,28.0,60.0,122.0,677.0


In [229]:
def add_business_age_feature(df):

    df = df.copy()

    # Convert years to months and add
    df['business_age_total_months'] = (
        df['business_age_years'].fillna(0) * 12 +
        df['business_age_months'].fillna(0)
    )

    # Optional: also create years version (clean float)
    df['business_age_total_years'] = df['business_age_total_months'] / 12

    # Drop original columns
    df.drop(
        columns=['business_age_years', 'business_age_months'],
        inplace=True
    )

    return df


In [230]:
train_data = add_business_age_feature(train_data)
test_data = add_business_age_feature(test_data)


#### currency conversion

In [231]:
import numpy as np

def add_insurance_perception_features(df):

    df = df.copy()

    # Mapping dictionaries
    important_map = {
        "Yes": 1,
        "No": -1
    }

    doesnt_cover_map = {
        "No": 1,
        "Yes": -1
    }

    cannot_afford_map = {
        "No": 1,
        "Yes": -1
    }

    dont_insure_map = {
        "No": 1,
        "Yes": -1
    }

    # Apply mappings
    df['ins_important_score'] = df['perception_insurance_important'].map(important_map)
    df['ins_trust_score'] = df['perception_insurance_doesnt_cover_losses'].map(doesnt_cover_map)
    df['ins_afford_score'] = df['perception_cannot_afford_insurance'].map(cannot_afford_map)
    df['ins_access_score'] = df['perception_insurance_companies_dont_insure_businesses_like_yours'].map(dont_insure_map)

    # Replace NaN (e.g., Don't know / Refused) with 0
    score_cols = [
        'ins_important_score',
        'ins_trust_score',
        'ins_afford_score',
        'ins_access_score'
    ]

    df[score_cols] = df[score_cols].fillna(0)

    # Create final perception index
    df['insurance_perception_index'] = df[score_cols].mean(axis=1)

    return df


In [232]:
train_data = add_insurance_perception_features(train_data)
test_data = add_insurance_perception_features(test_data)


In [233]:
cols_to_drop = [
    'perception_insurance_important',
    'perception_insurance_doesnt_cover_losses',
    'perception_cannot_afford_insurance',
    'perception_insurance_companies_dont_insure_businesses_like_yours'
]

train_data.drop(columns=cols_to_drop)
test_data.drop(columns=cols_to_drop)

Unnamed: 0,ID,country,owner_age,attitude_stable_business_environment,attitude_worried_shutdown,compliance_income_tax,personal_income,business_expenses,business_turnover,motor_vehicle_insurance,...,motivation_make_more_money,uses_friends_family_savings,uses_informal_lender,business_age_total_months,business_age_total_years,ins_important_score,ins_trust_score,ins_afford_score,ins_access_score,insurance_perception_index
0,ID_5EGLKX,zimbabwe,50.0,No,No,No,100.0,3600.0,7200.0,Never had,...,,,,188.0,15.666667,1.0,1.0,-1.0,1.0,0.50
1,ID_4AI7RE,lesotho,36.0,Yes,Yes,No,900.0,400.0,900.0,Never had,...,Yes,Used to have but don't have now,Used to have but don't have now,60.0,5.000000,0.0,-1.0,-1.0,0.0,-0.50
2,ID_V9OB3M,lesotho,25.0,Don’t know or N/A,No,No,5250.0,350.0,1000.0,Never had,...,No,Never had,Used to have but don't have now,120.0,10.000000,0.0,0.0,0.0,0.0,0.00
3,ID_6OI9DI,malawi,25.0,Don’t know or N/A,Yes,No,485000.0,10000.0,20000.0,,...,Yes,Have now,Never had,48.0,4.000000,1.0,1.0,-1.0,0.0,0.25
4,ID_H2TN8B,lesotho,47.0,No,Yes,No,97.0,500.0,2000.0,Never had,...,Yes,Used to have but don't have now,Used to have but don't have now,12.0,1.000000,0.0,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2400,ID_FX7XJZ,eswatini,29.0,Yes,Yes,No,600.0,1700.0,2000.0,Never had,...,,Never had,Never had,71.0,5.916667,1.0,1.0,-1.0,1.0,0.50
2401,ID_XAL1LX,malawi,20.0,Don’t know or N/A,Don’t know or N/A,No,30000.0,20000.0,25000.0,,...,Yes,,,4.0,0.333333,0.0,0.0,0.0,0.0,0.00
2402,ID_UHBP0F,zimbabwe,26.0,Yes,Yes,No,3888.0,,,Never had,...,,,,48.0,4.000000,1.0,-1.0,-1.0,1.0,0.00
2403,ID_GKIKR2,eswatini,63.0,No,No,Yes,3500.0,1700.0,1200.0,Never had,...,,Never had,Never had,148.0,12.333333,1.0,-1.0,-1.0,0.0,-0.25


In [234]:


train_data['profit_margin'] = (
    (train_data['business_turnover'] - train_data['business_expenses']) /
    (train_data['business_turnover'] + 1e-6)
)

test_data['profit_margin'] = (
    (test_data['business_turnover'] - test_data['business_expenses']) /
    (test_data['business_turnover'] + 1e-6)
)


In [235]:
# Define x and y 
X = train_data.drop(columns=['Target', 'ID'])
y = train_data['Target']



X_test = test_data.drop(columns=['ID'])

In [236]:
def change_object_to_cat(df):
  # changes objects columns to category and returns dataframe and list

  df = df.copy()
  list_str_obj_cols = df.columns[df.dtypes == "object"].tolist()
  for str_obj_col in list_str_obj_cols:
      df[str_obj_col] = df[str_obj_col].astype("category")

  return df,list_str_obj_cols
X, cat_list = change_object_to_cat(X)

X

Unnamed: 0,country,owner_age,attitude_stable_business_environment,attitude_worried_shutdown,compliance_income_tax,perception_insurance_doesnt_cover_losses,perception_cannot_afford_insurance,personal_income,business_expenses,business_turnover,...,uses_friends_family_savings,uses_informal_lender,business_age_total_months,business_age_total_years,ins_important_score,ins_trust_score,ins_afford_score,ins_access_score,insurance_perception_index,profit_margin
0,eswatini,63.0,Yes,No,No,No,Yes,3000.0,6000.0,7000.0,...,Never had,Never had,174.0,14.500000,1.0,1.0,-1.0,-1.0,0.0,0.142857
1,zimbabwe,39.0,No,Yes,Yes,No,Yes,,,,...,,,183.0,15.250000,1.0,1.0,-1.0,1.0,0.5,
2,malawi,34.0,Don’t know or N/A,No,No,Don't know,Yes,30000.0,6000.0,13000.0,...,,,60.0,5.000000,1.0,0.0,-1.0,0.0,0.0,0.538462
3,malawi,28.0,Yes,No,No,No,No,180000.0,60000.0,30000.0,...,Never had,Have now,12.0,1.000000,1.0,1.0,1.0,1.0,1.0,-1.000000
4,zimbabwe,43.0,Yes,No,No,Yes,Yes,50.0,2400.0,1800.0,...,,,36.0,3.000000,1.0,-1.0,-1.0,-1.0,-0.5,-0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9613,lesotho,31.0,Yes,No,No,Don't know,Don't know,2000.0,800.0,500.0,...,Have now,Never had,36.0,3.000000,0.0,0.0,0.0,0.0,0.0,-0.600000
9614,malawi,64.0,Don’t know or N/A,Don’t know or N/A,Yes,No,No,10000000.0,5000000.0,200000000.0,...,,,240.0,20.000000,1.0,1.0,1.0,1.0,1.0,0.975000
9615,zimbabwe,43.0,Yes,No,No,Yes,No,450.0,,,...,,,20.0,1.666667,1.0,-1.0,1.0,1.0,0.5,
9616,zimbabwe,38.0,Yes,Yes,No,No,Yes,2000.0,,,...,,,72.0,6.000000,-1.0,1.0,-1.0,1.0,0.0,


In [237]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9618 entries, 0 to 9617
Data columns (total 43 columns):
 #   Column                                                            Non-Null Count  Dtype   
---  ------                                                            --------------  -----   
 0   country                                                           9618 non-null   category
 1   owner_age                                                         9618 non-null   float64 
 2   attitude_stable_business_environment                              9616 non-null   category
 3   attitude_worried_shutdown                                         9616 non-null   category
 4   compliance_income_tax                                             9614 non-null   category
 5   perception_insurance_doesnt_cover_losses                          9613 non-null   category
 6   perception_cannot_afford_insurance                                9613 non-null   category
 7   personal_income         

In [238]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=train_fe[['country', 'Target']]
)

In [239]:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Identify your columns
num_cols = X_train.select_dtypes(include=['float64']).columns
cat_cols = X_train.select_dtypes(include=['object','category']).columns

# 2. Define the Numerical Transformer
# Median is safer for financial data; Scaling is essential for most models
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 3. Define the Categorical Transformer
# handle_unknown='ignore' prevents the model from crashing if the test set 
# has a category the training set never saw.
# Updated for older Scikit-Learn versions
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # Change made here
])

# 4. Combine into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)

# 5. Fit and Transform X_train
# We "fit" only on train to avoid data leakage
# Fit and transform
# Fit and transform
X_train_array = preprocessor.fit_transform(X_train)
X_val_array = preprocessor.transform(X_val)

# Manually build feature names
num_features = num_cols.tolist()

ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_features = ohe.get_feature_names_out(cat_cols)

feature_names = num_features + list(cat_features)

# Convert to DataFrame
X_train_processed = pd.DataFrame(
    X_train_array,
    columns=feature_names,
    index=X_train.index
)

X_val_processed = pd.DataFrame(
    X_val_array,
    columns=feature_names,
    index=X_val.index
)




print(f"Original shape: {X_train.shape}")
print(f"Processed shape: {X_train_processed.shape}")

Original shape: (7694, 43)
Processed shape: (7694, 161)


In [240]:
X_train_processed

Unnamed: 0,owner_age,personal_income,business_expenses,business_turnover,business_age_total_months,business_age_total_years,ins_important_score,ins_trust_score,ins_afford_score,ins_access_score,...,uses_friends_family_savings_Never had,uses_friends_family_savings_Used to have but don't have now,uses_friends_family_savings_Used to have but don’t have now,uses_informal_lender_Don't know,uses_informal_lender_Don’t know (Do not show),uses_informal_lender_Have now,uses_informal_lender_Missing,uses_informal_lender_Never had,uses_informal_lender_Used to have but don't have now,uses_informal_lender_Used to have but don’t have now
5111,-1.104384,-0.095297,-0.070392,-0.144878,-0.513816,-0.513816,-1.731746,0.962263,-0.711384,-1.572387,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3681,0.552050,-0.096729,-0.071794,-0.146329,-0.227531,-0.227531,-1.731746,0.962263,1.715300,1.079508,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5087,-1.405554,-0.096727,-0.071741,-0.146133,-0.667969,-0.667969,0.875701,-0.232838,-0.711384,-0.246439,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2608,0.928512,-0.096698,-0.071798,-0.146393,-0.535837,-0.535837,-0.428023,0.962263,-0.711384,-0.246439,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7554,-0.803214,-0.093142,-0.071072,-0.144813,-0.667969,-0.667969,0.875701,0.962263,1.715300,1.079508,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5158,0.928512,-0.095657,-0.059979,-0.124657,-0.260564,-0.260564,0.875701,-1.427939,-0.711384,-1.572387,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1863,-0.727922,-0.096626,-0.071807,-0.146389,-0.535837,-0.535837,-0.428023,-0.232838,0.501958,-0.246439,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7278,0.025003,-0.096662,-0.071670,-0.146120,-0.767067,-0.767067,0.875701,0.962263,-0.711384,1.079508,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5116,-0.953799,-0.095297,-0.071664,-0.146251,-0.535837,-0.535837,-0.428023,0.962263,-0.711384,-0.246439,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [241]:


# Correlation with turnover
corr_years = X_train_processed['business_age_total_years'].corr(X_train_processed['business_turnover'])


print("Correlation (years) vs turnover:", corr_years)


Correlation (years) vs turnover: 0.038319207866934095


In [242]:
from sklearn.ensemble import RandomForestClassifier



# 1. Update the Pipeline with Random Forest

rf_model_pipe = Pipeline(steps=[

    ('preprocessor', preprocessor),

    ('classifier', RandomForestClassifier(

        n_estimators=200,

        max_depth= 10,  # Limiting depth to prevent overfitting the small 'High' class

        class_weight='balanced_subsample',

        random_state=42,

        n_jobs=-1

    ))

])

In [243]:

# The pipeline will now find the strings 'savings_assets_index', etc.
rf_model_pipe.fit(X, y)

# 3. Evaluate using the same DataFrame
rf_preds = rf_model_pipe.predict(X)
rf_probs = rf_model_pipe.predict_proba(X)

print("--- Random Forest Classification Report ---")
print(classification_report(y, rf_preds))

--- Random Forest Classification Report ---
              precision    recall  f1-score   support

        High       0.82      0.94      0.88       470
         Low       0.94      0.89      0.92      6280
      Medium       0.78      0.85      0.81      2868

    accuracy                           0.88      9618
   macro avg       0.85      0.90      0.87      9618
weighted avg       0.89      0.88      0.88      9618



In [244]:
predictions = rf_model_pipe.predict(X_test)
predictions

array(['Low', 'Medium', 'Low', ..., 'Medium', 'Medium', 'Low'],
      dtype=object)

In [245]:
p = pd.DataFrame({
    "ID": test_data["ID"],
    "Target": predictions
})


p.head(20)

Unnamed: 0,ID,Target
0,ID_5EGLKX,Low
1,ID_4AI7RE,Medium
2,ID_V9OB3M,Low
3,ID_6OI9DI,Low
4,ID_H2TN8B,Low
5,ID_U8T7ZQ,Medium
6,ID_QQJ3A1,Low
7,ID_F5S4JD,Low
8,ID_CY2C11,Medium
9,ID_63XVFI,Low
