In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test1 = pd.read_csv('test.csv')

In [3]:
df.drop(['ID','Candidate','Constituency ∇'],axis='columns',inplace=True)
df_test.drop(['ID','Candidate','Constituency ∇'],axis='columns',inplace=True)
df.nunique()

Party             23
Criminal Case     35
Total Assets     210
Liabilities      170
state             28
Education         10
dtype: int64

In [4]:
def preprocess_amount(amount_str):
    if 'Crore' in amount_str:
        cleaned_str = ''.join(filter(str.isdigit, amount_str))
        amount_numeric = int(cleaned_str) * 10000000
    elif 'Lac' in amount_str:
        cleaned_str = ''.join(filter(str.isdigit, amount_str))
        amount_numeric = int(cleaned_str) * 100000
    elif 'Thou' in amount_str:
        cleaned_str = ''.join(filter(str.isdigit, amount_str))
        amount_numeric = int(cleaned_str) * 1000 
    elif 'Hund' in amount_str:
        cleaned_str = ''.join(filter(str.isdigit, amount_str))
        amount_numeric = int(cleaned_str) * 100
    else:
        amount_numeric = amount_str
    return int(amount_numeric)

df['Total Assets'] = df['Total Assets'].apply(preprocess_amount)
df['Liabilities'] = df['Liabilities'].apply(preprocess_amount)
df_test['Total Assets'] = df_test['Total Assets'].apply(preprocess_amount)
df_test['Liabilities'] = df_test['Liabilities'].apply(preprocess_amount)

In [5]:
columns_too_dummy = ['Party','state']
d = df.drop(['Education'],axis = 'columns')
d = pd.get_dummies(d,columns = columns_too_dummy, drop_first=True)
df_test = pd.get_dummies(df_test,columns = columns_too_dummy, drop_first=True)
X_train = d
y_train = df['Education']
X_train.describe()

Unnamed: 0,Criminal Case,Total Assets,Liabilities
count,2059.0,2059.0,2059.0
mean,1.777562,115599100.0,21590680.0
std,4.762183,492246500.0,204861200.0
min,0.0,0.0,0.0
25%,0.0,9500000.0,48500.0
50%,0.0,20000000.0,1800000.0
75%,2.0,80000000.0,8900000.0
max,87.0,12670000000.0,8810000000.0


In [6]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

X_train[['Total Assets', 'Liabilities', 'Criminal Case']] = scaler.fit_transform(X_train[['Total Assets', 'Liabilities', 'Criminal Case']])
df_test[['Total Assets', 'Liabilities', 'Criminal Case']] = scaler.transform(df_test[['Total Assets', 'Liabilities', 'Criminal Case']])


In [7]:
# Calculate mean and standard deviation for each column
mean_cc = X_train['Criminal Case'].mean()
std_cc = X_train['Criminal Case'].std()

mean_assets = X_train['Total Assets'].mean()
std_assets = X_train['Total Assets'].std()

mean_liabilities = X_train['Liabilities'].mean()
std_liabilities = X_train['Liabilities'].std()

# Binarize each column into four new columns based on value ranges
X_train['Criminal Case < mean-std'] = np.where(X_train['Criminal Case'] < (mean_cc - std_cc), 1, 0)
X_train['Criminal Case mean-std to mean'] = np.where((X_train['Criminal Case'] >= (mean_cc - std_cc)) & (X_train['Criminal Case'] < mean_cc), 1, 0)
X_train['Criminal Case mean to mean+std'] = np.where((X_train['Criminal Case'] >= mean_cc) & (X_train['Criminal Case'] < (mean_cc + std_cc)), 1, 0)
X_train['Criminal Case > mean+std'] = np.where(X_train['Criminal Case'] >= (mean_cc + std_cc), 1, 0)

X_train['Total Assets < mean-std'] = np.where(X_train['Total Assets'] < (mean_assets - std_assets), 1, 0)
X_train['Total Assets mean-std to mean'] = np.where((X_train['Total Assets'] >= (mean_assets - std_assets)) & (X_train['Total Assets'] < mean_assets), 1, 0)
X_train['Total Assets mean to mean+std'] = np.where((X_train['Total Assets'] >= mean_assets) & (X_train['Total Assets'] < (mean_assets + std_assets)), 1, 0)
X_train['Total Assets > mean+std'] = np.where(X_train['Total Assets'] >= (mean_assets + std_assets), 1, 0)

X_train['Liabilities < mean-std'] = np.where(X_train['Liabilities'] < (mean_liabilities - std_liabilities), 1, 0)
X_train['Liabilities mean-std to mean'] = np.where((X_train['Liabilities'] >= (mean_liabilities - std_liabilities)) & (X_train['Liabilities'] < mean_liabilities), 1, 0)
X_train['Liabilities mean to mean+std'] = np.where((X_train['Liabilities'] >= mean_liabilities) & (X_train['Liabilities'] < (mean_liabilities + std_liabilities)), 1, 0)
X_train['Liabilities > mean+std'] = np.where(X_train['Liabilities'] >= (mean_liabilities + std_liabilities), 1, 0)

# Drop the original columns
X_train.drop(['Criminal Case', 'Total Assets', 'Liabilities'], axis=1, inplace=True)

In [8]:
# Calculate mean and standard deviation for each column
mean_cc = df_test['Criminal Case'].mean()
std_cc = df_test['Criminal Case'].std()

mean_assets = df_test['Total Assets'].mean()
std_assets = df_test['Total Assets'].std()

mean_liabilities = df_test['Liabilities'].mean()
std_liabilities = df_test['Liabilities'].std()

# Binarize each column into four new columns based on value ranges
df_test['Criminal Case < mean-std'] = np.where(df_test['Criminal Case'] < (mean_cc - std_cc), 1, 0)
df_test['Criminal Case mean-std to mean'] = np.where((df_test['Criminal Case'] >= (mean_cc - std_cc)) & (df_test['Criminal Case'] < mean_cc), 1, 0)
df_test['Criminal Case mean to mean+std'] = np.where((df_test['Criminal Case'] >= mean_cc) & (df_test['Criminal Case'] < (mean_cc + std_cc)), 1, 0)
df_test['Criminal Case > mean+std'] = np.where(df_test['Criminal Case'] >= (mean_cc + std_cc), 1, 0)

df_test['Total Assets < mean-std'] = np.where(df_test['Total Assets'] < (mean_assets - std_assets), 1, 0)
df_test['Total Assets mean-std to mean'] = np.where((df_test['Total Assets'] >= (mean_assets - std_assets)) & (df_test['Total Assets'] < mean_assets), 1, 0)
df_test['Total Assets mean to mean+std'] = np.where((df_test['Total Assets'] >= mean_assets) & (df_test['Total Assets'] < (mean_assets + std_assets)), 1, 0)
df_test['Total Assets > mean+std'] = np.where(df_test['Total Assets'] >= (mean_assets + std_assets), 1, 0)

df_test['Liabilities < mean-std'] = np.where(df_test['Liabilities'] < (mean_liabilities - std_liabilities), 1, 0)
df_test['Liabilities mean-std to mean'] = np.where((df_test['Liabilities'] >= (mean_liabilities - std_liabilities)) & (df_test['Liabilities'] < mean_liabilities), 1, 0)
df_test['Liabilities mean to mean+std'] = np.where((df_test['Liabilities'] >= mean_liabilities) & (df_test['Liabilities'] < (mean_liabilities + std_liabilities)), 1, 0)
df_test['Liabilities > mean+std'] = np.where(df_test['Liabilities'] >= (mean_liabilities + std_liabilities), 1, 0)

# Drop the original columns
df_test.drop(['Criminal Case', 'Total Assets', 'Liabilities'], axis=1, inplace=True)

In [9]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0],
    'binarize': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    'fit_prior': [True, False],
    'class_prior': [None, [0.3, 0.7]]
}

bnb = BernoulliNB()
grid_search = GridSearchCV(bnb, param_grid, cv=15, scoring='f1_weighted')
grid_search.fit(X_train, y_train)

grid_search.best_params_

1080 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\vlskris\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\vlskris\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vlskris\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\naive_bayes.py", line 762, in fit
    self._update_class_log_pri

{'alpha': 5.0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': False}

In [10]:
# Create DataFrame for 'ID' column
df_id = pd.DataFrame({'ID': df_test1['ID']})

# Create DataFrame for 'Education' column with predictions
df_education = pd.DataFrame({'Education':grid_search.predict(df_test) })

# Concatenate 'ID' and 'Education' DataFrames
df_result = pd.concat([df_id, df_education], axis=1)
df_result.to_csv('output_final_kaggle.csv', index=False)