In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
model_train_data = pd.read_csv('data/train.csv')
model_test_data = pd.read_csv('data/test.csv')

def convert_values(value):
    if isinstance(value, str) and 'Hund+' in value:
        return int(value.replace('Hund+', '')) * 100
    elif isinstance(value, str) and 'Thou+' in value:
        return int(value.replace('Thou+', '')) * 1000
    elif isinstance(value, str) and 'Lac+' in value:
        return int(value.replace('Lac+', '')) * 100000
    elif isinstance(value, str) and 'Crore+' in value:
        return int(value.replace('Crore+', '')) * 1000000
    else:
        return int(value)

model_train_data['Total Assets'] = model_train_data['Total Assets'].apply(convert_values)
model_train_data['Liabilities'] = model_train_data['Liabilities'].apply(convert_values)

model_test_data['Total Assets'] = model_test_data['Total Assets'].apply(convert_values)
model_test_data['Liabilities'] = model_test_data['Liabilities'].apply(convert_values)

model_train_data['Candidate'] = model_train_data['Candidate'].str.split().str[-1]
model_test_data['Candidate'] = model_test_data['Candidate'].str.split().str[-1]

In [3]:
print(model_train_data.columns)
print(model_train_data['Education'].unique())
print(model_train_data['Party'].unique())

Index(['ID', 'Candidate', 'Constituency ∇', 'Party', 'Criminal Case',
       'Total Assets', 'Liabilities', 'state', 'Education'],
      dtype='object')
['8th Pass' '12th Pass' 'Post Graduate' 'Graduate Professional' 'Graduate'
 '10th Pass' 'Others' 'Doctorate' 'Literate' '5th Pass']
['DMK' 'BJP' 'INC' 'AITC' 'AAP' 'SP' 'NPP' 'BJD' 'IND' 'SHS' 'RJD' 'YSRCP'
 'AIADMK' 'CPI(M)' 'NCP' 'TDP' 'NDPP' 'CPI' 'Sikkim Krantikari Morcha'
 'JD(U)' 'JMM' 'JD(S)' 'Tipra Motha Party']


In [4]:
label_encoder = LabelEncoder()

model_train_data['Party'] = label_encoder.fit_transform(model_train_data['Party'])
model_train_data['state'] = label_encoder.fit_transform(model_train_data['state'])
model_train_data['Candidate'] = label_encoder.fit_transform(model_train_data['Candidate'])

model_test_data['Party'] = label_encoder.fit_transform(model_test_data['Party'])
model_test_data['state'] = label_encoder.fit_transform(model_test_data['state'])
model_test_data['Candidate'] = label_encoder.fit_transform(model_test_data['Candidate'])

In [5]:
print(model_train_data['Education'].unique())
print(np.unique(model_train_data['Party']))
print(np.unique(model_train_data['state']))
print(np.unique(model_train_data['Candidate']))
print(np.unique(model_train_data['Total Assets']))


['8th Pass' '12th Pass' 'Post Graduate' 'Graduate Professional' 'Graduate'
 '10th Pass' 'Others' 'Doctorate' 'Literate' '5th Pass']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27]
[   0    1    2 ... 1213 1214 1215]
[         0      15000      18000      24000      30000      51000
      72000      73000     100000     200000     300000     400000
     500000     600000     800000     900000    1000000    1100000
    1200000    1300000    1400000    1500000    1600000    1700000
    1800000    1900000    2000000    2100000    2200000    2300000
    2400000    2500000    2600000    2700000    2800000    2900000
    3000000    3100000    3200000    3300000    3400000    3500000
    3600000    3700000    3800000    3900000    4000000    4100000
    4200000    4300000    4400000    4500000    4600000    4700000
    4900000    5000000    5100000    5200000    5300000    5400000
    

In [6]:
model_features = ['Criminal Case', 'Total Assets', 'Liabilities', 'Party', 'state', 'Candidate']

sub_X = model_test_data[model_features]
X = model_train_data[model_features]
y = model_train_data['Education']

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)

In [7]:
rf_classifer = RandomForestClassifier(n_estimators=10, random_state=0, max_depth=40, min_samples_split=10)
rf_classifer.fit(train_X, train_y)

In [8]:
# Make predictions on the test set
y_pred = rf_classifer.predict(test_X)

# Evaluate the model
print(classification_report(test_y, y_pred, zero_division=1))

                       precision    recall  f1-score   support

            10th Pass       0.13      0.05      0.07        63
            12th Pass       0.14      0.12      0.12        86
             5th Pass       1.00      0.00      0.00         1
             8th Pass       0.00      0.00      1.00        24
            Doctorate       0.00      0.00      1.00        14
             Graduate       0.28      0.40      0.33       136
Graduate Professional       0.23      0.20      0.21        86
             Literate       1.00      0.00      0.00         3
               Others       1.00      0.00      0.00         8
        Post Graduate       0.19      0.30      0.23        94

             accuracy                           0.22       515
            macro avg       0.40      0.11      0.30       515
         weighted avg       0.21      0.22      0.27       515



# Finding the best f1 score

In [9]:
# Define the parameter grid
param_grid = {
    'n_estimators': [10, 25, 50, 100, 125, 150, 175, 200],
    'max_depth': [None, 10, 15, 20, 25, 30, 35, 40, 50],
    'min_samples_split': [2, 5, 7, 10, 12, 15, 18, 20]
}

data = []
for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=0)
            rf.fit(train_X, train_y)
            y_pred_rf = rf.predict(test_X)
            report = classification_report(test_y, y_pred_rf, zero_division=1, output_dict=True)
            f1_score_weighted = report['weighted avg']['f1-score']
            f1_score_macro = report['macro avg']['f1-score']
            data.append([n_estimators, max_depth, min_samples_split, f1_score_weighted, f1_score_macro])

df_rf = pd.DataFrame(data, columns=['n_estimators', 'max_depth', 'min_samples_split', 'f1_score', 'macro avg'])
df_rf = df_rf.sort_values(by='f1_score', ascending=False)
print(df_rf)

     n_estimators  max_depth  min_samples_split  f1_score  macro avg
93             25       15.0                 15  0.373939   0.297342
82             25       10.0                  7  0.367765   0.293364
87             25       10.0                 20  0.364396   0.292739
84             25       10.0                 12  0.350128   0.283384
83             25       10.0                 10  0.350046   0.284337
..            ...        ...                ...       ...        ...
12             10       10.0                 12  0.222800   0.184075
163            50       15.0                 10  0.222421   0.185860
40             10       30.0                  2  0.215025   0.221025
104            25       25.0                  2  0.206498   0.215895
80             25       10.0                  2  0.190957   0.092107

[576 rows x 5 columns]


In [10]:
# Forming submission

sub_predictions = rf_classifer.predict(sub_X)
sub_df = pd.DataFrame(sub_predictions, columns=['Education'])
sub_df.insert(0, 'ID', range(0, len(sub_df)))
sub_df.to_csv('submission.csv', index=False)