In [21]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [8]:
model_train_data = pd.read_csv('data/train.csv')
model_test_data = pd.read_csv('data/test.csv')

def convert_values(value):
    if isinstance(value, str) and 'Hund+' in value:
        return int(value.replace('Hund+', '')) * 100
    elif isinstance(value, str) and 'Thou+' in value:
        return int(value.replace('Thou+', '')) * 1000
    elif isinstance(value, str) and 'Lac+' in value:
        return int(value.replace('Lac+', '')) * 100000
    elif isinstance(value, str) and 'Crore+' in value:
        return int(value.replace('Crore+', '')) * 1000000
    else:
        return int(value)

model_train_data['Total Assets'] = model_train_data['Total Assets'].apply(convert_values)
model_train_data['Liabilities'] = model_train_data['Liabilities'].apply(convert_values)

model_test_data['Total Assets'] = model_test_data['Total Assets'].apply(convert_values)
model_test_data['Liabilities'] = model_test_data['Liabilities'].apply(convert_values)

model_train_data['Candidate'] = model_train_data['Candidate'].str.split().str[-1]
model_test_data['Candidate'] = model_test_data['Candidate'].str.split().str[-1]

In [9]:
print(model_train_data.columns)
print(model_train_data['Education'].unique())
print(model_train_data['Party'].unique())

Index(['ID', 'Candidate', 'Constituency ∇', 'Party', 'Criminal Case',
       'Total Assets', 'Liabilities', 'state', 'Education'],
      dtype='object')
['8th Pass' '12th Pass' 'Post Graduate' 'Graduate Professional' 'Graduate'
 '10th Pass' 'Others' 'Doctorate' 'Literate' '5th Pass']
['DMK' 'BJP' 'INC' 'AITC' 'AAP' 'SP' 'NPP' 'BJD' 'IND' 'SHS' 'RJD' 'YSRCP'
 'AIADMK' 'CPI(M)' 'NCP' 'TDP' 'NDPP' 'CPI' 'Sikkim Krantikari Morcha'
 'JD(U)' 'JMM' 'JD(S)' 'Tipra Motha Party']


In [10]:
label_encoder = LabelEncoder()

model_train_data['Party'] = label_encoder.fit_transform(model_train_data['Party'])
model_train_data['state'] = label_encoder.fit_transform(model_train_data['state'])
model_train_data['Candidate'] = label_encoder.fit_transform(model_train_data['Candidate'])

model_test_data['Party'] = label_encoder.fit_transform(model_test_data['Party'])
model_test_data['state'] = label_encoder.fit_transform(model_test_data['state'])
model_test_data['Candidate'] = label_encoder.fit_transform(model_test_data['Candidate'])

In [26]:
print(model_train_data['Education'].unique())
print(np.unique(model_train_data['Party']))
print(np.unique(model_train_data['state']))
print(np.unique(model_train_data['Candidate']))
print(np.unique(model_train_data['Total Assets']))


['8th Pass' '12th Pass' 'Post Graduate' 'Graduate Professional' 'Graduate'
 '10th Pass' 'Others' 'Doctorate' 'Literate' '5th Pass']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27]
[   0    1    2 ... 1213 1214 1215]
[         0      15000      18000      24000      30000      51000
      72000      73000     100000     200000     300000     400000
     500000     600000     800000     900000    1000000    1100000
    1200000    1300000    1400000    1500000    1600000    1700000
    1800000    1900000    2000000    2100000    2200000    2300000
    2400000    2500000    2600000    2700000    2800000    2900000
    3000000    3100000    3200000    3300000    3400000    3500000
    3600000    3700000    3800000    3900000    4000000    4100000
    4200000    4300000    4400000    4500000    4600000    4700000
    4900000    5000000    5100000    5200000    5300000    5400000
    

In [11]:
model_features = ['Criminal Case', 'Total Assets', 'Liabilities', 'Party', 'state', 'Candidate']

sub_X = model_test_data[model_features]
X = model_train_data[model_features]
y = model_train_data['Education']

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)

In [12]:
rf_classifer = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=25)
rf_classifer.fit(train_X, train_y)

In [16]:
# Make predictions on the test set
y_pred = rf_classifer.predict(test_X)

# Evaluate the model
print(classification_report(test_y, y_pred, zero_division=1))

                       precision    recall  f1-score   support

            10th Pass       0.13      0.06      0.09        63
            12th Pass       0.20      0.15      0.17        86
             5th Pass       1.00      0.00      0.00         1
             8th Pass       0.25      0.04      0.07        24
            Doctorate       0.00      0.00      1.00        14
             Graduate       0.29      0.43      0.35       136
Graduate Professional       0.22      0.19      0.20        86
             Literate       0.00      0.00      1.00         3
               Others       1.00      0.00      0.00         8
        Post Graduate       0.19      0.28      0.22        94

             accuracy                           0.23       515
            macro avg       0.33      0.11      0.31       515
         weighted avg       0.23      0.23      0.24       515



In [14]:
# # Define the parameter grid
# param_grid = {
#     'n_estimators': [10, 25, 50, 100, 150, 200],
#     'max_depth': [None, 10, 20, 30, 40],
#     'min_samples_split': [2, 5, 10, 15]
# }

# # Initialize the RandomForestClassifier
# rf = RandomForestClassifier(random_state=0)

# # Initialize the GridSearchCV
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

# # Fit the GridSearchCV
# grid_search.fit(train_X, train_y)

# # Get the best parameters
# best_params = grid_search.best_params_

# # Train the model using the best parameters
# rf_best = RandomForestClassifier(**best_params, random_state=0)
# rf_best.fit(train_X, train_y)

# # Now you can use rf_best to make predictions
# print(classification_report(test_y,  rf_best.predict(test_X), zero_division=1))

KeyboardInterrupt: 

In [15]:
# Forming submission

sub_predictions = rf_classifer.predict(sub_X)
sub_df = pd.DataFrame(sub_predictions, columns=['Education'])
sub_df.insert(0, 'ID', range(0, len(sub_df)))
sub_df.to_csv('submission.csv', index=False)