In [1]:
import pandas as pd
import sklearn as sk
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.metrics import classification_report
import random
import time

# Fixed seed for the stdlib `random` module, which supplies the `random_state`
# values used for the train/test splits further down. Seeding from the clock
# (the previous behaviour) made every run produce different numbers, so the
# reported accuracies could not be reproduced.
SEED = 42
random.seed(SEED)

In [2]:
# Load the raw election results. `low_memory=False` makes pandas infer each
# column's dtype from the whole file rather than chunk by chunk, which avoids
# the mixed-type DtypeWarning that chunked parsing emits for columns whose
# values look numeric in some chunks and textual in others.
data = pd.read_csv('./All_States_GE.csv', low_memory=False)

  data = pd.read_csv('./All_States_GE.csv')


In [3]:
# Columns carried into the analysis: the target (Turncoat) plus three features.
subsets = ['State_Name', 'Turncoat', 'Party', 'No_Terms']

# Select those columns, discard rows with a missing value in any of them,
# and cast the Turncoat flag to int -- all in one chain.
subsetDf = (
    data[subsets]
    .dropna(subset=subsets)
    .assign(Turncoat=lambda frame: frame['Turncoat'].astype(int))
)
subsetDf

Unnamed: 0,State_Name,Turncoat,Party,No_Terms
0,Andaman_&_Nicobar_Islands,0,INC,1.0
1,Andaman_&_Nicobar_Islands,0,BJP,0.0
2,Andaman_&_Nicobar_Islands,0,IND,0.0
3,Andaman_&_Nicobar_Islands,0,AAAP,0.0
4,Andaman_&_Nicobar_Islands,0,BSP,0.0
...,...,...,...,...
91664,Mysore,0,SWA,0.0
91665,Uttar_Pradesh,0,INC,1.0
91666,Uttar_Pradesh,0,IND,0.0
91667,Uttar_Pradesh,0,IND,0.0


In [4]:
# Independent (deep) copy of the cleaned subset; changes made to subsetDf2
# do not propagate back to subsetDf.
subsetDf2 = subsetDf.copy(deep=True)
subsetDf2


Unnamed: 0,State_Name,Turncoat,Party,No_Terms
0,Andaman_&_Nicobar_Islands,0,INC,1.0
1,Andaman_&_Nicobar_Islands,0,BJP,0.0
2,Andaman_&_Nicobar_Islands,0,IND,0.0
3,Andaman_&_Nicobar_Islands,0,AAAP,0.0
4,Andaman_&_Nicobar_Islands,0,BSP,0.0
...,...,...,...,...
91664,Mysore,0,SWA,0.0
91665,Uttar_Pradesh,0,INC,1.0
91666,Uttar_Pradesh,0,IND,0.0
91667,Uttar_Pradesh,0,IND,0.0


In [5]:
# Names of the object-dtype columns, i.e. the ones that need to be
# converted to numbers before they can be fed to a model.
tobeEncoded = [
    column
    for column, dtype in subsetDf2.dtypes.items()
    if dtype == 'object'
]
tobeEncoded

['State_Name', 'Party']

In [6]:
# Integer-encode every categorical column of subsetDf2 in place.
#
# One fitted LabelEncoder is kept per column in `encoders`. Re-fitting a single
# shared encoder on each column in turn leaves it remembering only the classes
# of the last column, so the codes of the earlier columns could no longer be
# mapped back to their labels with `inverse_transform`.
encoders = {}
encoder = preprocessing.LabelEncoder()  # stays bound even if tobeEncoded is empty
for x in tobeEncoded:
    encoder = preprocessing.LabelEncoder()
    subsetDf2[x] = encoder.fit_transform(subsetDf2[x])
    encoders[x] = encoder

In [10]:
# Correlation between the Turncoat flag and the number of terms served
# (Series.corr with its default method).
corrDF = subsetDf2[['Turncoat', 'No_Terms']].copy()

termsColumn = corrDF['No_Terms']
termsColumn.corr(corrDF['Turncoat'])

0.21068799798532806

In [615]:
# Candidate classifiers, all with scikit-learn's default hyper-parameters.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB())
]

# Length of the longest model name; used to pad the progress-bar labels so the
# accuracy column lines up across models.
maxLength = max((len(name) for name, _ in models), default=0)

# Number of independent resample / split / fit / predict rounds per model.
ITERATIONS = 100

# Size of the smaller Turncoat class. Every round draws this many rows from
# each class so the evaluated sample is balanced. It does not change between
# rounds, so it is computed once instead of inside the loop.
minorityCount = subsetDf2['Turncoat'].value_counts().min()

for name, model in models:
    spaces = ' ' * (maxLength - len(name))
    values = []          # per-round accuracies for the current model
    avgAcc = 0           # running SUM of accuracies (divided by `total` for display)
    total = 0            # number of completed rounds
    predictedVals = []   # predictions pooled over all rounds
    actualVals = []      # matching true labels pooled over all rounds

    bar = tqdm(range(ITERATIONS))
    bar.set_description(f"{name}{spaces} | {avgAcc / max(total, 1):.3f}")
    for i in bar:
        # Balanced under-sample: `minorityCount` rows from each Turncoat class.
        # GroupBy.sample returns rows of the original frame, so the 'Turncoat'
        # column is still present afterwards (groupby.apply on the grouping
        # column is deprecated in recent pandas). The random_state is drawn
        # from the `random` module so the draw follows that module's seed.
        subsetDf3 = subsetDf2.groupby('Turncoat').sample(
            n=minorityCount,
            random_state=random.randint(0, 2**32 - 1),
        )

        X = subsetDf3.drop('Turncoat', axis=1)
        Y = subsetDf3['Turncoat']

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=random.randint(0, 1000)
        )

        # fit() retrains the estimator from scratch on this round's split.
        model.fit(X_train, Y_train)

        predictions = model.predict(X_test)
        accuracy = accuracy_score(Y_test, predictions)
        predictedVals.extend(predictions)
        actualVals.extend(Y_test)
        values.append(accuracy)
        avgAcc += accuracy
        total += 1

        # Refresh the label AFTER the round so the finished bar shows the mean
        # over all completed rounds, including the last one.
        bar.set_description(f"{name}{spaces} | {avgAcc / max(total, 1):.3f}")

    # Precision / recall / F1 over the predictions pooled from every round.
    print(classification_report(actualVals, predictedVals))



Logistic Regression | 0.751: 100%|██████████| 100/100 [00:02<00:00, 44.95it/s]


              precision    recall  f1-score   support

           0       0.70      0.88      0.78     57771
           1       0.84      0.62      0.71     57729

    accuracy                           0.75    115500
   macro avg       0.77      0.75      0.75    115500
weighted avg       0.77      0.75      0.75    115500



Decision Tree       | 0.791: 100%|██████████| 100/100 [00:00<00:00, 100.58it/s]


              precision    recall  f1-score   support

           0       0.80      0.78      0.79     57887
           1       0.79      0.80      0.79     57613

    accuracy                           0.79    115500
   macro avg       0.79      0.79      0.79    115500
weighted avg       0.79      0.79      0.79    115500



Random Forest       | 0.797: 100%|██████████| 100/100 [00:21<00:00,  4.67it/s]


              precision    recall  f1-score   support

           0       0.82      0.76      0.79     57816
           1       0.78      0.84      0.80     57684

    accuracy                           0.80    115500
   macro avg       0.80      0.80      0.80    115500
weighted avg       0.80      0.80      0.80    115500



Gradient Boosting   | 0.804: 100%|██████████| 100/100 [00:14<00:00,  6.93it/s]


              precision    recall  f1-score   support

           0       0.85      0.73      0.79     57499
           1       0.77      0.88      0.82     58001

    accuracy                           0.80    115500
   macro avg       0.81      0.80      0.80    115500
weighted avg       0.81      0.80      0.80    115500



SVM                 | 0.634: 100%|██████████| 100/100 [00:50<00:00,  1.98it/s]


              precision    recall  f1-score   support

           0       0.60      0.77      0.68     57709
           1       0.68      0.50      0.58     57791

    accuracy                           0.63    115500
   macro avg       0.64      0.63      0.63    115500
weighted avg       0.64      0.63      0.63    115500



KNN                 | 0.787: 100%|██████████| 100/100 [00:02<00:00, 37.50it/s]


              precision    recall  f1-score   support

           0       0.82      0.74      0.78     57602
           1       0.76      0.84      0.80     57898

    accuracy                           0.79    115500
   macro avg       0.79      0.79      0.79    115500
weighted avg       0.79      0.79      0.79    115500



Naive Bayes         | 0.667: 100%|██████████| 100/100 [00:00<00:00, 140.89it/s]


              precision    recall  f1-score   support

           0       0.62      0.88      0.72     57663
           1       0.79      0.46      0.58     57837

    accuracy                           0.67    115500
   macro avg       0.70      0.67      0.65    115500
weighted avg       0.70      0.67      0.65    115500

