In [587]:
import pandas as pd
import sklearn as sk
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.metrics import classification_report
import random

In [588]:
# Load the raw election results table from the CSV that sits next to this notebook.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of types.
# NOTE(review): the saved output of this cell echoes this statement the way a
# Python warning does; presumably that was pandas' mixed-type DtypeWarning --
# confirm by re-running the cell.
data = pd.read_csv('./All_States_GE.csv', low_memory=False)

  data = pd.read_csv('./All_States_GE.csv')


In [589]:
# Columns carried forward into the modelling frame.
subsets = ['State_Name', 'Turncoat', 'Party', 'No_Terms']

# Build the frame in one chain: select the columns, discard every row that is
# missing any of them, then store the Turncoat flag as an integer.
subsetDf = (
    data[subsets]
    .dropna(subset=subsets)
    .assign(Turncoat=lambda frame: frame['Turncoat'].astype(int))
)
subsetDf

Unnamed: 0,State_Name,Turncoat,Party,No_Terms,TCPD_Prof_Main
0,Andaman_&_Nicobar_Islands,0,INC,1.0,Business
1,Andaman_&_Nicobar_Islands,0,BJP,0.0,Liberal Profession or Professional
2,Andaman_&_Nicobar_Islands,0,IND,0.0,Agriculture
3,Andaman_&_Nicobar_Islands,0,AAAP,0.0,Business
4,Andaman_&_Nicobar_Islands,0,BSP,0.0,Social Work
...,...,...,...,...,...
31544,West_Bengal,0,INC,0.0,Other
31545,West_Bengal,0,BJP,0.0,Other
31546,West_Bengal,0,IND,0.0,Other
31547,West_Bengal,0,JD(S),0.0,Other


In [590]:
# Work on an independent (deep) copy so that the in-place column encoding done
# on subsetDf2 later leaves subsetDf with its original string values.
subsetDf2 = subsetDf.copy(deep=True)
subsetDf2


Unnamed: 0,State_Name,Turncoat,Party,No_Terms,TCPD_Prof_Main
0,Andaman_&_Nicobar_Islands,0,INC,1.0,Business
1,Andaman_&_Nicobar_Islands,0,BJP,0.0,Liberal Profession or Professional
2,Andaman_&_Nicobar_Islands,0,IND,0.0,Agriculture
3,Andaman_&_Nicobar_Islands,0,AAAP,0.0,Business
4,Andaman_&_Nicobar_Islands,0,BSP,0.0,Social Work
...,...,...,...,...,...
31544,West_Bengal,0,INC,0.0,Other
31545,West_Bengal,0,BJP,0.0,Other
31546,West_Bengal,0,IND,0.0,Other
31547,West_Bengal,0,JD(S),0.0,Other


In [591]:
# Names of the columns whose dtype is `object`, in column order; these are the
# ones handed to the label encoder.
tobeEncoded = [column for column, kind in subsetDf2.dtypes.items() if kind == 'object']
tobeEncoded

['State_Name', 'Party', 'TCPD_Prof_Main']

In [592]:
# Replace every object-dtype column with integer codes, in place on subsetDf2.
# Each column gets its own LabelEncoder, kept in `encoders` under the column
# name, so the codes can later be mapped back with
# encoders[col].inverse_transform(...). Refitting one shared encoder would
# overwrite the mapping of every column except the last.
# `encoder` is still bound to the last column's fitted encoder after the loop.
# NOTE(review): the codes follow the sorted order of the labels, which carries no
# meaning for nominal columns; distance- and linear-based models will still treat
# them as ordered numbers -- consider one-hot encoding.
encoders = {}
for x in tobeEncoded:
    encoder = preprocessing.LabelEncoder()
    subsetDf2[x] = encoder.fit_transform(subsetDf2[x])
    encoders[x] = encoder

In [593]:
# Seed for the estimators that accept one and for the resampling / splitting
# below, so that Restart & Run All reproduces the same reports.
SEED = 42

# (display name, estimator) pairs to compare. Estimators are refit from scratch
# on every iteration because fit() discards any previous fit.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(random_state=SEED)),
    ('Random Forest', RandomForestClassifier(random_state=SEED)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=SEED)),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB())
]

# Length of the longest display name; used to pad the progress-bar labels so
# the running accuracy lines up across models.
maxLength = max((len(label) for label, _estimator in models), default=0)

# Number of resample / split / fit / predict rounds per model.
ITERATIONS = 100

# Row count of the smaller Turncoat class. Every round draws this many rows from
# each class, giving a balanced frame. It does not change between rounds, so it
# is computed once here instead of inside the loop.
minorityCount = int(subsetDf2['Turncoat'].value_counts().min())

for name, model in models:
    spaces = ' ' * (maxLength - len(name))
    # Restart the seed stream for every model: all models are then evaluated on
    # the same sequence of balanced samples and train/test splits.
    rng = random.Random(SEED)
    values = []          # per-round accuracy
    avgAcc = 0           # running SUM of accuracies (divided by `total` for display)
    total = 0            # rounds completed so far
    predictedVals = []   # predictions pooled over all rounds
    actualVals = []      # matching true labels pooled over all rounds
    bar = tqdm(range(ITERATIONS), desc=f"{name}{spaces} | {0:.3f}")
    for i in bar:
        # Balanced undersample: the same number of rows from each Turncoat class.
        # DataFrameGroupBy.sample keeps the grouping column in the result, which
        # groupby(...).apply(...) no longer guarantees in recent pandas.
        subsetDf3 = subsetDf2.groupby('Turncoat').sample(
            n=minorityCount, random_state=rng.randrange(2**32)
        )

        X = subsetDf3.drop('Turncoat', axis=1)
        Y = subsetDf3['Turncoat']

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=rng.randrange(2**32)
        )

        model.fit(X_train, Y_train)

        predictions = model.predict(X_test)
        accuracy = accuracy_score(Y_test, predictions)
        predictedVals.extend(predictions)
        actualVals.extend(Y_test)
        values.append(accuracy)
        avgAcc += accuracy
        total += 1
        # Refresh the label AFTER this round is counted, so the value left on
        # the finished bar is the mean over all rounds rather than all but the last.
        bar.set_description(f"{name}{spaces} | {avgAcc/total:.3f}")

    # Precision / recall / F1 over the predictions pooled from every round.
    print(classification_report(actualVals, predictedVals))



Logistic Regression | 0.671: 100%|██████████| 100/100 [00:00<00:00, 122.88it/s]


              precision    recall  f1-score   support

           0       0.63      0.86      0.72     16427
           1       0.77      0.48      0.59     16173

    accuracy                           0.67     32600
   macro avg       0.70      0.67      0.66     32600
weighted avg       0.70      0.67      0.66     32600



Decision Tree       | 0.689: 100%|██████████| 100/100 [00:00<00:00, 174.05it/s]


              precision    recall  f1-score   support

           0       0.68      0.70      0.69     16291
           1       0.70      0.67      0.68     16309

    accuracy                           0.69     32600
   macro avg       0.69      0.69      0.69     32600
weighted avg       0.69      0.69      0.69     32600



Random Forest       | 0.711:  20%|██        | 20/100 [00:02<00:09,  8.07it/s]


KeyboardInterrupt: 