In [32]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

#plotting
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style="ticks", color_codes=True)

#preprocessing
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN

#modelling
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, svm

#metrics
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


In [33]:
#import dataset, add hyperparameters
multiclass_df = pd.read_csv('/Users/agar/_METIS/exercises/Project_3/data_source/engineered_data')

new_features_df = multiclass_df

new_features_df["super_left_better"] = multiclass_df['asian_pop']*multiclass_df['asian_vote']*multiclass_df['white_bachelors']
new_features_df["super_param"] = multiclass_df['latino_adult_percent']*multiclass_df['latino_pop']*multiclass_df['other_pop']*multiclass_df['state_fips']*multiclass_df['white_bachelors']*multiclass_df['bachelors']*multiclass_df['income_median']*multiclass_df['metro1_percent_pop']*multiclass_df['other_pop']
new_features_df["super_param_2"] = new_features_df["super_param"]**2
new_features_df["super_left"] = multiclass_df['asian_pop']*multiclass_df['asian_vote']*multiclass_df['asian_adult_percent']*multiclass_df['latino_vote']*multiclass_df['white_bachelors']*multiclass_df['larg1_percent_pop']
new_features_df["super_right"] = multiclass_df['white_adult_percent']*multiclass_df['white_pop']*multiclass_df['white_vote']*multiclass_df['white_no_college']*multiclass_df['metro_none_percent_pop']*multiclass_df['metro3_percent_pop']*multiclass_df['metro2_percent_pop']*multiclass_df['native_adult_percent']*multiclass_df['native_pop']*multiclass_df['native_vote']
new_features_df["super_middle"] = multiclass_df['black_adult_percent']*multiclass_df['black_pop']*multiclass_df['black_vote']

financials_df = pd.read_csv('/Users/agar/_METIS/exercises/Project_3/financials_df.csv')

mask_years = new_features_df["year"] != 2010 
new_features_years = new_features_df[mask_years]
new_features_years['ID_DIST'] = new_features_years['year'].astype(str) + "-" + new_features_years['district_id']

finance_demo = pd.merge(new_features_years, financials_df, how='left', on=['ID_DIST'])
finance_demo = finance_demo.dropna()

prev_party_df = pd.read_csv("/Users/agar/_METIS/exercises/Project_3/prev_party_id.csv")
finance_demo_prev_part = pd.merge(finance_demo, prev_party_df, how='left', left_on=['year_x', "district_id_x"], right_on=['year', "district_id"])
finance_demo_prev_part = finance_demo_prev_part.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [34]:
#create test and train
mask = finance_demo_prev_part["year_x"] == 2018
holdout = finance_demo_prev_part[mask]

mask_2 = finance_demo_prev_part["year"] != 2018
train = finance_demo_prev_part[mask_2]

#test: define X feature and y target 
X_holdout = holdout.drop(['ID_DIST', 'Unnamed: 0_y', 'Cand_State',
       'Cand_Office_Dist', 'year_y', 'Unnamed: 0_x', 'year_x', 'district_id_x', 'district_id_y', 'Unnamed: 0', 'district_id', 'party_change_simple', "district_id", "party", "candidate_votes", "totalvotes", "Unnamed: 0"], axis=1)
y_holdout = holdout['party_change_simple']

#training: define X feature and y target 
y_train = train['party_change_simple'].apply(np.int64)
X_train = train.drop(['ID_DIST', 'Unnamed: 0_y', 'Cand_State',
       'Cand_Office_Dist', 'year_y', 'Unnamed: 0_x', 'year_x', 'district_id_x', 'district_id_y', 'Unnamed: 0', 'district_id', 'party_change_simple', "district_id", "party", "candidate_votes", "totalvotes", "Unnamed: 0"], axis=1)

In [6]:
X_train, y_train = np.array(X_train), np.array(y_train)

In [24]:
from collections import Counter
print(sorted(Counter(y_train).items()))

[(0, 1097), (1, 32)]


In [28]:
#split - crossval fail
skf = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)
cv_lr_re, cv_gnb_re, cv_dtc_re, cv_rfc_re, cv_svm_re, cv_knn_re = [], [], [], [], [], [] #collect the validation results for both models
cv_lr_pre, cv_gnb_pre, cv_dtc_pre, cv_rfc_pre, cv_svm_pre, cv_knn_pre = [], [], [], [], [], []

for train_ind, val_ind in skf.split(X_train, y_train):
    
    X_tr_skf, y_tr_skf = X_train[train_ind], y_train[train_ind]
    #oversample minority class 1
    X_tr_smote, y_tr = SMOTE().fit_resample(X_tr_skf, y_tr_skf)
    #standar scale
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr_smote)
    
    X_val, y_val = X_train[val_ind], y_train[val_ind]
    X_scaled_val = scaler.transform(X_val)
    
    print(sorted(Counter(y_tr).items()))
    print(sorted(Counter(y_val).items()))
    
    #fit models
    lr=LogisticRegression(max_iter=10000)
    lr.fit(X_tr, y_tr)

    y_pred_lr=lr.predict(X_val)
    print(sorted(Counter(y_pred_lr).items()))
    recall_lr = recall_score(y_val, y_pred_lr)
    precision_lr = precision_score(y_val, y_pred_lr)
    cv_lr_re.append(recall_lr)
    cv_lr_pre.append(precision_lr)

    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(X_tr, y_tr)

    y_pred_knn=knn.predict(X_val)
    recall_knn = recall_score(y_val, y_pred_knn)
    precision_knn = precision_score(y_val, y_pred_knn)
    cv_knn_re.append(recall_knn)
    cv_knn_pre.append(precision_knn)

    gnb = GaussianNB()
    gnb.fit(X_tr, y_tr)

    y_pred_gnb=gnb.predict(X_val)
    recall_gnb = recall_score(y_val, y_pred_gnb)
    precision_gnb = precision_score(y_val, y_pred_gnb)
    cv_gnb_re.append(recall_gnb)
    cv_gnb_pre.append(precision_gnb)

    svm_ = svm.SVC(probability = True)
    svm_.fit(X_tr, y_tr)
    
    y_pred_svm=svm_.predict(X_val)
    recall_svm = recall_score(y_val, y_pred_svm)
    precision_svm = precision_score(y_val, y_pred_svm)
    cv_svm_re.append(recall_svm)
    cv_svm_pre.append(precision_svm)
    
    """
    dtc = DecisionTreeClassifier(random_state=0)
    dtc.fit(X_tr, y_tr)

    y_pred_dtc=dtc.predict(X_val)
    recall_dtc = recall_score(y_val, y_pred_dtc)
    precision_dtc = precision_score(y_val, y_pred_dtc)
    cv_dtc_re.append(recall_dtc)
    cv_dtc_pre.append(precision_dtc)
    
    rfc = RandomForestClassifier() 
    rfc.fit(X_tr, y_tr)
    
    y_pred_rfc=rfc.predict(X_val)
    recall_rfc = recall_score(y_val, y_pred_rfc)
    precision_rfc = precision_score(y_val, y_pred_rfc)
    cv_rfc_re.append(recall_rfc)
    cv_rfc_pre.append(precision_rfc)
    """


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[(0, 731), (1, 731)]
[(0, 366), (1, 11)]
[(0, 377)]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[(0, 731), (1, 731)]
[(0, 366), (1, 10)]
[(0, 376)]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[(0, 732), (1, 732)]
[(0, 365), (1, 11)]
[(0, 376)]


  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
print('LR: ','\n')
print('Simple recall scores: ', cv_lr_re)
print('Simple precision scores: ', cv_lr_pre, '\n')

print('GNB: ','\n')
print('Simple recall scores: ', cv_gnb_re)
print('Simple precision scores: ', cv_gnb_pre, '\n')

print('KNN: ','\n')
print('Simple recall scores: ', cv_knn_re)
print('Simple precision scores: ', cv_knn_pre, '\n')

print('SVM: ','\n')
print('Simple recall scores: ', cv_svm_re)
print('Simple precision scores: ', cv_svm_pre, '\n')

print('LR: ','\n')
print(f'mean recall : {np.mean(cv_lr_re):.3f} +- {np.std(cv_lr_re):.3f}')
print(f'mean precision : {np.mean(cv_lr_re):.3f} +- {np.std(cv_lr_re):.3f}')

print('GNB: ','\n')
print(f'mean recall : {np.mean(cv_gnb_re):.3f} +- {np.std(cv_gnb_re):.3f}')
print(f'mean precision : {np.mean(cv_gnb_re):.3f} +- {np.std(cv_gnb_re):.3f}')

print('KNN: ','\n')
print(f'mean recall : {np.mean(cv_knn_re):.3f} +- {np.std(cv_knn_re):.3f}')
print(f'mean precision : {np.mean(cv_knn_re):.3f} +- {np.std(cv_knn_re):.3f}')

print('SVM: ','\n')
print(f'mean recall : {np.mean(cv_svm_re):.3f} +- {np.std(cv_svm_re):.3f}')
print(f'mean precision : {np.mean(cv_svm_re):.3f} +- {np.std(cv_svm_re):.3f}')

LR:  

Simple recall scores:  [0.0, 0.0, 0.0]
Simple precision scores:  [0.0, 0.0, 0.0] 

GNB:  

Simple recall scores:  [0.0, 0.0, 0.0]
Simple precision scores:  [0.0, 0.0, 0.0] 

KNN:  

Simple recall scores:  [0.0, 0.0, 0.0]
Simple precision scores:  [0.0, 0.0, 0.0] 

SVM:  

Simple recall scores:  [0.0, 0.0, 0.0]
Simple precision scores:  [0.0, 0.0, 0.0] 

LR:  

mean recall : 0.000 +- 0.000
mean precision : 0.000 +- 0.000
GNB:  

mean recall : 0.000 +- 0.000
mean precision : 0.000 +- 0.000
KNN:  

mean recall : 0.000 +- 0.000
mean precision : 0.000 +- 0.000
SVM:  

mean recall : 0.000 +- 0.000
mean precision : 0.000 +- 0.000
