## Republican or Democrat

For this exercise we are going to use data from the 1984 United States Congressional Voting Records Database (take a look at the data dictionary) to predict whether a member of Congress is a Republican or a Democrat.

In [3]:
## Basic packages
import numpy as np
import pandas as pd

## Graphing packages
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

In [4]:
## Define the column/variable/feature names
columns = [
    "class",
    "handicapped_infants",
    "water_project_cost",
    "adoption_of_the_budget_resolution",
    "physician_fee_freeze",
    "el_salvador_aid",
    "religious_groups_in_schools",
    "anti_satellite_test_ban",
    "aid_to_nicaraguan_contras",
    "mx_missile",
    "immigration",
    "synfuels_corporation_cutback",
    "education_spending",
    "superfund_right_to_sue",
    "crime",
    "duty_free_exports",
    "export_administration_act_south_africa"
]


# We are going to read the data directly from the web
csv_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"

# Read the data; the column names are supplied through `names`
house_df = pd.read_csv(csv_url, names=columns)

# Encode the target as a binary variable: 0 for republican, 1 for democrat.
# An explicit lookup is used instead of "everything that is not republican is
# democrat" so that a missing, misspelled or whitespace-padded label is reported
# rather than silently counted as a democrat.
party_codes = {"republican": 0, "democrat": 1}
unknown_labels = set(house_df["class"].unique()) - set(party_codes)
if unknown_labels:
    raise ValueError(
        "Unexpected values in the 'class' column: %r" % sorted(unknown_labels, key=str)
    )

house_df["class"] = house_df["class"].map(party_codes)

In [5]:
# Print head of the data set

# ADD YOUR CODE HERE TO PRINT THE HEAD 

Unnamed: 0,class,handicapped_infants,water_project_cost,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa
0,0,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,0,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,1,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,1,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,1,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [6]:
## Let's clean the dataset
# '?' marks a missing vote. Turn it into NaN, then fill each gap with the vote of
# the previous row (forward fill). A '?' in the very first row has no previous row
# to copy from (row 0 has one in synfuels_corporation_cutback), so a backward fill
# afterwards picks up those leftovers and no NaN reaches the dummy encoding.
# The result is assigned back instead of using inplace=True so the cell can be
# re-run safely and reads as a single chained transformation.
house_df = house_df.replace('?', np.nan).ffill().bfill()

In [5]:
## Create dummy variables: apply pd.get_dummies to the data set so that every
## y/n vote column becomes a pair of indicator columns (<column>_n and <column>_y)

df_dummies = # ADD YOUR CODE HERE
# Preview the first three rows of the encoded frame
df_dummies.head(3)

Unnamed: 0,class,handicapped_infants_n,handicapped_infants_y,water_project_cost_n,water_project_cost_y,adoption_of_the_budget_resolution_n,adoption_of_the_budget_resolution_y,physician_fee_freeze_n,physician_fee_freeze_y,el_salvador_aid_n,...,education_spending_n,education_spending_y,superfund_right_to_sue_n,superfund_right_to_sue_y,crime_n,crime_y,duty_free_exports_n,duty_free_exports_y,export_administration_act_south_africa_n,export_administration_act_south_africa_y
0,0,1,0,0,1,1,0,0,1,0,...,0,1,0,1,0,1,1,0,0,1
1,0,1,0,0,1,1,0,0,1,0,...,0,1,0,1,0,1,1,0,0,1
2,1,1,0,0,1,0,1,0,1,0,...,1,0,0,1,0,1,1,0,1,0


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
'''Define y and X'''
# OUR TARGET COLUMN IS "class"
# "class" was already encoded as 0 (republican) / 1 (democrat) when the data was
# loaded, so it can be used directly as a one-dimensional target

y = # YOUR CODE HERE
# Every column except the target is used as a feature
columns_ = df_dummies.columns.tolist()
exclude_col = ['class']

X = df_dummies[[i for i in columns_ if i not in exclude_col]]

# print the X and y shapes
print (# YOUR CODE HERE )

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

# Shapes of the training and test partitions
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(435, 32) (435,)
(304, 32) (304,)
(131, 32) (131,)


In [8]:
from sklearn.linear_model import LogisticRegression

# Use scikit-learn: build a logistic regression with its default settings,
# then fit it on the training split only (the test split stays unseen)
r_d_logistic = LogisticRegression()
r_d_logistic.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
'''Baseline'''
'''Remeber that 0 is republican 1 is democrat'''

# print how many democrats (1) and republicans (0) there are in the data set

print (#YOUR CODE HERE, "\n" )

# The mean of a 0/1 column is the share of 1s, i.e. the fraction of democrats.
# Always predicting the majority class would be right this often, so this is the
# accuracy the model has to beat.
print ("if I randomly choose, %.0f percent of the time I/we will be choosing democrat" 
        % ((np.mean(df_dummies['class']))*100))

1    267
0    168
Name: class, dtype: int64 

if I randomly choose, 61 percent of the time I/we will be choosing democrat


In [11]:
from sklearn.metrics import confusion_matrix, classification_report

## predicting: get the predicted class (0/1) for every row of the test set
y_pred = r_d_logistic.predict(X_test)

# apply confusion_matrix to the true and the predicted test labels; with the
# labels used below, rows must be the true classes and columns the predictions

confmat = confusion_matrix(# your code here)


# Wrap the raw array in a DataFrame so every row/column says which class it stands for
confusion = pd.DataFrame(confmat, index=['True_Label_0 Republican', 'True_Label_1 Democrat'],
                         columns=['Predict_Label_0 Republican', 'Predict_Label_1 Democrat'])

confusion

Unnamed: 0,Predict_Label_0 Republican,Predict_Label_1 Democrat
True_Label_0 Republican,49,2
True_Label_1 Democrat,3,77


# Let's get the TP, FP, TN, FN from the confusion matrix

In [13]:
# scikit-learn's binary metrics (precision_score, recall_score, ...) treat label 1
# as the positive class by default, so "positive" here means Democrat (1) and
# "negative" means Republican (0). Rows of `confusion` are the true labels,
# columns are the predicted labels.

# Democrats correctly predicted as Democrat
TP = confusion.loc['True_Label_1 Democrat', 'Predict_Label_1 Democrat']

# Republicans wrongly predicted as Democrat
FP = confusion.loc['True_Label_0 Republican', 'Predict_Label_1 Democrat']

# Republicans correctly predicted as Republican
TN = confusion.loc['True_Label_0 Republican', 'Predict_Label_0 Republican']

# Democrats wrongly predicted as Republican
FN = confusion.loc['True_Label_1 Democrat', 'Predict_Label_0 Republican']

# Pair every count with its name; sorted() orders the pairs alphabetically by name
values = sorted(zip(['True Positives','False Positives','True Negatives','False Negatives'], [TP, FP, TN, FN]))
values

[('False Negatives', 3),
 ('False Positives', 2),
 ('True Negatives', 49),
 ('True Positives', 77)]

# Calculate accuracy, Misclassification Rate (Error Rate), Precision, Recall

In [14]:
## Accuracy
## How often is the classifier correct?
from sklearn.metrics import accuracy_score

# compute the accuracy_score for y_test and y_pred
acc = accuracy_score(# your code here)
    
# multiply by 100 to report the score as a percentage
print ("Accuracy score: %.3f" %(acc*100))

Accuracy score: 96.183


In [15]:
## Misclassification Rate (Error Rate)
## How often is the model wrong?
# hint: add the False Positives and the False Negatives (FP and FN from the
# confusion matrix) and divide by the number of test rows

print ("Error rate: %.3f" % (((#Your CODE here ####)) / float(len(y_test))*100))

Error rate: 3.817


In [16]:
## Precision
## Of all the rows predicted as the positive class, the fraction that really
## belongs to it: TP / (TP + FP)
## NOTE: precision_score treats label 1 (democrat) as the positive class by default
from sklearn.metrics import precision_score

# compute the precision_score of y_test and y_pred

pcs = # your code here
print ("Precision: %.3f" %(pcs*100))

Precision: 97.468


In [17]:
## Recall
## Of all the rows that really belong to the positive class, the fraction the
## classifier identified: TP / (TP + FN)
## NOTE: recall_score treats label 1 (democrat) as the positive class by default
from sklearn.metrics import recall_score

# compute the recall_score of y_test and y_pred
rcs = recall_score(#your code here)
print ("Recall: %.3f" % (rcs*100))

Recall: 96.250


In [18]:
# print a classification report

              precision    recall  f1-score   support

           0      0.942     0.961     0.951        51
           1      0.975     0.963     0.969        80

    accuracy                          0.962       131
   macro avg      0.958     0.962     0.960       131
weighted avg      0.962     0.962     0.962       131

