# Handling missing values II

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline

In [5]:
# import and prepare data
df = pd.read_csv('../data/house-votes-84.csv')
df.columns = ['party', 'infants', 'water', 'budget', 'physician', 'salvador',
       'religious', 'satellite', 'aid', 'missile', 'immigration', 'synfuels',
       'education', 'superfund', 'crime', 'duty_free_exports', 'eaa_rsa']
df.sample(10).T

Unnamed: 0,151,53,339,328,326,137,196,126,285,343
party,democrat,democrat,republican,democrat,republican,democrat,republican,democrat,democrat,democrat
infants,n,y,n,y,n,n,n,n,n,n
water,y,y,n,?,y,n,?,?,n,n
budget,y,y,n,y,n,y,y,y,y,y
physician,n,n,y,n,y,n,y,n,n,n
salvador,n,n,y,n,y,n,y,n,n,n
religious,y,y,y,y,y,y,y,n,y,n
satellite,n,?,n,y,n,y,n,y,?,y
aid,y,y,n,y,n,y,n,y,y,y
missile,y,n,n,y,n,y,n,y,?,y


In [7]:
# transform data
df.replace(to_replace='n', value=0, inplace=True)
df.replace(to_replace='y', value=1, inplace=True)
df.replace(to_replace='?', value=np.nan, inplace=True)
df.sample(5).T

Unnamed: 0,259,158,31,412,153
party,democrat,democrat,democrat,republican,republican
infants,1,0,1,1,0
water,0,1,1,1,1
budget,1,1,1,1,0
physician,0,0,0,1,1
salvador,0,,0,1,1
religious,0,1,0,1,1
satellite,1,1,1,1,0
aid,1,1,1,1,0
missile,1,1,1,0,0


In [10]:
X = df.drop('party', axis=1).values
y = df.party.values

print(type(X), X.shape)
print(type(y), y.shape)

<class 'numpy.ndarray'> (434, 16)
<class 'numpy.ndarray'> (434,)


In [11]:
# print the number of NaNs
df.isnull().sum()

party                  0
infants               12
water                 48
budget                11
physician             11
salvador              15
religious             11
satellite             14
aid                   15
missile               22
immigration            7
synfuels              20
education             31
superfund             25
crime                 17
duty_free_exports     28
eaa_rsa              104
dtype: int64

Create a pipeline with two steps, **imputation**, followed by the instantiation of a classifier. We'll use sklearn's **SVM** (Support Vector Machine) classifier. It works just like sklearn's other classifiers, **knn**, **logisitc regression** and **decision tree**, having the same `.fit()` and `.predict()` methods.

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Setup the Imputation transformer to impute missing data (represented as 'NaN') 
# the 'most_frequent' strategy, applied to each column (axis=0).
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)

# Instantiate the classifier, SVC (Support Vector Classification) is a type of SVM.
clf = SVC()

# Setup the pipeline with the required steps:
# the first tuple should consist of the imputation step
# the second should consist of the classifier.
steps = [('imputation', imp),
        ('SVM', clf)]
steps



[('imputation',
  Imputer(axis=0, copy=True, missing_values='NaN', strategy='most_frequent',
      verbose=0)),
 ('SVM', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))]

Having setup the steps of the pipeline in the previous exercise, you will now use it on the voting dataset to classify a Congressman's party affiliation. What makes pipelines so incredibly useful is the simple interface that they provide. You can use the .fit() and .predict() methods on pipelines just as you did with your classifiers and regressors!

In [19]:
# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the pipeline to the train set
pipeline.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

# Compute metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    democrat       1.00      0.93      0.96        83
  republican       0.89      1.00      0.94        48

   micro avg       0.95      0.95      0.95       131
   macro avg       0.94      0.96      0.95       131
weighted avg       0.96      0.95      0.95       131



