# Preprocessing

In [2]:
import os
os.chdir('/Users/Vincent/Desktop/Python/DataCamp/Data')
import pandas as pd
import numpy as np

## Missing data

In [3]:
votes = pd.read_csv('house-votes-84.csv')
votes.columns = ['party','infants', 'water', 'budget', 'physician', 'salvador', 'religious',
       'satellite', 'aid', 'missile', 'immigration', 'synfuels', 'education',
       'superfund', 'crime', 'duty_free_exports', 'eaa_rsa']
votes = votes.replace(to_replace=['n', 'y'], value=[0, 1])
#Convert '?' to NaN
votes[votes == "?"] = np.nan

# Print the number of NaNs
print(votes.isnull().sum())
# Print shape of original DataFrame
print("Shape of Original DataFrame: {}".format(votes.shape))

# Drop missing values and print shape of new DataFrame
votes = votes.dropna()
print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(votes.shape))

party                  0
infants               12
water                 48
budget                11
physician             11
salvador              15
religious             11
satellite             14
aid                   15
missile               22
immigration            7
synfuels              20
education             31
superfund             25
crime                 17
duty_free_exports     28
eaa_rsa              104
dtype: int64
Shape of Original DataFrame: (434, 17)
Shape of DataFrame After Dropping All Rows with Missing Values: (232, 17)


In [31]:
ufo = pd.read_csv('ufo_sightings_large.csv')

# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() & 
          ufo["state"].notnull() & 
          ufo["type"].notnull()]

print(ufo_no_missing.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4283, 11)


# Date-time

In [32]:
import warnings
warnings.filterwarnings("ignore")

def return_minutes(time_string):

    # Use \d+ to grab digits
    pattern = re.compile(r"\d+")
    
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
# Apply the extraction to the length_of_time column
ufo_no_missing["minutes"] = ufo_no_missing["length_of_time"].apply(lambda row: return_minutes(row))

# Take a look at the head of both of the columns
print(ufo_no_missing[["length_of_time","minutes"]].head())

    length_of_time  minutes
0          2 weeks      2.0
1           30sec.     30.0
3  about 5 minutes      NaN
4                2      2.0
5       10 minutes     10.0


In [33]:
# Check the column types
print(ufo[["seconds", "date"]].dtypes)

# Change the type of seconds to float
ufo["seconds"] = ufo["seconds"].astype(float)
# Change the date column to type datetime
ufo["date"] = pd.to_datetime(ufo["date"])

# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda row: row.month)

# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda row: row.year)

print(ufo[["date", "month", "year"]].head())

seconds    float64
date        object
dtype: object
                 date  month  year
0 2011-11-03 19:21:00     11  2011
1 2004-10-03 19:05:00     10  2004
2 2009-09-25 21:00:00      9  2009
3 2002-11-21 05:45:00     11  2002
4 2010-08-19 12:55:00      8  2010


## Stratified sampling

In [34]:
volunteer['category_desc'].value_counts()

Strengthening Communities    307
Helping Neighbors in Need    119
Education                     92
Health                        52
Environment                   32
Emergency Preparedness        15
Name: category_desc, dtype: int64

In [36]:
from sklearn.model_selection import train_test_split

volunteer_X = volunteer.drop("category_desc", axis=1)
volunteer_y = volunteer[["category_desc"]]

# Use stratified sampling to split up the dataset according to the class distribution
X_train, X_test, y_train, y_test = train_test_split(volunteer_X, volunteer_y, stratify=volunteer_y)
print(y_train["category_desc"].value_counts())

Strengthening Communities    230
Helping Neighbors in Need     89
Education                     69
Health                        39
Environment                   24
Emergency Preparedness        11
Name: category_desc, dtype: int64


# Categorical feature encoding

In [38]:
df = pd.read_csv('gm_2008_region.csv')
# Create dummy variables: df_region
df_region = pd.get_dummies(df)
print(df_region.columns)

# Create dummy variables without region_America
df_region = pd.get_dummies(df, drop_first=True)
print(df_region.columns)

# Append design matrix with dummy variables for categorical variable
X = df.drop(['life', 'Region'], axis=1)
X = np.append(X, df_region, axis=1)
y = df['life']

Index(['population', 'fertility', 'HIV', 'CO2', 'BMI_male', 'GDP',
       'BMI_female', 'life', 'child_mortality', 'Region_America',
       'Region_East Asia & Pacific', 'Region_Europe & Central Asia',
       'Region_Middle East & North Africa', 'Region_South Asia',
       'Region_Sub-Saharan Africa'],
      dtype='object')
Index(['population', 'fertility', 'HIV', 'CO2', 'BMI_male', 'GDP',
       'BMI_female', 'life', 'child_mortality', 'Region_East Asia & Pacific',
       'Region_Europe & Central Asia', 'Region_Middle East & North Africa',
       'Region_South Asia', 'Region_Sub-Saharan Africa'],
      dtype='object')


In [39]:
# Use Pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == "us" else 0)

# Print the number of unique type values
print(len(ufo["type"].unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo["type"])

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

22


In [40]:
# Ames housing data (unprocessed)
df = pd.read_csv('ames_unprocessed_data.csv')

# label encoding
from sklearn.preprocessing import LabelEncoder

# Fill missing values with 0
df.LotFrontage = df.LotFrontage.fillna(0)

# Create a boolean mask for categorical columns
categorical_mask = (df.dtypes == object)

# Get list of categorical column names
categorical_columns = df.columns[categorical_mask].tolist()
print(df[categorical_columns].head())

le = LabelEncoder()
df[categorical_columns] = df[categorical_columns].apply(lambda x: le.fit_transform(x))

print(df[categorical_columns].head())

  MSZoning Neighborhood BldgType HouseStyle PavedDrive
0       RL      CollgCr     1Fam     2Story          Y
1       RL      Veenker     1Fam     1Story          Y
2       RL      CollgCr     1Fam     2Story          Y
3       RL      Crawfor     1Fam     2Story          Y
4       RL      NoRidge     1Fam     2Story          Y
   MSZoning  Neighborhood  BldgType  HouseStyle  PavedDrive
0         3             5         0           5           2
1         3            24         0           2           2
2         3             5         0           5           2
3         3             6         0           5           2
4         3            15         0           5           2


In [41]:
import warnings
warnings.filterwarnings("ignore")
# One hot encoding
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categorical_features=categorical_mask,sparse=False)
# ouptput = Numpy array!
df_encoded = ohe.fit_transform(df)

# Print the shape 
print(df.shape)
print(df_encoded.shape)

(1460, 21)
(1460, 62)


In [42]:
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn_pandas import CategoricalImputer
from sklearn.pipeline import FeatureUnion

df = pd.read_csv('chronic_kidney_disease.csv', header=None)
df.columns = ['age', 'bp', 'sg', 'al', 'su', 'bgr',
              'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv',
              'wc', 'rc', 'rbc', 'pc', 'pcc', 'ba',
              'htn', 'dm', 'cad', 'appet', 'pe', 'ane','ckd']
X, y = df.iloc[:,:-1], df.iloc[:,-1]
df[df == "?"] = np.nan

for i in range(0,5):
    X.iloc[:,i]=X.iloc[:,i].astype(float)
for i in range(9,18):
    X.iloc[:,i]=X.iloc[:,i].astype(float)

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Create a boolean mask for categorical columns
categorical_feature_mask = X.dtypes == object
categorical_columns = X.columns[categorical_feature_mask].tolist()
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature],SimpleImputer(strategy="median")) 
                                             for numeric_feature in non_categorical_columns],
                                            input_df=True,
                                            df_out=True
                                           )
# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) 
                                                 for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )
# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
                                          ("num_mapper", numeric_imputation_mapper),
                                          ("cat_mapper", categorical_imputation_mapper)
                                         ])

age        9
bp        12
sg        47
al        46
su        49
bgr      152
bu        65
sc         4
sod        4
pot       44
hemo      19
pcv       17
wc        87
rc        88
rbc       52
pc        71
pcc      106
ba       131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
dtype: int64


# Pipelines

In [57]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings

warnings.filterwarnings("ignore")

y = votes.loc[:,'party'] 
X = votes.drop('party', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Setup the pipeline steps: steps
steps = [('imputation', SimpleImputer(missing_values='NaN', strategy='most_frequent')),
        ('SVM', SVC())]
pipeline = Pipeline(steps)

# Fit the pipeline to the train set
pipeline.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

# Compute metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    democrat       0.97      0.97      0.97        36
  republican       0.97      0.97      0.97        34

   micro avg       0.97      0.97      0.97        70
   macro avg       0.97      0.97      0.97        70
weighted avg       0.97      0.97      0.97        70



In [58]:
# Import the necessary modules
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

steps = [('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the pipeline to the training set
knn_scaled = pipeline.fit(X_train, y_train)

# Instantiate and fit a k-NN classifier to the unscaled data
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)

# Compute and print metrics
print('Accuracy with Scaling: {}'.format(knn_scaled.score(X_test, y_test)))
print('Accuracy without Scaling: {}'.format(knn_unscaled.score(X_test, y_test)))

Accuracy with Scaling: 0.9714285714285714
Accuracy without Scaling: 0.9714285714285714
