In [1]:
import numpy as np
import pandas as pd


# Centering and Scaling

Why scale your data?

- Many models use some form of distance to inform them
- Features on larger scales can unduly influence the model 
- We want features to be on a similar scale 
- Normalizing (i.e. centering and scaling) the features achieves this


In [2]:
# Load the red wine data set from a CSV file (path is relative to the current working directory)
df = pd.read_csv("datasets/red_wine.csv")


In [3]:
# Summary statistics per column; a count below the 1599 rows points at missing values
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1595.0,1597.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.838871,46.428929,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.423696,32.89757,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [4]:
# Preview the first rows to see the features and the `quality` target
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# Count missing entries per column; any non-zero count means the modelling pipeline must impute
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     4
total sulfur dioxide    2
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [6]:
# Split the frame into the target (wine quality score) and the feature matrix (everything else)
y = df["quality"]
X = df.drop(columns="quality")

In [7]:
# Distribution of the raw quality scores, rarest score first
y.value_counts(ascending=True)

3     10
8     18
4     53
7    199
6    638
5    681
Name: quality, dtype: int64

In [8]:
# convert target from multi class to binary (ie quality is poor(3-5)= 0 and good(6-8)= 1)

def to_binary(x):
    """Map a wine quality score to a binary label.

    Scores below 6 are labelled poor (0); every other score is labelled good (1).
    """
    return 0 if x < 6 else 1

In [9]:
# Build the binary target from the raw `quality` column rather than from `y` itself.
# `y = y.apply(to_binary)` is not idempotent: running the cell a second time would feed the
# already-binary 0/1 labels back through to_binary and turn every label into 0.
y = df["quality"].apply(to_binary)
y

0       0
1       0
2       0
3       1
4       0
       ..
1594    0
1595    1
1596    1
1597    0
1598    1
Name: quality, Length: 1599, dtype: int64

Ways to normalize your data

- Standardization: subtract the mean and divide by the standard deviation
- All features are centered around zero and have variance one 


- Can also subtract the minimum and divide by the range 
- Minimum zero and maximum one


- Can also normalize so the data ranges from -1 to +1

**Scaling in Scikit-learn**

In [10]:
from sklearn.preprocessing import scale

# Standardize the whole feature matrix in one call; the result is a NumPy array, not a DataFrame.
# NOTE(review): X still holds the missing sulfur-dioxide values counted earlier and nothing is
# imputed here - presumably scale() passes NaNs through; confirm for the installed version.
X_scaled = scale(X)

In [11]:
# Wrap the scaled array in a DataFrame that reuses X's column labels,
# so scaled and unscaled columns can be compared by name
cols = X.columns
Z = pd.DataFrame(data=X_scaled, columns=cols)

In [12]:
# Mean and standard deviation of one feature before scaling.
# np.std uses the population formula (ddof=0) by default, hence the small difference from the
# sample std reported by describe().
np.mean(X['total sulfur dioxide']), np.std(X['total sulfur dioxide'])

(46.42892924232937, 32.88726863072334)

In [13]:
# Same feature after scaling: the mean is ~0 and the standard deviation ~1 (up to float rounding)
np.mean(Z['total sulfur dioxide']), np.std(Z['total sulfur dioxide'])

(7.388162300691203e-17, 0.9999999999999988)

**Scaling in a pipeline**

In [14]:
# scaled data

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Impute missing values, standardize the features, then classify with k-nearest neighbours
steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps)

# fit() returns the fitted pipeline itself, so knn_scaled and pipeline name the same object
knn_scaled = pipeline.fit(X_train, y_train)
y_pred = knn_scaled.predict(X_test)

# Test-set accuracy of the scaled model
accuracy_score(y_test, y_pred)



0.75

In [15]:
# unscaled data

# Same split as for the scaled model (same seed), so the two accuracies are directly comparable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Baseline without a scaling step: impute, then go straight to k-nearest neighbours
steps = [
    ('imputer', SimpleImputer()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps)

# fit() returns the fitted pipeline itself, so knn_unscaled and pipeline name the same object
knn_unscaled = pipeline.fit(X_train, y_train)
y_pred = knn_unscaled.predict(X_test)

# Test-set accuracy of the unscaled baseline
accuracy_score(y_test, y_pred)


0.65

**CV and scaling in a pipeline**

In [16]:
from sklearn.model_selection import GridSearchCV

# Hold out 20% of the rows; the grid search only ever sees the training portion
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Impute -> standardize -> k-nearest neighbours
steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps)

# Hyperparameters of a pipeline step are addressed as <step name>__<parameter name>,
# e.g. knn__n_neighbors; here every k from 1 to 49 is tried
parameters = {'knn__n_neighbors': np.arange(1, 50)}

# Cross-validated search over the grid, fitted on the training set
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)

# Test-set predictions from the best configuration found
y_pred = cv.predict(X_test)

In [17]:
from sklearn.metrics import classification_report

# Report the winning hyperparameters, the score on the held-out test set,
# and per-class precision/recall/F1 for the test-set predictions
print(cv.best_params_)
print(cv.score(X_test, y_test))
print(classification_report(y_test, y_pred))

{'knn__n_neighbors': 1}
0.753125
              precision    recall  f1-score   support

           0       0.71      0.76      0.73       144
           1       0.79      0.75      0.77       176

    accuracy                           0.75       320
   macro avg       0.75      0.75      0.75       320
weighted avg       0.76      0.75      0.75       320



**Pipeline for classification**

- using the SVM classifier. 
- The hyperparameters you tune are "C" and "gamma". 
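- "C" controls the regularization strength (it is the inverse of the strength: a smaller "C" means stronger regularization). It is analogous to the "C" tuned for logistic regression
- "gamma" is the kernel coefficient (used by the default RBF kernel)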

In [18]:
from sklearn.svm import SVC

# Hold out 20% of the data for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Impute -> standardize -> support vector classifier
steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('SVM', SVC()),
]
pipeline = Pipeline(steps)

# Candidate values, keyed as <step name>__<parameter name>
parameters = {
    'SVM__C': [1, 10, 100],
    'SVM__gamma': [0.1, 0.01],
}

# Cross-validated grid search, fitted on the training set only
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)

# Test-set predictions from the best configuration found
y_pred = cv.predict(X_test)

# Report the test-set score, per-class metrics and the winning hyperparameters
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))

Accuracy: 0.7625
              precision    recall  f1-score   support

           0       0.72      0.78      0.75       144
           1       0.81      0.74      0.78       176

    accuracy                           0.76       320
   macro avg       0.76      0.76      0.76       320
weighted avg       0.77      0.76      0.76       320

Tuned Model Parameters: {'SVM__C': 100, 'SVM__gamma': 0.01}


**Pipeline for Regression**

- using Gapminder dataset
- build a pipeline that imputes the missing data, scales the features, and fits an ElasticNet to the Gapminder data
- You will then tune the l1_ratio of your ElasticNet using GridSearchCV.



In [19]:
# Load the Gapminder data set.
# NOTE(review): the preview of this frame shows the header line repeated as data row 0, which is
# presumably why every column is later reported as object dtype - consider fixing this at read time.
df2 = pd.read_csv('datasets/Gapminder.csv')


In [20]:
# Preview the first rows of the Gapminder frame
df2.head()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region
0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region
1,34811059.0,2.73,0.1,3.328944661018629,24.5962,12314.0,129.9049,75.3,29.5,Middle East & North Africa
2,19842251.0,6.43,2.0,1.4743533878509398,22.25083,7103.0,130.1247,58.3,192.0,Sub-Saharan Africa
3,40381860.0,2.24,0.5,4.78516998252535,27.5017,14646.0,118.8915,75.5,15.4,America
4,2975029.0,1.4,0.1,1.8041062172001,25.35542,7383.0,132.8108,72.5,20.0,Europe & Central Asia


In [21]:
# Count missing entries per column
df2.isnull().sum()

population         0
fertility          0
HIV                0
CO2                0
BMI_male           0
GDP                0
BMI_female         0
life               0
child_mortality    0
Region             0
dtype: int64

In [22]:
# The row labelled 0 is a stray copy of the header line, not data - drop it by index label
df2 = df2.drop(index=0)

df2.head()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region
1,34811059.0,2.73,0.1,3.328944661018629,24.5962,12314.0,129.9049,75.3,29.5,Middle East & North Africa
2,19842251.0,6.43,2.0,1.4743533878509398,22.25083,7103.0,130.1247,58.3,192.0,Sub-Saharan Africa
3,40381860.0,2.24,0.5,4.78516998252535,27.5017,14646.0,118.8915,75.5,15.4,America
4,2975029.0,1.4,0.1,1.8041062172001,25.35542,7383.0,132.8108,72.5,20.0,Europe & Central Asia
5,21370348.0,1.96,0.1,18.0163132681972,27.56373,41312.0,117.3755,81.5,5.2,East Asia & Pacific


In [23]:
# The columns are still object dtype at this point, so describe() reports
# count/unique/top/freq instead of numeric statistics
df2.describe()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region
count,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139.0,139
unique,139.0,116.0,45.0,139.0,139.0,139.0,139.0,109.0,131.0,6
top,34811059.0,1.43,0.1,3.328944661018629,24.5962,12314.0,129.9049,75.3,4.7,Europe & Central Asia
freq,1.0,3.0,26.0,1.0,1.0,1.0,1.0,3.0,2.0,41


In [24]:
# Inspect the column dtypes: every column is reported as object
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 1 to 139
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   population       139 non-null    object
 1   fertility        139 non-null    object
 2   HIV              139 non-null    object
 3   CO2              139 non-null    object
 4   BMI_male         139 non-null    object
 5   GDP              139 non-null    object
 6   BMI_female       139 non-null    object
 7   life             139 non-null    object
 8   child_mortality  139 non-null    object
 9   Region           139 non-null    object
dtypes: object(10)
memory usage: 11.0+ KB


In [25]:
# One-hot encode the categorical `Region` column: it is replaced by one indicator
# column per region (no category is dropped)

df2 = pd.get_dummies(df2, columns=['Region'])

df2.head()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region_America,Region_East Asia & Pacific,Region_Europe & Central Asia,Region_Middle East & North Africa,Region_South Asia,Region_Sub-Saharan Africa
1,34811059.0,2.73,0.1,3.328944661018629,24.5962,12314.0,129.9049,75.3,29.5,0,0,0,1,0,0
2,19842251.0,6.43,2.0,1.4743533878509398,22.25083,7103.0,130.1247,58.3,192.0,0,0,0,0,0,1
3,40381860.0,2.24,0.5,4.78516998252535,27.5017,14646.0,118.8915,75.5,15.4,1,0,0,0,0,0
4,2975029.0,1.4,0.1,1.8041062172001,25.35542,7383.0,132.8108,72.5,20.0,0,0,1,0,0,0
5,21370348.0,1.96,0.1,18.0163132681972,27.56373,41312.0,117.3755,81.5,5.2,0,1,0,0,0,0


In [26]:
# Re-check the dtypes after encoding: the original columns are still object
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 1 to 139
Data columns (total 15 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   population                         139 non-null    object
 1   fertility                          139 non-null    object
 2   HIV                                139 non-null    object
 3   CO2                                139 non-null    object
 4   BMI_male                           139 non-null    object
 5   GDP                                139 non-null    object
 6   BMI_female                         139 non-null    object
 7   life                               139 non-null    object
 8   child_mortality                    139 non-null    object
 9   Region_America                     139 non-null    uint8 
 10  Region_East Asia & Pacific         139 non-null    uint8 
 11  Region_Europe & Central Asia       139 non-null    uint8 
 12  Region_M

In [27]:
# Convert all columns to floats.
# The strict default (errors='raise') is used on purpose: with errors='ignore' a value that cannot
# be parsed makes astype() silently hand back the unconverted object columns, and the problem
# would only surface later inside the model pipeline.
df2 = df2.astype('float')

# If some values genuinely cannot be parsed, an alternative is to coerce them to NaN column by
# column and let an imputer handle them, e.g.:
#     df2["population"] = pd.to_numeric(df2["population"], errors='coerce')

# Confirm that every column is now float64
print(df2.dtypes)


population                           float64
fertility                            float64
HIV                                  float64
CO2                                  float64
BMI_male                             float64
GDP                                  float64
BMI_female                           float64
life                                 float64
child_mortality                      float64
Region_America                       float64
Region_East Asia & Pacific           float64
Region_Europe & Central Asia         float64
Region_Middle East & North Africa    float64
Region_South Asia                    float64
Region_Sub-Saharan Africa            float64
dtype: object


In [28]:
# Preview the frame after the float conversion
df2.head()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region_America,Region_East Asia & Pacific,Region_Europe & Central Asia,Region_Middle East & North Africa,Region_South Asia,Region_Sub-Saharan Africa
1,34811059.0,2.73,0.1,3.328945,24.5962,12314.0,129.9049,75.3,29.5,0.0,0.0,0.0,1.0,0.0,0.0
2,19842251.0,6.43,2.0,1.474353,22.25083,7103.0,130.1247,58.3,192.0,0.0,0.0,0.0,0.0,0.0,1.0
3,40381860.0,2.24,0.5,4.78517,27.5017,14646.0,118.8915,75.5,15.4,1.0,0.0,0.0,0.0,0.0,0.0
4,2975029.0,1.4,0.1,1.804106,25.35542,7383.0,132.8108,72.5,20.0,0.0,0.0,1.0,0.0,0.0,0.0
5,21370348.0,1.96,0.1,18.016313,27.56373,41312.0,117.3755,81.5,5.2,0.0,1.0,0.0,0.0,0.0,0.0


In [29]:
# Regression target is life expectancy (`life`); every remaining column is a feature.
# Note that X and y are rebound here and no longer refer to the wine data.
y = df2["life"]
X = df2.drop(columns="life")

In [30]:
from sklearn.linear_model import ElasticNet

# Setup the pipeline steps: impute -> standardize -> ElasticNet.
# The imputer step matches the pipeline described for this section and the other pipelines in
# this notebook; the frame currently has no missing values, so it does not alter the data.
steps = [('imputer', SimpleImputer()),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space: 50 evenly spaced l1_ratio values between 0 and 1.
# NOTE(review): the coordinate-descent warnings in the recorded output presumably come from the
# l1_ratio=0 end of this grid - confirm before changing the range.
parameters = {'elasticnet__l1_ratio': np.linspace(0, 1, 50)}

# Create train and test sets (40% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Instantiate the GridSearchCV object and fit it to the training set
gm_cv = GridSearchCV(pipeline, parameters)
gm_cv.fit(X_train, y_train)

# Compute and print metrics. The tuned hyperparameter is l1_ratio, not alpha,
# so the label names it accordingly.
r2 = gm_cv.score(X_test, y_test)
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))





  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Tuned ElasticNet Alpha: {'elasticnet__l1_ratio': 1.0}
Tuned ElasticNet R squared: 0.8862016549771035
