In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2, SelectKBest
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score, classification_report
from scipy.stats.distributions import uniform
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Regression

[Dataset](https://archive.ics.uci.edu/ml/datasets/air+quality)

In [2]:
df = pd.read_excel('AirQualityUCI.xlsx')

In [3]:
df.shape

(9357, 15)

In [4]:
df.dtypes

Date             datetime64[ns]
Time                     object
CO(GT)                  float64
PT08.S1(CO)             float64
NMHC(GT)                  int64
C6H6(GT)                float64
PT08.S2(NMHC)           float64
NOx(GT)                 float64
PT08.S3(NOx)            float64
NO2(GT)                 float64
PT08.S4(NO2)            float64
PT08.S5(O3)             float64
T                       float64
RH                      float64
AH                      float64
dtype: object

In [5]:
df = df.drop(['AH', 'Date', 'Time'], axis=1)

In [6]:
df.columns

Index(['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
       'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
       'T', 'RH'],
      dtype='object')

In [7]:
df_train, df_test = train_test_split(df)

# Variance Inflation Factor

In [8]:
X = df_train.drop('RH', axis=1)

In [9]:
X = add_constant(X)
vifs = pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)
vifs

  return ptp(axis=axis, out=out, **kwargs)


const            358.067816
CO(GT)             1.913497
PT08.S1(CO)       22.105974
NMHC(GT)           1.477787
C6H6(GT)         101.793475
PT08.S2(NMHC)     23.142902
NOx(GT)            6.089215
PT08.S3(NOx)       6.845310
NO2(GT)            4.664109
PT08.S4(NO2)      10.050466
PT08.S5(O3)       11.590680
T                 58.741003
dtype: float64

In [10]:
vifs[1:].max()

101.7934747001686

In [11]:
while vifs[1:].max() > 5:
    print(vifs[1:].idxmax())
    X.drop(
        vifs[1:].idxmax(),
        axis=1,
        inplace=True
    )
    vifs = pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)

X.head()    

C6H6(GT)
PT08.S2(NMHC)
PT08.S1(CO)
T
NOx(GT)


Unnamed: 0,const,CO(GT),NMHC(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3)
6504,1.0,3.1,-200,569.5,123.0,1619.5,1388.5
126,1.0,2.1,155,852.333333,103.0,1731.333333,1271.666667
8156,1.0,1.1,-200,666.25,92.3,1278.25,821.75
8276,1.0,1.6,-200,774.75,146.0,970.5,756.75
30,1.0,1.7,55,1253.5,97.0,1375.0,815.5


In [12]:
vifs

const           19.350456
CO(GT)           1.894898
NMHC(GT)         1.052237
PT08.S3(NOx)     1.277297
NO2(GT)          2.141307
PT08.S4(NO2)     2.846438
PT08.S5(O3)      2.971297
dtype: float64

In [13]:
vif_keep_cols = ['RH'] + list(X.drop('const', axis=1).columns)

In [14]:
vif_keep_cols

['RH',
 'CO(GT)',
 'NMHC(GT)',
 'PT08.S3(NOx)',
 'NO2(GT)',
 'PT08.S4(NO2)',
 'PT08.S5(O3)']

# Correlation

In [15]:
corrs = df_train.corr()['RH'].abs().sort_values(ascending=False)
corrs

RH               1.000000
C6H6(GT)         0.929167
T                0.892274
PT08.S1(CO)      0.756462
PT08.S4(NO2)     0.652371
PT08.S2(NMHC)    0.598849
PT08.S3(NOx)     0.584391
PT08.S5(O3)      0.537417
NO2(GT)          0.086953
NOx(GT)          0.069487
CO(GT)           0.050472
NMHC(GT)         0.007804
Name: RH, dtype: float64

In [16]:
corr_keep_cols = corrs.index[:8]
corr_keep_cols

Index(['RH', 'C6H6(GT)', 'T', 'PT08.S1(CO)', 'PT08.S4(NO2)', 'PT08.S2(NMHC)',
       'PT08.S3(NOx)', 'PT08.S5(O3)'],
      dtype='object')

# Randomized Search CV

In [17]:
X_train = df_train[vif_keep_cols].iloc[:,1:]
y_train = df_train[vif_keep_cols].iloc[:,0]
X_train.columns

Index(['CO(GT)', 'NMHC(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)',
       'PT08.S5(O3)'],
      dtype='object')

In [18]:
X_test = df_test[vif_keep_cols].iloc[:,1:]
y_test = df_test[vif_keep_cols].iloc[:,0]
y_test.name

'RH'

In [19]:
rcv = RandomizedSearchCV(RandomForestRegressor(),
                  {'n_estimators': range(10,100, 10),
                   'min_samples_split': uniform()
                  },
                  scoring=make_scorer(mean_squared_error))

In [20]:
rcv = rcv.fit(X_train, y_train)



In [21]:
pd.DataFrame(data=rcv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.37218,0.003444,0.012079,0.000223,0.270965,50,"{'min_samples_split': 0.2709646947892821, 'n_e...",250.610319,254.902007,254.284068,253.265465,1.894344,7
1,0.078325,0.00042,0.004538,0.000214,0.243886,10,"{'min_samples_split': 0.2438858553354759, 'n_e...",252.78738,250.080266,252.814259,251.893968,1.282528,8
2,0.059236,0.000416,0.00738,8.4e-05,0.666124,40,"{'min_samples_split': 0.6661236457476513, 'n_e...",3297.60317,2612.125066,2351.994518,2753.907585,398.848879,2
3,0.11514,0.002635,0.012878,0.000186,0.684886,80,"{'min_samples_split': 0.6848857131953626, 'n_e...",3296.914751,2612.037065,2351.355028,2753.435615,398.761382,3
4,0.03153,0.000572,0.005165,4.5e-05,0.872321,20,"{'min_samples_split': 0.8723205106250693, 'n_e...",3297.275676,2611.994081,2353.043019,2754.104259,398.363573,1
5,0.160605,0.004422,0.006722,0.000333,0.197231,20,"{'min_samples_split': 0.19723129575454612, 'n_...",238.887176,242.202479,247.248622,242.779425,3.437838,9
6,0.181489,0.00148,0.007036,0.00024,0.136826,20,"{'min_samples_split': 0.13682552640932555, 'n_...",228.391109,234.11309,233.3845,231.9629,2.543092,10
7,0.504998,0.016481,0.016333,0.000346,0.47295,80,"{'min_samples_split': 0.4729498260664814, 'n_e...",261.545356,267.641008,266.93522,265.373861,2.722453,5
8,0.404011,0.050591,0.013344,0.000162,0.530764,70,"{'min_samples_split': 0.5307644817774131, 'n_e...",271.793603,273.685999,278.106265,274.528623,2.645113,4
9,0.54646,0.013849,0.017631,0.000478,0.327662,80,"{'min_samples_split': 0.3276624209145824, 'n_e...",257.507257,260.361348,259.863441,259.244016,1.244782,6


In [22]:
rfr = rcv.best_estimator_

In [23]:
y_predict = rfr.predict(X_test)
mean_squared_error(y_test, y_predict)

2243.5235142065667

# Classification

In [24]:
df = pd.read_csv('~/DATA/Iris.csv', index_col=0)

In [25]:
df.shape

(150, 5)

In [26]:
df.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [27]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1])

# Chi Squared

In [28]:
chi_best = SelectKBest(chi2, k=3)
X_train_fs = chi_best.fit_transform(X_train, y_train)
X_test_fs = chi_best.transform(X_test)

In [29]:
chi_best.pvalues_

array([3.10362864e-02, 2.13755627e-01, 5.12881988e-19, 1.76045283e-11])

In [30]:
X_train.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
147,6.3,2.5,5.0,1.9
27,5.0,3.4,1.6,0.4
88,6.3,2.3,4.4,1.3
15,5.8,4.0,1.2,0.2
33,5.2,4.1,1.5,0.1


In [31]:
X_train_fs[:5,:]

array([[6.3, 5. , 1.9],
       [5. , 1.6, 0.4],
       [6.3, 4.4, 1.3],
       [5.8, 1.2, 0.2],
       [5.2, 1.5, 0.1]])

In [32]:
rcv = RandomizedSearchCV(RandomForestClassifier(),
                  {'n_estimators': range(10,100, 10),
                   'min_samples_split': uniform()
                  },
                  )

In [33]:
rcv = rcv.fit(X_train, y_train)



In [34]:
pd.DataFrame(data=rcv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.089775,0.002026,0.011413,0.000145,0.797978,60,"{'min_samples_split': 0.7979781689491019, 'n_e...",0.333333,0.351351,0.333333,0.339286,0.008475,6
1,0.036712,0.006261,0.005339,6.9e-05,0.0803024,20,"{'min_samples_split': 0.08030235307295386, 'n_...",0.923077,1.0,0.944444,0.955357,0.03255,2
2,0.13299,0.000605,0.015952,0.000232,0.138136,90,"{'min_samples_split': 0.13813596522113558, 'n_...",0.923077,1.0,0.916667,0.946429,0.037718,4
3,0.122691,0.020552,0.013389,0.000638,0.39023,70,"{'min_samples_split': 0.3902302126813324, 'n_e...",0.923077,1.0,0.916667,0.946429,0.037718,4
4,0.103746,0.000739,0.013049,0.000313,0.342953,70,"{'min_samples_split': 0.3429533169469444, 'n_e...",0.923077,1.0,0.972222,0.964286,0.032139,1
5,0.086185,0.00066,0.011194,0.000384,0.000305664,60,"{'min_samples_split': 0.000305664441768716, 'n...",0.923077,0.972973,0.972222,0.955357,0.023596,2
6,0.031317,0.000273,0.005265,4.9e-05,0.996255,20,"{'min_samples_split': 0.9962552440904059, 'n_e...",0.333333,0.324324,0.333333,0.330357,0.004237,8
7,0.088975,0.001877,0.011478,0.00017,0.817627,60,"{'min_samples_split': 0.8176273047366948, 'n_e...",0.333333,0.351351,0.333333,0.339286,0.008475,6
8,0.112198,0.002311,0.01391,0.000739,0.86049,80,"{'min_samples_split': 0.86048953404825, 'n_est...",0.333333,0.324324,0.333333,0.330357,0.004237,8
9,0.089002,0.002441,0.01163,0.000345,0.816013,60,"{'min_samples_split': 0.8160133572388177, 'n_e...",0.333333,0.324324,0.333333,0.330357,0.004237,8


In [35]:
rfc = rcv.best_estimator_

In [36]:
y_predict = rfc.predict(X_test)
accuracy_score(y_test, y_predict)

0.9473684210526315

# Exercise 1

Run feature selection and hyperparameter optimization with randomized search cross validation on the [crowdsource mapping](https://archive.ics.uci.edu/ml/datasets/Crowdsourced+Mapping) dataset using a SVM

In [None]:
#imports

In [None]:
# load the dataset
train_df = pd.read_csv(#fill in the path to file)
test_df = pd.read_csv(#fill in the path to file)


In [None]:
#slice out the features
X_train = train_df[]
X_test = test_df[]

y_train = train_df[]
y_test = test_df[]

In [None]:
#feature selection

In [None]:
rcv = RandomizedSearchCV(
#classifier
#dict containing parameters and ranges
)

In [None]:
svc = rcv.best_estimator_
y_pred = svc.predict(X_test)
classification_report(y_test, y_pred)

# Exercise 2

Run feature selection and hyperparameter optimization with randomized search cross validation on the Air Quality dataset using a SVM

In [None]:
#imports

In [None]:
# load the dataset
train_df = pd.read_csv(#fill in the path to file)
test_df = pd.read_csv(#fill in the path to file)


In [None]:
#slice out the features
X_train = train_df[]
X_test = test_df[]

y_train = train_df[]
y_test = test_df[]

In [None]:
#feature selection

In [None]:
rcv = RandomizedSearchCV(
#classifier
#dict containing parameters and ranges
)

In [None]:
svc = rcv.best_estimator_
y_pred = svc.predict(X_test)
classification_report(y_test, y_pred)