In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2, SelectKBest
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score, classification_report
from scipy.stats.distributions import uniform
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Regression

[Dataset](https://archive.ics.uci.edu/ml/datasets/air+quality)

In [2]:
df = pd.read_excel('~/DATA/AirQualityUCI.xlsx')

In [3]:
df.shape

(9357, 15)

In [4]:
df.dtypes

Date             datetime64[ns]
Time                     object
CO(GT)                  float64
PT08.S1(CO)             float64
NMHC(GT)                  int64
C6H6(GT)                float64
PT08.S2(NMHC)           float64
NOx(GT)                 float64
PT08.S3(NOx)            float64
NO2(GT)                 float64
PT08.S4(NO2)            float64
PT08.S5(O3)             float64
T                       float64
RH                      float64
AH                      float64
dtype: object

In [5]:
df = df.drop(['AH', 'Date', 'Time'], axis=1)

In [6]:
df.columns

Index(['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
       'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
       'T', 'RH'],
      dtype='object')

In [7]:
df_train, df_test = train_test_split(df)

In [25]:
type(df_train)

pandas.core.frame.DataFrame

# Variance Inflation Factor

In [8]:
X = df_train.drop('RH', axis=1)

In [9]:
X = add_constant(X)
vifs = pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)
vifs

  return ptp(axis=axis, out=out, **kwargs)


const            364.222373
CO(GT)             1.855054
PT08.S1(CO)       21.301238
NMHC(GT)           1.506306
C6H6(GT)          91.172995
PT08.S2(NMHC)     21.751344
NOx(GT)            6.093490
PT08.S3(NOx)       6.627849
NO2(GT)            4.613600
PT08.S4(NO2)       9.336223
PT08.S5(O3)       11.153702
T                 52.473363
dtype: float64

In [10]:
vifs[1:].max()

91.1729952742211

In [11]:
while vifs[1:].max() > 5:
    print(vifs[1:].idxmax())
    X.drop(
        vifs[1:].idxmax(),
        axis=1,
        inplace=True
    )
    vifs = pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)

X.head()    

C6H6(GT)
PT08.S2(NMHC)
PT08.S1(CO)
T
NOx(GT)


Unnamed: 0,const,CO(GT),NMHC(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3)
334,1.0,2.7,230,828.75,126.0,1729.5,1002.25
7987,1.0,1.2,-200,981.25,134.5,790.25,557.5
1999,1.0,-200.0,-200,958.0,49.0,1692.0,942.25
6123,1.0,1.2,-200,854.5,159.0,999.25,1230.0
6641,1.0,2.4,-200,750.5,116.0,1084.5,1208.25


In [12]:
vifs

const           20.813413
CO(GT)           1.838591
NMHC(GT)         1.054263
PT08.S3(NOx)     1.277139
NO2(GT)          2.077360
PT08.S4(NO2)     2.669704
PT08.S5(O3)      2.836017
dtype: float64

In [13]:
vif_keep_cols = ['RH'] + list(X.drop('const', axis=1).columns)

In [14]:
vif_keep_cols

['RH',
 'CO(GT)',
 'NMHC(GT)',
 'PT08.S3(NOx)',
 'NO2(GT)',
 'PT08.S4(NO2)',
 'PT08.S5(O3)']

# Correlation

In [15]:
corrs = df_train.corr()['RH'].abs().sort_values(ascending=False)
corrs

RH               1.000000
C6H6(GT)         0.922151
T                0.880627
PT08.S1(CO)      0.739673
PT08.S4(NO2)     0.631788
PT08.S2(NMHC)    0.577291
PT08.S3(NOx)     0.567104
PT08.S5(O3)      0.517458
NO2(GT)          0.087954
NOx(GT)          0.063646
CO(GT)           0.048942
NMHC(GT)         0.006986
Name: RH, dtype: float64

In [16]:
corr_keep_cols = corrs.index[:8]
corr_keep_cols

Index(['RH', 'C6H6(GT)', 'T', 'PT08.S1(CO)', 'PT08.S4(NO2)', 'PT08.S2(NMHC)',
       'PT08.S3(NOx)', 'PT08.S5(O3)'],
      dtype='object')

# Randomized Search CV

In [17]:
X_train = df_train[vif_keep_cols].iloc[:,1:]
y_train = df_train[vif_keep_cols].iloc[:,0]
X_train.columns

Index(['CO(GT)', 'NMHC(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)',
       'PT08.S5(O3)'],
      dtype='object')

In [18]:
X_test = df_test[vif_keep_cols].iloc[:,1:]
y_test = df_test[vif_keep_cols].iloc[:,0]
y_test.name

'RH'

In [19]:
rcv = RandomizedSearchCV(RandomForestRegressor(),
                  {'n_estimators': range(10,100, 10),
                   'min_samples_split': uniform()
                  },
                  scoring=make_scorer(mean_squared_error))

In [20]:
rcv = rcv.fit(X_train, y_train)



In [21]:
pd.DataFrame(data=rcv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.118396,0.009259,0.005746,0.000119,0.555144,20,"{'min_samples_split': 0.5551441102247099, 'n_e...",282.294235,274.655754,283.806637,280.252208,4.005169,6
1,0.57804,0.009983,0.016615,0.000437,0.203818,70,"{'min_samples_split': 0.20381778343115864, 'n_...",250.577789,240.60584,247.690752,246.29146,4.189547,9
2,0.510029,0.006338,0.017264,0.002597,0.481191,80,"{'min_samples_split': 0.4811908537904537, 'n_e...",269.046475,264.04147,267.562508,266.883484,2.09894,8
3,0.031226,0.000104,0.004932,7.3e-05,0.911181,20,"{'min_samples_split': 0.9111806858065793, 'n_e...",2485.277753,2699.05619,2417.873075,2534.069006,119.864996,1
4,0.045896,0.001905,0.006245,8e-05,0.803161,30,"{'min_samples_split': 0.8031607218928434, 'n_e...",2485.305837,2699.709852,2416.987239,2534.000976,120.447586,2
5,0.058878,0.001851,0.007142,0.000155,0.946602,40,"{'min_samples_split': 0.9466022332961608, 'n_e...",2485.343113,2698.943137,2417.002902,2533.76305,120.085919,4
6,0.017308,0.000397,0.003536,8.4e-05,0.842464,10,"{'min_samples_split': 0.8424636553333554, 'n_e...",2485.169545,2698.56054,2416.814469,2533.514851,119.994899,5
7,0.386741,0.006986,0.01212,0.00018,0.106784,40,"{'min_samples_split': 0.10678360399740627, 'n_...",233.435417,219.716196,230.067616,227.739743,5.837722,10
8,0.097603,0.003387,0.012486,0.002029,0.693238,70,"{'min_samples_split': 0.6932376902924939, 'n_e...",2485.371211,2699.086401,2417.194993,2533.884202,120.085592,3
9,0.525183,0.010234,0.016442,0.000301,0.435826,80,"{'min_samples_split': 0.43582614812607257, 'n_...",270.214683,263.889384,267.482077,267.195381,2.590238,7


In [22]:
rfr = rcv.best_estimator_

In [23]:
y_predict = rfr.predict(X_test)
mean_squared_error(y_test, y_predict)

2891.1932447866793

In [24]:
type(y_predict)

numpy.ndarray

# Classification

In [27]:
df = pd.read_csv('~/DATA/Iris.csv', index_col=0)

In [28]:
df.shape

(150, 5)

In [29]:
df.dtypes

SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1])

# Chi Squared

In [31]:
chi_best = SelectKBest(chi2, k=3)
X_train_fs = chi_best.fit_transform(X_train, y_train)
X_test_fs = chi_best.transform(X_test)

In [34]:
X_train_fs.shape

(112, 3)

In [29]:
chi_best.pvalues_

array([3.10362864e-02, 2.13755627e-01, 5.12881988e-19, 1.76045283e-11])

In [35]:
chi_best.pvalues_ <= 0.05

array([ True, False,  True,  True])

In [38]:
X_train[X_train.columns[chi_best.pvalues_ <= 0.05]].head()

Unnamed: 0_level_0,SepalLengthCm,PetalLengthCm,PetalWidthCm
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
141,6.7,5.6,2.4
114,5.7,5.0,2.0
8,5.0,1.5,0.2
44,5.0,1.6,0.6
138,6.4,5.5,1.8


In [30]:
X_train.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
147,6.3,2.5,5.0,1.9
27,5.0,3.4,1.6,0.4
88,6.3,2.3,4.4,1.3
15,5.8,4.0,1.2,0.2
33,5.2,4.1,1.5,0.1


In [31]:
X_train_fs[:5,:]

array([[6.3, 5. , 1.9],
       [5. , 1.6, 0.4],
       [6.3, 4.4, 1.3],
       [5.8, 1.2, 0.2],
       [5.2, 1.5, 0.1]])

In [32]:
rcv = RandomizedSearchCV(RandomForestClassifier(),
                  {'n_estimators': range(10,100, 10),
                   'min_samples_split': uniform()
                  },
                  )

In [33]:
rcv = rcv.fit(X_train, y_train)



In [34]:
pd.DataFrame(data=rcv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.089775,0.002026,0.011413,0.000145,0.797978,60,"{'min_samples_split': 0.7979781689491019, 'n_e...",0.333333,0.351351,0.333333,0.339286,0.008475,6
1,0.036712,0.006261,0.005339,6.9e-05,0.0803024,20,"{'min_samples_split': 0.08030235307295386, 'n_...",0.923077,1.0,0.944444,0.955357,0.03255,2
2,0.13299,0.000605,0.015952,0.000232,0.138136,90,"{'min_samples_split': 0.13813596522113558, 'n_...",0.923077,1.0,0.916667,0.946429,0.037718,4
3,0.122691,0.020552,0.013389,0.000638,0.39023,70,"{'min_samples_split': 0.3902302126813324, 'n_e...",0.923077,1.0,0.916667,0.946429,0.037718,4
4,0.103746,0.000739,0.013049,0.000313,0.342953,70,"{'min_samples_split': 0.3429533169469444, 'n_e...",0.923077,1.0,0.972222,0.964286,0.032139,1
5,0.086185,0.00066,0.011194,0.000384,0.000305664,60,"{'min_samples_split': 0.000305664441768716, 'n...",0.923077,0.972973,0.972222,0.955357,0.023596,2
6,0.031317,0.000273,0.005265,4.9e-05,0.996255,20,"{'min_samples_split': 0.9962552440904059, 'n_e...",0.333333,0.324324,0.333333,0.330357,0.004237,8
7,0.088975,0.001877,0.011478,0.00017,0.817627,60,"{'min_samples_split': 0.8176273047366948, 'n_e...",0.333333,0.351351,0.333333,0.339286,0.008475,6
8,0.112198,0.002311,0.01391,0.000739,0.86049,80,"{'min_samples_split': 0.86048953404825, 'n_est...",0.333333,0.324324,0.333333,0.330357,0.004237,8
9,0.089002,0.002441,0.01163,0.000345,0.816013,60,"{'min_samples_split': 0.8160133572388177, 'n_e...",0.333333,0.324324,0.333333,0.330357,0.004237,8


In [35]:
rfc = rcv.best_estimator_

In [36]:
y_predict = rfc.predict(X_test)
accuracy_score(y_test, y_predict)

0.9473684210526315

# Exercise 1

Run feature selection and hyperparameter optimization with randomized search cross validation on the [crowdsource mapping](https://archive.ics.uci.edu/ml/datasets/Crowdsourced+Mapping) dataset using a SVM

In [66]:
#imports
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [42]:
# load the dataset
train_df = pd.read_csv('~/DATA/crowdsourcemapping/training.csv')
test_df = pd.read_csv('~/DATA/crowdsourcemapping/testing.csv')


In [43]:
train_df.head()

Unnamed: 0,class,max_ndvi,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,water,997.904,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,-1628.24,...,-921.193,-1043.16,-1942.49,267.138,366.608,452.238,211.328,-2203.02,-1180.19,433.906
1,water,914.198,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,-1670.59,...,-954.719,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,water,3800.81,1671.34,1206.88,449.735,1071.21,546.371,1077.84,214.564,849.599,...,1562.21,1566.16,2208.44,1056.6,385.203,300.56,293.73,2762.57,150.931,3800.81
3,water,952.178,58.0174,-1599.16,210.714,-1052.63,578.807,-1564.63,-858.39,729.79,...,-1025.88,368.622,-1786.95,-1227.8,304.621,291.336,369.214,-2202.12,600.359,-1343.55
4,water,1232.12,72.518,-1220.88,380.436,-1256.93,515.805,-1413.18,-802.942,683.254,...,-1813.95,155.624,-1189.71,-924.073,432.15,282.833,298.32,-2197.36,626.379,-826.727


In [67]:
#slice out the features
X_train = train_df.iloc[:,1:]
X_test = test_df.iloc[:,1:]

y_train = train_df['class']
y_test = test_df['class']

sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [68]:
#feature selection

chi2(X_train, y_train)

(array([367.58193535, 271.37774672, 171.87149825, 170.38374642,
        157.25618596,  83.9754743 ,  75.0607771 , 102.20121996,
        179.64352103, 102.56703646, 221.08367533,  27.73044589,
         45.41983282,  17.91542801, 115.5892247 ,  38.85080943,
         27.92164276, 135.28222524, 105.59293353,  79.88997721,
         25.1588371 , 111.53295566,  37.33652544,  66.77363509,
        226.61667447, 115.53527185,  86.52615765,  36.69009579]),
 array([2.86408951e-77, 1.41593489e-56, 2.90911725e-35, 6.04261159e-35,
        3.80401260e-32, 1.23431381e-16, 9.03518416e-15, 1.81540989e-20,
        6.37636839e-37, 1.51992176e-20, 8.70609552e-46, 4.10915602e-05,
        1.19193687e-08, 3.05428578e-03, 2.69485698e-23, 2.54482563e-07,
        3.77052372e-05, 1.79933784e-27, 3.49410686e-21, 8.84857170e-16,
        1.29830418e-04, 1.94302494e-22, 5.12747475e-07, 4.80150720e-13,
        5.67946563e-47, 2.76663909e-23, 3.60240337e-17, 6.91024593e-07]))

In [73]:
rcv = RandomizedSearchCV(
    SVC(),
    {'C': uniform(loc=1, scale=100)},
    n_iter=1
)
rcv.fit(X_train, y_train)



RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                 coef0=0.0, decision_function_shape='ovr',
                                 degree=3, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, probability=False,
                                 random_state=None, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='warn', n_iter=1, n_jobs=None,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7e31a82f28>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [75]:
svc = rcv.best_estimator_
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        farm       0.59      0.68      0.63        53
      forest       0.48      0.76      0.58        78
       grass       0.46      0.36      0.41        36
  impervious       0.80      0.93      0.86        40
     orchard       1.00      0.11      0.19        47
       water       0.97      0.76      0.85        46

    accuracy                           0.62       300
   macro avg       0.72      0.60      0.59       300
weighted avg       0.70      0.62      0.59       300



# Exercise 2

Run feature selection and hyperparameter optimization with randomized search cross validation on the Air Quality dataset using a SVM

In [None]:
#imports

In [None]:
# load the dataset
df = pd.read_csv(#fill in the path to file)


In [None]:
#slice out the features
X_train = train_df[]
X_test = test_df[]

y_train = train_df[]
y_test = test_df[]

In [None]:
#feature selection

In [None]:
rcv = RandomizedSearchCV(
#regressor
#dict containing parameters and ranges
)

In [None]:
svc = rcv.best_estimator_
y_pred = svc.predict(X_test)
mean_squared_error(y_test, y_pred)