In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression, f_regression
from sklearn.ensemble import RandomForestRegressor

In [5]:
#EasyPark data
data = pd.read_csv('easypark_data.csv')
data = data[data.columns.values[2:]]

data.head()

Unnamed: 0,Smart Parking,Car Sharing Services,Traffic,Public Transport,E-Charge Spots,Infrastructure Investment,Clean Energy,Smart Building,Waste Disposal,Environment Protection,...,Blockchain ecosystem,4G LTE,Internet speed,Wifi Hotspots,Smartphone Penetration,Cyber security,Living Standard,Expert Perception,Rank/Score,IMD Smart City Index
0,8.22,2.06,8.71,4.18,2.51,7.88,8.54,6.75,8.25,8.71,...,1.31,8.26,7.96,1.91,9.69,7.2,8.11,6.57,6.94,
1,8.5,3.34,8.71,2.44,2.59,7.88,8.54,6.75,8.25,7.66,...,1.31,6.14,5.08,2.36,9.69,7.2,7.2,6.57,6.42,
2,6.44,4.93,9.02,9.02,6.37,8.03,1.15,2.36,2.6,2.74,...,2.17,5.69,1.98,6.75,9.31,2.13,6.22,9.47,5.6,56.0
3,5.13,3.27,6.67,9.55,5.76,7.5,2.77,6.29,4.2,2.59,...,2.64,6.45,2.36,4.93,4.97,9.62,7.58,9.16,6.07,
4,9.91,8.87,8.87,3.34,8.26,5.99,1.92,6.82,8.93,4.93,...,9.22,8.18,6.75,6.97,6.49,7.28,7.43,10.0,7.55,11.0


## Mutual Information Based Feature selection

In [8]:
X = data[data.columns.values[:-3]].values
expert_Y = data[data.columns.values[-3]]

In [10]:
#Information gain based feature selection
mi = mutual_info_regression(X, expert_Y)

In [12]:
mi_per_feature = pd.Series(mi, index=data.columns.values[:-3])
mi_per_feature

Smart Parking                      0.057199
Car Sharing Services               0.069978
Traffic                            0.000909
Public Transport                   0.001506
E-Charge Spots                     0.165223
Infrastructure Investment          0.000000
Clean Energy                       0.028427
Smart Building                     0.096206
Waste Disposal                     0.042309
Environment Protection             0.138916
Environmental Performance Index    0.060235
Citizen Participation              0.197423
Digitalization of Government       0.042725
Urban Planning                     0.000000
Education                          0.050339
Business Ecosystem                 0.051974
Blockchain ecosystem               0.043195
4G LTE                             0.000000
Internet speed                     0.000000
Wifi Hotspots                      0.128628
Smartphone Penetration             0.163162
Cyber security                     0.116341
Living Standard                 

In [15]:
mi_per_feature[mi_per_feature>0.1]

E-Charge Spots            0.165223
Environment Protection    0.138916
Citizen Participation     0.197423
Wifi Hotspots             0.128628
Smartphone Penetration    0.163162
Cyber security            0.116341
dtype: float64

## F test based feature selection

In [18]:
#Information gain based feature selection
f_score = f_regression(X, expert_Y)

In [19]:
f_score

(array([9.70188433e-01, 1.51422069e+01, 6.69467939e+00, 6.98686251e-01,
        1.49805772e+01, 7.04729066e-03, 3.01966248e+00, 3.28020817e+00,
        1.49809184e+00, 1.95683076e+01, 7.61995575e+00, 1.71219701e-01,
        3.47701769e-01, 4.11909273e+00, 1.68023005e+01, 1.13445774e+01,
        2.43170793e+00, 5.71520498e-01, 1.57927023e-01, 1.55952242e+01,
        5.13141735e-01, 3.83933876e-01, 7.80151800e-03]),
 array([3.27058772e-01, 1.81874750e-04, 1.11381183e-02, 4.05258872e-01,
        1.95874809e-04, 9.33268908e-01, 8.54019180e-02, 7.31835770e-02,
        2.23899676e-01, 2.51694284e-05, 6.89118362e-03, 6.79934247e-01,
        5.56774043e-01, 4.51128509e-02, 8.56164113e-05, 1.08272765e-03,
        1.22126933e-01, 4.51468727e-01, 6.91936509e-01, 1.47856372e-04,
        4.75485567e-01, 5.36943347e-01, 9.29797658e-01]))

## Random Forest Regressor feature importance test

In [28]:
reg = RandomForestRegressor()
reg.fit(X, expert_Y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [29]:
feature_importance = pd.Series(reg.feature_importances_, index=data.columns.values[:-3])
feature_importance[feature_importance > 0.05]

Car Sharing Services      0.065866
E-Charge Spots            0.103058
Environment Protection    0.089759
Urban Planning            0.051025
Wifi Hotspots             0.081438
Living Standard           0.053054
dtype: float64