In [1]:
import pandas as pd

# Missing Value Ratio

In [2]:
# Load the dataset used to demonstrate the missing-value-ratio filter and preview its first rows.
df = pd.read_csv("Dataset/Dimensionality_Reduction/missing_value_ratio.csv")
df.head()

Unnamed: 0,ID,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,AB101,1.0,0.0,0.0,1.0,9.84,14.395,81.0,,16
1,AB102,1.0,,0.0,,9.02,13.635,80.0,,40
2,AB103,1.0,0.0,,1.0,9.02,13.635,80.0,,32
3,AB104,,0.0,,1.0,9.84,14.395,75.0,,13
4,AB105,1.0,,0.0,,9.84,14.395,,16.9979,1


In [7]:
# Share of missing entries per column, expressed as a percentage with two decimals.
null_stats = df.isnull().sum().div(df.shape[0]).mul(100).round(2)
null_stats

ID             0.00
season         0.07
holiday       48.50
workingday     0.07
weather        0.03
temp           0.00
atemp          0.00
humidity       0.04
windspeed     41.02
count          0.00
dtype: float64

> 40% missing values is still not a huge number; we could handle them, but for demonstration purposes we will proceed with this dataset.

In [16]:
# Names of the features whose missing-value percentage is at or below the threshold.
null_threshold = 40
# Select by a label-aligned boolean mask rather than null_stats[<int>]: null_stats is
# indexed by column name, so an integer key only works through pandas' deprecated
# positional fallback. The mask keeps the original column order.
variables_below_threshold_list = null_stats[null_stats <= null_threshold].index.tolist()

In [17]:
# Display the columns retained by the missing-value-ratio filter.
variables_below_threshold_list

['ID', 'season', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'count']

In [19]:
# Build a reduced frame holding only the columns that passed the missing-value filter.
new_df = df.loc[:, variables_below_threshold_list]
new_df.head()

Unnamed: 0,ID,season,workingday,weather,temp,atemp,humidity,count
0,AB101,1.0,0.0,1.0,9.84,14.395,81.0,16
1,AB102,1.0,0.0,,9.02,13.635,80.0,40
2,AB103,1.0,,1.0,9.02,13.635,80.0,32
3,AB104,,,1.0,9.84,14.395,75.0,13
4,AB105,1.0,0.0,,9.84,14.395,,1


In [20]:
# Missing-value percentage per column of the reduced frame, rounded to two decimals.
new_null_stats = new_df.isnull().sum().div(new_df.shape[0]).mul(100).round(2)
new_null_stats

ID            0.00
season        0.07
workingday    0.07
weather       0.03
temp          0.00
atemp         0.00
humidity      0.04
count         0.00
dtype: float64

In [21]:
# Compare (rows, columns) of the original frame and the frame after the missing-value filter.
df.shape, new_df.shape

((12980, 10), (12980, 8))

# Low Variance Filter

In [34]:
import pandas as pd
from sklearn.preprocessing import normalize

In [35]:
# Load the dataset used to demonstrate the low-variance filter and preview its first rows.
# Note: this rebinds `df`, replacing the frame loaded in the previous section.
df = pd.read_csv("Dataset/Dimensionality_Reduction/low_variance_filter.csv")
df.head()

Unnamed: 0,ID,temp,atemp,humidity,windspeed,count
0,AB101,9.84,14.395,81,0.0,16
1,AB102,9.02,13.635,80,0.0,40
2,AB103,9.02,13.635,80,0.0,32
3,AB104,9.84,14.395,75,0.0,13
4,AB105,9.84,14.395,75,0.0,1


In [36]:
# Missing-value percentage per feature, rounded to two decimals.
# Nulls have already been handled in this dataset so that it can demonstrate the variance concept.
null_stats = df.isnull().sum().div(df.shape[0]).mul(100).round(2)
null_stats

ID           0.0
temp         0.0
atemp        0.0
humidity     0.0
windspeed    0.0
count        0.0
dtype: float64

In [37]:
# ID is only a row identifier and carries no information for a model, so it is dropped.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: executing it a second time no longer raises KeyError for the missing column.
df = df.drop(columns=["ID"], errors="ignore")

In [38]:
# Normalize the data so that variances are comparable across features.
# The result is bound to its own name: assigning it to `normalize` would shadow the
# imported sklearn function and make this cell fail ("ndarray is not callable") on re-run.
# NOTE(review): with its defaults, sklearn's normalize appears to rescale each row (sample)
# to unit norm rather than each column — confirm this is the intended scaling.
normalized_values = normalize(df)
df_normalized = pd.DataFrame(normalized_values, columns=df.columns)
df_normalized.head()

Unnamed: 0,temp,atemp,humidity,windspeed,count
0,0.116607,0.170585,0.959872,0.0,0.189604
1,0.099203,0.14996,0.87985,0.0,0.439925
2,0.102851,0.155473,0.912202,0.0,0.364881
3,0.126009,0.184339,0.960431,0.0,0.166475
4,0.127781,0.186932,0.97394,0.0,0.012986


In [40]:
# Summary statistics of the normalized features.
df_normalized.describe()

Unnamed: 0,temp,atemp,humidity,windspeed,count
count,12980.0,12980.0,12980.0,12980.0,12980.0
mean,0.133424,0.157703,0.484986,0.098251,0.69599
std,0.076665,0.089314,0.305763,0.093572,0.33463
min,0.002577,0.0,0.0,0.0,0.009497
25%,0.075355,0.089242,0.205452,0.036109,0.424225
50%,0.11601,0.138768,0.417778,0.073078,0.872426
75%,0.17495,0.20753,0.805784,0.132569,0.964887
max,0.5581,0.658868,0.991642,0.751237,0.998711


In [43]:
# Variance of each normalized feature (one value per column); these values are what the
# low-variance filter below compares against its threshold.
df_variance = df_normalized.var()
df_variance

temp         0.005877
atemp        0.007977
humidity     0.093491
windspeed    0.008756
count        0.111977
dtype: float64

In [46]:
# Names of the features whose variance (on the normalized data) is at or above the threshold.
variance_threshold = 0.006
# Select by a label-aligned boolean mask rather than df_variance[<int>]: df_variance is
# indexed by column name, so an integer key only works through pandas' deprecated
# positional fallback. The mask keeps the original column order.
variables_above_threshold_list = df_variance[df_variance >= variance_threshold].index.tolist()

In [47]:
# Display the columns retained by the low-variance filter.
variables_above_threshold_list

['atemp', 'humidity', 'windspeed', 'count']

In [49]:
# Build a reduced frame (original, un-normalized values) with only the columns that passed the variance filter.
new_df = df.loc[:, variables_above_threshold_list]
new_df.head()

Unnamed: 0,atemp,humidity,windspeed,count
0,14.395,81,0.0,16
1,13.635,80,0.0,40
2,13.635,80,0.0,32
3,14.395,75,0.0,13
4,14.395,75,0.0,1


In [50]:
# Compare (rows, columns) of the frame before and after the low-variance filter.
df.shape, new_df.shape

((12980, 5), (12980, 4))

# High Correlation Filter

In [51]:
import numpy as np
import pandas as pd

In [53]:
# Load the dataset used to demonstrate the high-correlation filter and preview its first rows.
# NOTE(review): "fllter" in the file name looks like a typo; it is left untouched because
# it must match the file on disk — confirm the actual file name.
df = pd.read_csv("Dataset/Dimensionality_Reduction/high_correlation_fllter.csv")
df.head()

Unnamed: 0,ID,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,AB101,1,0,0,1,9.84,14.395,81,0.0,16
1,AB102,1,0,0,1,9.02,13.635,80,0.0,40
2,AB103,1,0,0,1,9.02,13.635,80,0.0,32
3,AB104,1,0,0,1,9.84,14.395,75,0.0,13
4,AB105,1,0,0,1,9.84,14.395,75,0.0,1


In [54]:
# Missing-value percentage per feature, rounded to two decimals.
# Nulls have already been handled in this dataset so that it can demonstrate the correlation concept.
null_stats = df.isnull().sum().div(df.shape[0]).mul(100).round(2)
null_stats

ID            0.0
season        0.0
holiday       0.0
workingday    0.0
weather       0.0
temp          0.0
atemp         0.0
humidity      0.0
windspeed     0.0
count         0.0
dtype: float64

In [63]:
# Correlation is computed between the input features only, so the target ('count') is dropped first.
x_df = df.drop('count', axis=1)

# Pairwise correlation between each pair of features.
# x_df still contains the string-valued ID column; numeric columns are selected explicitly
# because DataFrame.corr() no longer silently skips non-numeric columns in pandas >= 2.0
# (it raises instead).
corr_matrix = x_df.select_dtypes(include="number").corr()
# Turn the feature-name index into a regular 'index' column for the reshaping done later.
corr_matrix = corr_matrix.reset_index()
corr_matrix

Unnamed: 0,index,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,season,1.0,-0.010959,0.014343,-0.013005,0.39456,0.397765,0.181712,-0.135762
1,holiday,-0.010959,1.0,-0.248558,-0.018406,-0.025104,-0.032903,-0.02952,0.021646
2,workingday,0.014343,-0.248558,1.0,0.052788,0.060589,0.06484,0.028026,0.001986
3,weather,-0.013005,-0.018406,0.052788,1.0,-0.093655,-0.094877,0.432497,0.01112
4,temp,0.39456,-0.025104,0.060589,-0.093655,1.0,0.991839,-0.048478,-0.008669
5,atemp,0.397765,-0.032903,0.06484,-0.094877,0.991839,1.0,-0.031606,-0.049997
6,humidity,0.181712,-0.02952,0.028026,0.432497,-0.048478,-0.031606,1.0,-0.296975
7,windspeed,-0.135762,0.021646,0.001986,0.01112,-0.008669,-0.049997,-0.296975,1.0


In [67]:
# Reshape the wide correlation matrix into long form: one row per (feature, feature) pair.
# 'index' identifies the row feature; every remaining column is melted into 'variable' /
# 'correlation'. The earlier bare reset_index() call was dropped because its result was
# discarded, and value_vars is left at its default so the id column is not also passed
# as a value column.
corr_matrix.melt(id_vars='index', value_name='correlation')

Unnamed: 0,index,variable,correlation
0,season,season,1.000000
1,holiday,season,-0.010959
2,workingday,season,0.014343
3,weather,season,-0.013005
4,temp,season,0.394560
...,...,...,...
59,weather,windspeed,0.011120
60,temp,windspeed,-0.008669
61,atemp,windspeed,-0.049997
62,humidity,windspeed,-0.296975


In [61]:
# Displays corr_matrix with its index moved into a column; the result is not assigned,
# so corr_matrix itself is unchanged by this cell.
# NOTE(review): this cell's execution count (In [61]) precedes the cells above it, so it
# appears to be a leftover from before corr_matrix was reset. Run in order it would
# presumably add an extra index column — confirm whether the cell can be removed.
corr_matrix.reset_index()


Unnamed: 0,index,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,season,1.0,-0.010959,0.014343,-0.013005,0.39456,0.397765,0.181712,-0.135762
1,holiday,-0.010959,1.0,-0.248558,-0.018406,-0.025104,-0.032903,-0.02952,0.021646
2,workingday,0.014343,-0.248558,1.0,0.052788,0.060589,0.06484,0.028026,0.001986
3,weather,-0.013005,-0.018406,0.052788,1.0,-0.093655,-0.094877,0.432497,0.01112
4,temp,0.39456,-0.025104,0.060589,-0.093655,1.0,0.991839,-0.048478,-0.008669
5,atemp,0.397765,-0.032903,0.06484,-0.094877,0.991839,1.0,-0.031606,-0.049997
6,humidity,0.181712,-0.02952,0.028026,0.432497,-0.048478,-0.031606,1.0,-0.296975
7,windspeed,-0.135762,0.021646,0.001986,0.01112,-0.008669,-0.049997,-0.296975,1.0
