In [1]:
import numpy as np
import pandas as pd
import sweetviz
from feature_engine.outliers import Winsorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
# Load the bicycle-count CSV (path is relative to the notebook's working
# directory) and preview the first rows.
DATA_PATH = 'nyc-east-river-bicycle-counts.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Day,High Temp (°F),Low Temp (°F),Precipitation,Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total
0,0,2016-04-01 00:00:00,2016-04-01 00:00:00,78.1,66.0,0.01,1704.0,3126,4115.0,2552.0,11497
1,1,2016-04-02 00:00:00,2016-04-02 00:00:00,55.0,48.9,0.15,827.0,1646,2565.0,1884.0,6922
2,2,2016-04-03 00:00:00,2016-04-03 00:00:00,39.9,34.0,0.09,526.0,1232,1695.0,1306.0,4759
3,3,2016-04-04 00:00:00,2016-04-04 00:00:00,44.1,33.1,0.47 (S),521.0,1067,1440.0,1307.0,4335
4,4,2016-04-05 00:00:00,2016-04-05 00:00:00,42.1,26.1,0,1416.0,2617,3081.0,2357.0,9471


In [3]:
# EDA: print column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           210 non-null    int64  
 1   Date                 210 non-null    object 
 2   Day                  210 non-null    object 
 3   High Temp (°F)       210 non-null    float64
 4   Low Temp (°F)        210 non-null    float64
 5   Precipitation        210 non-null    object 
 6   Brooklyn Bridge      210 non-null    float64
 7   Manhattan Bridge     210 non-null    int64  
 8   Williamsburg Bridge  210 non-null    float64
 9   Queensboro Bridge    210 non-null    float64
 10  Total                210 non-null    int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 18.2+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,High Temp (°F),Low Temp (°F),Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total
count,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0
mean,104.5,60.58,46.413333,2269.633333,4049.533333,4862.466667,3352.866667,14534.5
std,60.765944,11.183223,9.522796,981.237786,1704.731356,1814.039499,1099.254419,5569.173496
min,0.0,39.9,26.1,504.0,997.0,1440.0,1306.0,4335.0
25%,52.25,55.0,44.1,1447.0,2617.0,3282.0,2457.0,9596.0
50%,104.5,62.1,46.9,2379.5,4165.0,5194.0,3477.0,15292.5
75%,156.75,68.0,50.0,3147.0,5309.0,6030.0,4192.0,18315.0
max,209.0,81.0,66.0,3871.0,6951.0,7834.0,5032.0,23318.0


In [5]:
# Split the frame into target (y) and predictors (X).
# The row-counter column, the two date columns and the target itself are
# removed from the predictors.
# NOTE(review): in the rows previewed above, Total equals the sum of the four
# bridge columns, so the bridge counts determine the target exactly -- confirm
# this is intended before fitting a model on X.
y = df['Total']
X = df.drop(columns=['Unnamed: 0', 'Date', 'Day', 'Total'])

In [6]:
# Count missing values per predictor column.
# String placeholders in Precipitation (e.g. 'T', '0.47 (S)') are not NaN,
# so they are not counted here.
X.isna().sum()

High Temp (°F)         0
Low Temp (°F)          0
Precipitation          0
Brooklyn Bridge        0
Manhattan Bridge       0
Williamsburg Bridge    0
Queensboro Bridge      0
dtype: int64

In [7]:
# Flag rows whose predictor values repeat an earlier row (True = duplicate).
# NOTE(review): the last rows shown are all flagged; X.duplicated().sum()
# would give the total count if that needs to be quantified.
X.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
205     True
206     True
207     True
208     True
209     True
Length: 210, dtype: bool

In [8]:
# Partition the predictor column names by dtype: object columns are treated
# as categorical, every other column as numeric.
categorical_columns = X.select_dtypes(include='object').columns
numeric_features = X.select_dtypes(exclude='object').columns


In [9]:
# Data preprocessing -- outlier handling.
# Cap each listed numeric column at the IQR fences (quartile -/+ 1.5 * IQR)
# on both tails.
variables = [
    'High Temp (°F)',
    'Low Temp (°F)',
    'Brooklyn Bridge',
    'Manhattan Bridge',
    'Williamsburg Bridge',
    'Queensboro Bridge',
]

winsor = Winsorizer(
    variables=variables,
    capping_method='iqr',
    tail='both',
    fold=1.5,
)


In [10]:
# Single-step pipelines: the winsorizer configured earlier for outlier
# capping, and mean imputation for the numeric columns.
outlier_pipeline = Pipeline([('winsor', winsor)])
num_pipeline = Pipeline([('Impute', SimpleImputer(strategy='mean'))])

In [11]:
# One-hot encoding pipeline for the categorical columns.
# drop='first' omits the first category of each column; sparse_output=False
# makes the encoder return a dense array. OneHotEncoder is imported in the
# notebook's import cell.
encode_pipeline = Pipeline(steps=[('OneHot', OneHotEncoder(sparse_output=False, drop='first'))])

In [12]:
# Route only the numeric columns through the imputation pipeline.
num_preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_features),
    ]
)

In [13]:
# Route only the numeric columns through the outlier-capping pipeline.
win_preprocessor = ColumnTransformer(
    transformers=[
        ('wins', outlier_pipeline, numeric_features),
    ]
)

In [14]:
# Route only the object-typed columns through the one-hot pipeline.
encode_preprocessor = ColumnTransformer(
    transformers=[
        ('encode', encode_pipeline, categorical_columns),
    ]
)

In [15]:
# Fit the mean imputer on X's numeric columns and wrap the returned array in a
# DataFrame carrying the numeric column labels (Precipitation is not included).
impute_data = num_preprocessor.fit_transform(X)
df1 = pd.DataFrame(impute_data, columns = numeric_features)
df1

Unnamed: 0,High Temp (°F),Low Temp (°F),Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge
0,78.1,66.0,1704.0,3126.0,4115.0,2552.0
1,55.0,48.9,827.0,1646.0,2565.0,1884.0
2,39.9,34.0,526.0,1232.0,1695.0,1306.0
3,44.1,33.1,521.0,1067.0,1440.0,1307.0
4,42.1,26.1,1416.0,2617.0,3081.0,2357.0
...,...,...,...,...,...,...
205,60.1,46.9,1997.0,3520.0,4559.0,2929.0
206,62.1,46.9,3343.0,5606.0,6577.0,4388.0
207,57.9,48.0,2486.0,4152.0,5336.0,3657.0
208,57.0,46.9,2375.0,4178.0,5053.0,3348.0


In [16]:
# Cap outliers in the imputed frame and restore df1's column labels on the result.
wins_data = win_preprocessor.fit_transform(df1)
df2 = pd.DataFrame(wins_data, columns = df1.columns)
df2

Unnamed: 0,High Temp (°F),Low Temp (°F),Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge
0,78.1,58.85,1704.0,3126.0,4115.0,2552.0
1,55.0,48.90,827.0,1646.0,2565.0,1884.0
2,39.9,35.25,526.0,1232.0,1695.0,1306.0
3,44.1,35.25,521.0,1067.0,1440.0,1307.0
4,42.1,35.25,1416.0,2617.0,3081.0,2357.0
...,...,...,...,...,...,...
205,60.1,46.90,1997.0,3520.0,4559.0,2929.0
206,62.1,46.90,3343.0,5606.0,6577.0,4388.0
207,57.9,48.00,2486.0,4152.0,5336.0,3657.0
208,57.0,46.90,2375.0,4178.0,5053.0,3348.0


In [17]:
# One-hot encode the object-typed column(s) of X.
# NOTE(review): Precipitation looks like a numeric measurement with a few
# annotated strings ('T', '0.47 (S)'); encoding every distinct value as its
# own category discards the ordering -- consider parsing it to a number.
encode_data = encode_preprocessor.fit_transform(X[categorical_columns])

# Recover the generated dummy-column names and wrap the array in a DataFrame
columns = encode_preprocessor.get_feature_names_out(categorical_columns)
df3 = pd.DataFrame(encode_data, columns=columns)
df3

Unnamed: 0,encode__Precipitation_0.01,encode__Precipitation_0.05,encode__Precipitation_0.09,encode__Precipitation_0.15,encode__Precipitation_0.16,encode__Precipitation_0.2,encode__Precipitation_0.24,encode__Precipitation_0.47 (S),encode__Precipitation_T
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
205,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Min-max scaling for the numeric columns (rescales each column to [0, 1]).
# MinMaxScaler is imported in the notebook's import cell.
scale_pipeline = Pipeline(steps=[('minmax', MinMaxScaler())])
scale_preprocessor = ColumnTransformer(transformers=[('minmax', scale_pipeline, numeric_features)])

In [19]:
# Min-max scale the winsorized numeric columns and keep df2's column labels.
scale_data = scale_preprocessor.fit_transform(df2)
df4 = pd.DataFrame(scale_data, columns = df2.columns)
df4

Unnamed: 0,High Temp (°F),Low Temp (°F),Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge
0,0.929440,1.000000,0.356400,0.357575,0.418361,0.334407
1,0.367397,0.578390,0.095931,0.109002,0.175946,0.155126
2,0.000000,0.000000,0.006534,0.039469,0.039881,0.000000
3,0.102190,0.000000,0.005049,0.011757,0.000000,0.000268
4,0.053528,0.000000,0.270864,0.272086,0.256647,0.282072
...,...,...,...,...,...,...
205,0.491484,0.493644,0.443421,0.423749,0.487801,0.435588
206,0.540146,0.493644,0.843184,0.774101,0.803409,0.827160
207,0.437956,0.540254,0.588655,0.529896,0.609321,0.630972
208,0.416058,0.493644,0.555688,0.534263,0.565061,0.548041


In [20]:
# Place the scaled numeric columns and the one-hot columns side by side;
# rows are matched on the index.
clean_df = pd.concat((df4, df3), axis='columns')
clean_df


Unnamed: 0,High Temp (°F),Low Temp (°F),Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,encode__Precipitation_0.01,encode__Precipitation_0.05,encode__Precipitation_0.09,encode__Precipitation_0.15,encode__Precipitation_0.16,encode__Precipitation_0.2,encode__Precipitation_0.24,encode__Precipitation_0.47 (S),encode__Precipitation_T
0,0.929440,1.000000,0.356400,0.357575,0.418361,0.334407,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.367397,0.578390,0.095931,0.109002,0.175946,0.155126,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.006534,0.039469,0.039881,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.102190,0.000000,0.005049,0.011757,0.000000,0.000268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.053528,0.000000,0.270864,0.272086,0.256647,0.282072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,0.491484,0.493644,0.443421,0.423749,0.487801,0.435588,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
206,0.540146,0.493644,0.843184,0.774101,0.803409,0.827160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207,0.437956,0.540254,0.588655,0.529896,0.609321,0.630972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208,0.416058,0.493644,0.555688,0.534263,0.565061,0.548041,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Data analysis: skewness of each numeric feature.
# NOTE(review): this reads the raw X, not the winsorized/scaled frame; use
# clean_df[numeric_features] if post-processing statistics are what is wanted.
X[numeric_features].skew()

High Temp (°F)        -0.175504
Low Temp (°F)         -0.037792
Brooklyn Bridge       -0.315446
Manhattan Bridge      -0.108822
Williamsburg Bridge   -0.258221
Queensboro Bridge     -0.291826
dtype: float64

In [22]:
# Kurtosis of each numeric feature of X.
X[numeric_features].kurt()

High Temp (°F)        -0.800185
Low Temp (°F)         -0.052755
Brooklyn Bridge       -0.999362
Manhattan Bridge      -0.899696
Williamsburg Bridge   -0.828972
Queensboro Bridge     -0.947052
dtype: float64

In [23]:
# Pairwise correlation between the numeric features of X.
X[numeric_features].corr()

Unnamed: 0,High Temp (°F),Low Temp (°F),Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge
High Temp (°F),1.0,0.823853,0.739377,0.720175,0.764336,0.727825
Low Temp (°F),0.823853,1.0,0.46455,0.465097,0.534212,0.475725
Brooklyn Bridge,0.739377,0.46455,1.0,0.983148,0.980463,0.976991
Manhattan Bridge,0.720175,0.465097,0.983148,1.0,0.989805,0.985867
Williamsburg Bridge,0.764336,0.534212,0.980463,0.989805,1.0,0.988852
Queensboro Bridge,0.727825,0.475725,0.976991,0.985867,0.988852,1.0


In [24]:
# Profile the raw frame with sweetviz and write the report to report.html.
auto_df = sweetviz.analyze(df)
auto_df.show_html('report.html')

                                             |                                             | [  0%]   00:00 ->…

Report report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [25]:
# Profile the pre-processed frame with sweetviz.
# The report goes to its own file so it does not overwrite the raw-data
# report.html written by the previous cell.
auto_df = sweetviz.analyze(clean_df)
auto_df.show_html('report_clean.html')

                                             |                                             | [  0%]   00:00 ->…

Report report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
