# 14 Scaling and Binning Numerical Data

Required modules:
- pip install pandas
- pip install numpy
- pip install missingno
- pip install scipy
- pip install scikit-learn

In [1]:
import pandas as pd
import numpy as np

# Input file: the sampled selection is the default. TODO: record its load time.
file = "../data/nyc_yellow_taxi_trip_selection.csv"
#file = "../data/nyc_yellow_taxi_trip_records_from_Jan_to_Aug_2023.csv" # Huge takes almost 3 minutes to load

# Parse both timestamp columns as datetimes while reading.
df = pd.read_csv(file, parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"] )
#df = pd.read_csv(file, usecols=["VendorID","tpep_pickup_datetime","tpep_dropoff_datetime","passenger_count","trip_distance","RatecodeID","PULocationID","DOLocationID","tip_amount","total_amount"], parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"] )

# Turn any literal "NaN" strings into real missing values.
df = df.replace("NaN", np.nan)

# Fill missing passenger counts with the column median, in a new column.
# Assign the result directly: calling fillna(inplace=True) on df["col"] is
# chained assignment, which warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df["passenger_count_corrected"] = df["passenger_count"].fillna(df["passenger_count"].median())

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected
0,9194462,9194462,2,2023-04-09 17:39:57,2023-04-09 18:13:31,2.0,9.81,1.0,N,132,...,0.0,0.5,3.0,0.0,1.0,51.25,0.0,1.75,,2.0
1,16987466,16987466,2,2023-01-26 14:47:44,2023-01-26 14:53:07,1.0,0.99,1.0,N,170,...,0.0,0.5,2.24,0.0,1.0,13.44,2.5,,0.0,1.0
2,6908899,6908899,2,2023-04-30 01:08:10,2023-04-30 01:20:46,2.0,2.85,1.0,N,114,...,1.0,0.5,4.12,0.0,1.0,24.72,2.5,0.0,,2.0
3,4617323,4617323,1,2023-05-19 17:10:40,2023-05-19 17:23:00,1.0,1.8,1.0,N,229,...,5.0,0.5,3.7,0.0,1.0,22.3,2.5,0.0,,1.0
4,14262454,14262454,2,2023-02-21 21:34:26,2023-02-21 21:38:01,1.0,1.23,1.0,N,48,...,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,,1.0


## Remove extreme values (outliers)


In [None]:
from scipy import stats  # NOTE(review): unused in this cell; kept in case other cells rely on it

# Trim the tails of total_amount: rows at or below the 1st percentile and rows
# at or above the 99.99th percentile are dropped (both comparisons are strict).
q_low = df["total_amount"].quantile(0.01)
q_hi  = df["total_amount"].quantile(0.9999)

# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
# Caveat: this cell rebinds df, so re-running it trims the tails again.
df = df[(df["total_amount"] < q_hi) & (df["total_amount"] > q_low)].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 989697 entries, 0 to 999999
Data columns (total 23 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Unnamed: 0.1               989697 non-null  int64         
 1   Unnamed: 0                 989697 non-null  int64         
 2   VendorID                   989697 non-null  int64         
 3   tpep_pickup_datetime       989697 non-null  datetime64[ns]
 4   tpep_dropoff_datetime      989697 non-null  datetime64[ns]
 5   passenger_count            962685 non-null  float64       
 6   trip_distance              989697 non-null  float64       
 7   RatecodeID                 962685 non-null  float64       
 8   store_and_fwd_flag         962685 non-null  object        
 9   PULocationID               989697 non-null  int64         
 10  DOLocationID               989697 non-null  int64         
 11  payment_type               989697 non-null  int64        

## Min-Max Scaling with: 
$$x' = \frac{x - \text{min}(x)}{\text{max}(x)-\text{min}(x)}$$

In [None]:
# Min-max scaling: shift by the minimum and divide by the value range, so the
# smallest fare maps to 0 and the largest to 1.
amount = df["total_amount"]
lowest, highest = amount.min(), amount.max()
df["total_amount_minmax"] = (amount - lowest) / (highest - lowest)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected,total_amount_minmax
0,9194462,9194462,2,2023-04-09 17:39:57,2023-04-09 18:13:31,2.0,9.81,1.0,N,132,...,0.5,3.0,0.0,1.0,51.25,0.0,1.75,,2.0,0.144661
1,16987466,16987466,2,2023-01-26 14:47:44,2023-01-26 14:53:07,1.0,0.99,1.0,N,170,...,0.5,2.24,0.0,1.0,13.44,2.5,,0.0,1.0,0.027639
2,6908899,6908899,2,2023-04-30 01:08:10,2023-04-30 01:20:46,2.0,2.85,1.0,N,114,...,0.5,4.12,0.0,1.0,24.72,2.5,0.0,,2.0,0.06255
3,4617323,4617323,1,2023-05-19 17:10:40,2023-05-19 17:23:00,1.0,1.8,1.0,N,229,...,0.5,3.7,0.0,1.0,22.3,2.5,0.0,,1.0,0.05506
4,14262454,14262454,2,2023-02-21 21:34:26,2023-02-21 21:38:01,1.0,1.23,1.0,N,48,...,0.5,2.44,0.0,1.0,14.64,2.5,0.0,,1.0,0.031353


## Z-Score with
$$x' = \frac{x - \bar{x}}{\sigma}$$

In [None]:
# Z-score: centre on the mean, then divide by the standard deviation.
# Series.std() uses the sample standard deviation (ddof=1) by default.
amount = df["total_amount"]
centered = amount - amount.mean()
df["total_amount_z"] = centered / amount.std()
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected,total_amount_minmax,total_amount_z
0,9194462,9194462,2,2023-04-09 17:39:57,2023-04-09 18:13:31,2.0,9.81,1.0,N,132,...,3.0,0.0,1.0,51.25,0.0,1.75,,2.0,0.144661,1.02723
1,16987466,16987466,2,2023-01-26 14:47:44,2023-01-26 14:53:07,1.0,0.99,1.0,N,170,...,2.24,0.0,1.0,13.44,2.5,,0.0,1.0,0.027639,-0.679782
2,6908899,6908899,2,2023-04-30 01:08:10,2023-04-30 01:20:46,2.0,2.85,1.0,N,114,...,4.12,0.0,1.0,24.72,2.5,0.0,,2.0,0.06255,-0.170523
3,4617323,4617323,1,2023-05-19 17:10:40,2023-05-19 17:23:00,1.0,1.8,1.0,N,229,...,3.7,0.0,1.0,22.3,2.5,0.0,,1.0,0.05506,-0.279779
4,14262454,14262454,2,2023-02-21 21:34:26,2023-02-21 21:38:01,1.0,1.23,1.0,N,48,...,2.44,0.0,1.0,14.64,2.5,0.0,,1.0,0.031353,-0.625605


In [None]:
# Fixed-width price bands on the min-max scaled fare (edges are on the 0..1 scale).
bin_names = ["cheap", "medium", "high", "expensive"]

# pd.cut builds right-closed intervals by default, i.e. (0, 0.1], (0.1, 0.2], ...
# The cheapest trip scales to exactly 0.0 and would fall outside the first
# interval, ending up as NaN. include_lowest=True closes that first interval
# on the left so the minimum is labelled "cheap".
df["price_range"] = pd.cut(
    df.total_amount_minmax,
    [0, 0.1, 0.2, 0.3, 1],
    labels=bin_names,
    include_lowest=True,
)
df.head(100)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected,total_amount_minmax,total_amount_z,price_range
0,9194462,9194462,2,2023-04-09 17:39:57,2023-04-09 18:13:31,2.0,9.81,1.0,N,132,...,0.0,1.0,51.25,0.0,1.75,,2.0,0.144661,1.027230,medium
1,16987466,16987466,2,2023-01-26 14:47:44,2023-01-26 14:53:07,1.0,0.99,1.0,N,170,...,0.0,1.0,13.44,2.5,,0.0,1.0,0.027639,-0.679782,cheap
2,6908899,6908899,2,2023-04-30 01:08:10,2023-04-30 01:20:46,2.0,2.85,1.0,N,114,...,0.0,1.0,24.72,2.5,0.00,,2.0,0.062550,-0.170523,cheap
3,4617323,4617323,1,2023-05-19 17:10:40,2023-05-19 17:23:00,1.0,1.80,1.0,N,229,...,0.0,1.0,22.30,2.5,0.00,,1.0,0.055060,-0.279779,cheap
4,14262454,14262454,2,2023-02-21 21:34:26,2023-02-21 21:38:01,1.0,1.23,1.0,N,48,...,0.0,1.0,14.64,2.5,0.00,,1.0,0.031353,-0.625605,cheap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,15807946,15807946,2,2023-02-07 08:05:36,2023-02-07 08:14:06,1.0,1.77,1.0,N,158,...,0.0,1.0,17.64,2.5,0.00,,1.0,0.040638,-0.490164,cheap
96,15272330,15272330,2,2023-02-11 21:50:50,2023-02-11 22:13:47,1.0,6.29,1.0,N,79,...,0.0,1.0,41.40,2.5,0.00,,1.0,0.114175,0.582531,medium
97,8595585,8595585,2,2023-04-15 03:59:23,2023-04-15 03:59:26,1.0,0.00,5.0,N,263,...,0.0,1.0,38.00,0.0,0.00,,1.0,0.103652,0.429031,medium
98,18699456,18699456,2,2023-01-09 21:06:20,2023-01-09 21:17:17,1.0,5.73,1.0,N,88,...,0.0,1.0,32.70,2.5,,0.0,1.0,0.087249,0.189751,cheap


In [None]:
# Quantile-based bands: qcut splits the scaled fare into four bins at its
# quartiles and reuses the labels in bin_names.
scaled_fare = df["total_amount_minmax"]
df["price_range_quantile"] = pd.qcut(scaled_fare, q=4, labels=bin_names)
df.head(100)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected,total_amount_minmax,total_amount_z,price_range,price_range_quantile
0,9194462,9194462,2,2023-04-09 17:39:57,2023-04-09 18:13:31,2.0,9.81,1.0,N,132,...,1.0,51.25,0.0,1.75,,2.0,0.144661,1.027230,medium,expensive
1,16987466,16987466,2,2023-01-26 14:47:44,2023-01-26 14:53:07,1.0,0.99,1.0,N,170,...,1.0,13.44,2.5,,0.0,1.0,0.027639,-0.679782,cheap,cheap
2,6908899,6908899,2,2023-04-30 01:08:10,2023-04-30 01:20:46,2.0,2.85,1.0,N,114,...,1.0,24.72,2.5,0.00,,2.0,0.062550,-0.170523,cheap,high
3,4617323,4617323,1,2023-05-19 17:10:40,2023-05-19 17:23:00,1.0,1.80,1.0,N,229,...,1.0,22.30,2.5,0.00,,1.0,0.055060,-0.279779,cheap,high
4,14262454,14262454,2,2023-02-21 21:34:26,2023-02-21 21:38:01,1.0,1.23,1.0,N,48,...,1.0,14.64,2.5,0.00,,1.0,0.031353,-0.625605,cheap,cheap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,15807946,15807946,2,2023-02-07 08:05:36,2023-02-07 08:14:06,1.0,1.77,1.0,N,158,...,1.0,17.64,2.5,0.00,,1.0,0.040638,-0.490164,cheap,medium
96,15272330,15272330,2,2023-02-11 21:50:50,2023-02-11 22:13:47,1.0,6.29,1.0,N,79,...,1.0,41.40,2.5,0.00,,1.0,0.114175,0.582531,medium,expensive
97,8595585,8595585,2,2023-04-15 03:59:23,2023-04-15 03:59:26,1.0,0.00,5.0,N,263,...,1.0,38.00,0.0,0.00,,1.0,0.103652,0.429031,medium,expensive
98,18699456,18699456,2,2023-01-09 21:06:20,2023-01-09 21:17:17,1.0,5.73,1.0,N,88,...,1.0,32.70,2.5,,0.0,1.0,0.087249,0.189751,cheap,expensive


In [None]:
# Rebuild passenger_count_corrected: missing passenger counts are replaced by
# the column median. Assign the result directly, because calling
# fillna(inplace=True) on df["col"] is chained assignment, which warns in
# pandas 2.x and leaves df unchanged under Copy-on-Write.
df["passenger_count_corrected"] = df["passenger_count"].fillna(df["passenger_count"].median())

In [None]:
# List the distinct quantile labels present and the ordering of the categories.
df["price_range_quantile"].unique()

['expensive', 'cheap', 'high', 'medium']
Categories (4, object): ['cheap' < 'medium' < 'high' < 'expensive']

## Advanced Scaling

In [None]:
from sklearn import preprocessing

# Same min-max scaling via scikit-learn; fit_transform takes a 2-D input
# (hence the double brackets) and returns one column per feature.
out = preprocessing.MinMaxScaler().fit_transform(df[["total_amount"]])

# Element-wise exact comparison with the hand-computed column. Exact float
# equality is strict, so many rows compare False.
df["total_amount_minmax"].eq(np.squeeze(out))

0         False
1          True
2         False
3         False
4         False
          ...  
999995     True
999996     True
999997    False
999998     True
999999    False
Name: total_amount_minmax, Length: 989697, dtype: bool

In [None]:
# Show the first scaled value from each method side by side.
# out is a plain array and is indexed by position, so use .iloc[0] for the
# Series too. A label lookup ([0]) raises KeyError if the row labelled 0 was
# removed by the outlier filter.
print(out[0], df["total_amount_minmax"].iloc[0])

[0.1446611] 0.144661095636026


### Take floating-point precision into account and check whether the values are almost the same

In [None]:
# Tolerance-based comparison: True when every pair of values agrees within
# np.allclose's default tolerances.
np.allclose(out.ravel(), df["total_amount_minmax"].to_numpy())

True

### If we have a lot of outliers
We removed those above, though... but this is another way, using `preprocessing`.

In [None]:
# Standardise total_amount with scikit-learn (zero mean, unit variance).
# NOTE(review): StandardScaler is based on the mean and standard deviation,
# so it is itself sensitive to outliers. If outlier resistance is the goal,
# consider preprocessing.RobustScaler (median / IQR) instead.
preprocessing.StandardScaler().fit_transform(
    df[["total_amount"]]
)

array([[ 1.02723048],
       [-0.67978225],
       [-0.1705227 ],
       ...,
       [ 1.37666921],
       [ 0.39833105],
       [-0.06758726]])