# 14 Scaling and Binning Numerical Data

Required modules:
- pip install pandas
- pip install numpy
- pip install missingno
- pip install scipy
- pip install scikit-learn

In [9]:
import pandas as pd
import numpy as np

# Path to the sampled trip file; the commented-out alternative below is the
# full Jan-Aug 2023 export, which is much slower to load.
file = "../data/nyc_yellow_taxi_trip_selection.csv" # Takes about xx seconds...
#file = "../data/nyc_yellow_taxi_trip_records_from_Jan_to_Aug_2023.csv" # Huge takes almost 3 minutes to load

# Parse both timestamp columns as datetimes while reading.
df = pd.read_csv(file, parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"] )
#df = pd.read_csv(file, usecols=["VendorID","tpep_pickup_datetime","tpep_dropoff_datetime","passenger_count","trip_distance","RatecodeID","PULocationID","DOLocationID","tip_amount","total_amount"], parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"] )

# Turn literal "NaN" strings into real missing values.
df = df.replace("NaN", np.nan)

# Impute missing passenger counts with the column median. Assign the result
# instead of calling fillna(inplace=True) on a column selection: the chained
# in-place form acts on a temporary object and is not guaranteed to update df
# (it warns in pandas 2.x and is a no-op under Copy-on-Write).
df["passenger_count_corrected"] = df["passenger_count"].fillna(df["passenger_count"].median())

In [10]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected
0,3560310,3560310,2,2023-05-29 15:40:01,2023-05-29 15:51:15,1.0,1.97,1.0,N,237,...,0.0,0.5,3.5,0.0,1.0,21.0,2.5,0.0,,1.0
1,16972558,16972558,1,2023-01-26 16:55:19,2023-01-26 17:01:28,2.0,0.3,1.0,N,114,...,5.0,0.5,2.0,0.0,1.0,15.0,2.5,,0.0,2.0
2,8617974,8617974,1,2023-04-14 23:14:24,2023-04-14 23:29:10,2.0,1.7,1.0,N,162,...,3.5,0.5,2.5,0.0,1.0,19.6,2.5,0.0,,2.0
3,17332136,17332136,2,2023-01-23 07:47:19,2023-01-23 08:10:05,1.0,8.31,1.0,N,132,...,0.0,0.5,7.45,0.0,1.0,44.7,0.0,,1.25,1.0
4,14128052,14128052,1,2023-02-23 08:49:52,2023-02-23 08:59:24,1.0,0.9,1.0,N,186,...,2.5,0.5,0.0,0.0,1.0,14.0,2.5,0.0,,1.0


## Remove extreme values (outliers)


In [11]:
from scipy import stats
# Trim the tails of total_amount: keep only rows strictly between the
# 1st percentile and the 99.99th percentile.
q_low = df["total_amount"].quantile(0.01)
q_hi  = df["total_amount"].quantile(0.9999)

# .copy() detaches the filtered frame from the original, so the column
# assignments in later cells operate on an independent DataFrame and do not
# trigger SettingWithCopyWarning.
df = df[(df["total_amount"] < q_hi) & (df["total_amount"] > q_low)].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 989553 entries, 0 to 999999
Data columns (total 23 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Unnamed: 0.1               989553 non-null  int64         
 1   Unnamed: 0                 989553 non-null  int64         
 2   VendorID                   989553 non-null  int64         
 3   tpep_pickup_datetime       989553 non-null  datetime64[ns]
 4   tpep_dropoff_datetime      989553 non-null  datetime64[ns]
 5   passenger_count            962619 non-null  float64       
 6   trip_distance              989553 non-null  float64       
 7   RatecodeID                 962619 non-null  float64       
 8   store_and_fwd_flag         962619 non-null  object        
 9   PULocationID               989553 non-null  int64         
 10  DOLocationID               989553 non-null  int64         
 11  payment_type               989553 non-null  int64        

## Min-Max Scaling with: 
$$x' = \frac{x - \text{min}(x)}{\text{max}(x)-\text{min}(x)}$$

In [12]:
# Min-max scaling: map total_amount linearly onto [0, 1].
df["total_amount_minmax"] = df["total_amount"].pipe(
    lambda s: (s - s.min()) / (s.max() - s.min())
)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected,total_amount_minmax
0,3560310,3560310,2,2023-05-29 15:40:01,2023-05-29 15:51:15,1.0,1.97,1.0,N,237,...,0.5,3.5,0.0,1.0,21.0,2.5,0.0,,1.0,0.049277
1,16972558,16972558,1,2023-01-26 16:55:19,2023-01-26 17:01:28,2.0,0.3,1.0,N,114,...,0.5,2.0,0.0,1.0,15.0,2.5,,0.0,2.0,0.031347
2,8617974,8617974,1,2023-04-14 23:14:24,2023-04-14 23:29:10,2.0,1.7,1.0,N,162,...,0.5,2.5,0.0,1.0,19.6,2.5,0.0,,2.0,0.045093
3,17332136,17332136,2,2023-01-23 07:47:19,2023-01-23 08:10:05,1.0,8.31,1.0,N,132,...,0.5,7.45,0.0,1.0,44.7,0.0,,1.25,1.0,0.120099
4,14128052,14128052,1,2023-02-23 08:49:52,2023-02-23 08:59:24,1.0,0.9,1.0,N,186,...,0.5,0.0,0.0,1.0,14.0,2.5,0.0,,1.0,0.028359


## Z-Score with
$$x' = \frac{x - \bar{x}}{\sigma}$$

In [13]:
# Z-score: centre on the mean and divide by the (pandas default) standard deviation.
df["total_amount_z"] = (
    df["total_amount"]
    .sub(df["total_amount"].mean())
    .div(df["total_amount"].std())
)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected,total_amount_minmax,total_amount_z
0,3560310,3560310,2,2023-05-29 15:40:01,2023-05-29 15:51:15,1.0,1.97,1.0,N,237,...,3.5,0.0,1.0,21.0,2.5,0.0,,1.0,0.049277,-0.339086
1,16972558,16972558,1,2023-01-26 16:55:19,2023-01-26 17:01:28,2.0,0.3,1.0,N,114,...,2.0,0.0,1.0,15.0,2.5,,0.0,2.0,0.031347,-0.60963
2,8617974,8617974,1,2023-04-14 23:14:24,2023-04-14 23:29:10,2.0,1.7,1.0,N,162,...,2.5,0.0,1.0,19.6,2.5,0.0,,2.0,0.045093,-0.402213
3,17332136,17332136,2,2023-01-23 07:47:19,2023-01-23 08:10:05,1.0,8.31,1.0,N,132,...,7.45,0.0,1.0,44.7,0.0,,1.25,1.0,0.120099,0.729564
4,14128052,14128052,1,2023-02-23 08:49:52,2023-02-23 08:59:24,1.0,0.9,1.0,N,186,...,0.0,0.0,1.0,14.0,2.5,0.0,,1.0,0.028359,-0.654721


In [14]:
# Fixed-width binning of the min-max scaled fare into four labelled ranges.
bin_names = ["cheap", "medium", "high", "expensive"]
# pd.cut bins are right-closed by default, so the first bin would be (0, 0.1]
# and the row(s) with the minimum fare (scaled value exactly 0.0) would get
# NaN. include_lowest=True closes the first bin on the left as well.
df["price_range"] = pd.cut(
    df.total_amount_minmax,
    [0, 0.1, 0.2, 0.3, 1],
    labels=bin_names,
    include_lowest=True,
)
df.head(100)
#df[ (df["total_amount"] > 100)]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected,total_amount_minmax,total_amount_z,price_range
0,3560310,3560310,2,2023-05-29 15:40:01,2023-05-29 15:51:15,1.0,1.97,1.0,N,237,...,0.0,1.0,21.00,2.5,0.0,,1.0,0.049277,-0.339086,cheap
1,16972558,16972558,1,2023-01-26 16:55:19,2023-01-26 17:01:28,2.0,0.30,1.0,N,114,...,0.0,1.0,15.00,2.5,,0.00,2.0,0.031347,-0.609630,cheap
2,8617974,8617974,1,2023-04-14 23:14:24,2023-04-14 23:29:10,2.0,1.70,1.0,N,162,...,0.0,1.0,19.60,2.5,0.0,,2.0,0.045093,-0.402213,cheap
3,17332136,17332136,2,2023-01-23 07:47:19,2023-01-23 08:10:05,1.0,8.31,1.0,N,132,...,0.0,1.0,44.70,0.0,,1.25,1.0,0.120099,0.729564,medium
4,14128052,14128052,1,2023-02-23 08:49:52,2023-02-23 08:59:24,1.0,0.90,1.0,N,186,...,0.0,1.0,14.00,2.5,0.0,,1.0,0.028359,-0.654721,cheap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,17562676,17562676,1,2023-01-20 20:36:07,2023-01-20 20:41:15,1.0,0.80,1.0,N,141,...,0.0,1.0,13.80,2.5,,0.00,1.0,0.027761,-0.663739,cheap
96,18628976,18628976,2,2023-01-10 17:18:01,2023-01-10 17:35:35,1.0,2.15,1.0,N,142,...,0.0,1.0,24.80,2.5,,0.00,1.0,0.060632,-0.167741,cheap
97,16283019,16283019,1,2023-02-02 12:38:52,2023-02-02 12:42:17,0.0,0.70,1.0,N,107,...,0.0,1.0,11.75,2.5,0.0,,0.0,0.021635,-0.756175,cheap
98,1387911,1387911,2,2023-06-17 19:08:08,2023-06-17 19:20:01,4.0,0.89,1.0,N,246,...,0.0,1.0,15.40,2.5,0.0,,4.0,0.032542,-0.591594,cheap


In [15]:
# Quantile-based binning: split the scaled fare into four quantile bins,
# reusing the same labels as the fixed-width bins.
df["price_range_quantile"] = pd.qcut(df["total_amount_minmax"], q=4, labels=bin_names)
df.head(100)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,...,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee,passenger_count_corrected,total_amount_minmax,total_amount_z,price_range,price_range_quantile
0,3560310,3560310,2,2023-05-29 15:40:01,2023-05-29 15:51:15,1.0,1.97,1.0,N,237,...,1.0,21.00,2.5,0.0,,1.0,0.049277,-0.339086,cheap,medium
1,16972558,16972558,1,2023-01-26 16:55:19,2023-01-26 17:01:28,2.0,0.30,1.0,N,114,...,1.0,15.00,2.5,,0.00,2.0,0.031347,-0.609630,cheap,cheap
2,8617974,8617974,1,2023-04-14 23:14:24,2023-04-14 23:29:10,2.0,1.70,1.0,N,162,...,1.0,19.60,2.5,0.0,,2.0,0.045093,-0.402213,cheap,medium
3,17332136,17332136,2,2023-01-23 07:47:19,2023-01-23 08:10:05,1.0,8.31,1.0,N,132,...,1.0,44.70,0.0,,1.25,1.0,0.120099,0.729564,medium,expensive
4,14128052,14128052,1,2023-02-23 08:49:52,2023-02-23 08:59:24,1.0,0.90,1.0,N,186,...,1.0,14.00,2.5,0.0,,1.0,0.028359,-0.654721,cheap,cheap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,17562676,17562676,1,2023-01-20 20:36:07,2023-01-20 20:41:15,1.0,0.80,1.0,N,141,...,1.0,13.80,2.5,,0.00,1.0,0.027761,-0.663739,cheap,cheap
96,18628976,18628976,2,2023-01-10 17:18:01,2023-01-10 17:35:35,1.0,2.15,1.0,N,142,...,1.0,24.80,2.5,,0.00,1.0,0.060632,-0.167741,cheap,high
97,16283019,16283019,1,2023-02-02 12:38:52,2023-02-02 12:42:17,0.0,0.70,1.0,N,107,...,1.0,11.75,2.5,0.0,,0.0,0.021635,-0.756175,cheap,cheap
98,1387911,1387911,2,2023-06-17 19:08:08,2023-06-17 19:20:01,4.0,0.89,1.0,N,246,...,1.0,15.40,2.5,0.0,,4.0,0.032542,-0.591594,cheap,cheap


In [16]:
# Re-impute missing passenger counts with the column median. The result is
# assigned directly; calling fillna(inplace=True) on a column selection acts
# on a temporary object and is not guaranteed to update df (it warns in
# pandas 2.x and is a no-op under Copy-on-Write).
df["passenger_count_corrected"] = df["passenger_count"].fillna(df["passenger_count"].median())

In [17]:
# List the distinct quantile labels present in the data.
df["price_range_quantile"].unique()

['medium', 'cheap', 'expensive', 'high']
Categories (4, object): ['cheap' < 'medium' < 'high' < 'expensive']

## Advanced Scaling

In [19]:
from  sklearn import preprocessing
# Min-max scale with scikit-learn and compare element-wise against the manual
# column. Exact equality on floats is used here on purpose.
out = preprocessing.MinMaxScaler().fit_transform(df.loc[:, ["total_amount"]])
out.squeeze() == df["total_amount_minmax"]

0         False
1          True
2         False
3         False
4         False
          ...  
999995     True
999996    False
999997    False
999998    False
999999    False
Name: total_amount_minmax, Length: 989553, dtype: bool

In [20]:
# Show the first scaled value from both methods. `out` is a plain array and is
# indexed by position, so the Series must be indexed by position too (.iloc);
# after the outlier filter the index labels are no longer guaranteed to
# include 0 or to line up with positions.
print(out[0], df["total_amount_minmax"].iloc[0])

[0.04927683] 0.0492768348075544


### Take floating-point precision into account and check whether they are almost the same

In [21]:
# Tolerance-based comparison: True when every pair of values agrees within
# numpy's default tolerances.
np.allclose(out.squeeze(), df["total_amount_minmax"])

True

### If we have a lot of outliers 
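We removed those above, though, but this is another way, using `preprocessing`. Note that `StandardScaler` standardizes to zero mean and unit variance (like the z-score above) and is itself sensitive to outliers; for data with many outliers, scikit-learn's `RobustScaler` (based on the median and interquartile range) is the usual choice.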

In [22]:
# Standardize total_amount with scikit-learn: fit on the column, then
# transform the same column.
preprocessing.StandardScaler().fit(df[["total_amount"]]).transform(df[["total_amount"]])

array([[-0.33908635],
       [-0.60963073],
       [-0.40221337],
       ...,
       [-0.37741347],
       [-0.52846742],
       [ 3.09772917]])