<a href="https://colab.research.google.com/github/StephenSheng1101/DMAsgmnt/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset from "dataset.csv" (a relative path, resolved against the
# current working directory) into the `transport` DataFrame.
transport = pd.read_csv("dataset.csv")

In [None]:

# info() prints its report as a side effect, so it can run first. describe()
# returns a DataFrame, which the notebook only renders when it is the final
# expression of the cell -- placed last so the summary statistics are shown
# instead of being silently discarded.
transport.info()
transport.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596558 entries, 0 to 2596557
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ID              object 
 1   travel_fee      float64
 2   departure_time  object 
 3   departure_long  float64
 4   departure_lat   float64
 5   arrival_long    float64
 6   arrival_lat     float64
 7   occupancy       int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 158.5+ MB


In [None]:
# Count the missing entries in every column.
transport.isna().sum()

ID                 0
travel_fee         0
departure_time     0
departure_long     0
departure_lat      0
arrival_long      44
arrival_lat       44
occupancy          0
dtype: int64

In [None]:
# Handle missing values.
# Fill gaps in the departure coordinates with the column mean and gaps in
# occupancy with the most frequent value. The result is assigned back instead
# of calling fillna(inplace=True) on a column selection, because that form
# operates on a temporary object and does not update the frame when pandas
# Copy-on-Write is enabled.
transport = transport.fillna(
    {
        "departure_long": transport["departure_long"].mean(),
        "departure_lat": transport["departure_lat"].mean(),
        "occupancy": transport["occupancy"].mode()[0],
    }
)

# Drop the ID column; errors="ignore" keeps this cell re-runnable after the
# column has already been removed.
transport = transport.drop(columns=["ID"], errors="ignore")

# Drop rows with missing "departure_time", "arrival_long" or "arrival_lat".
transport = transport.dropna(subset=["departure_time", "arrival_long", "arrival_lat"])

# Drop rows whose occupancy is zero or negative. The explicit copy gives later
# cells an independent frame to add columns to.
transport = transport[transport["occupancy"] > 0].copy()

# Show the remaining null counts as the cell output to confirm the cleanup.
transport.isna().sum()

In [None]:
# Preview the first rows of the frame (head() defaults to five rows).
transport.head()

Unnamed: 0,travel_fee,departure_time,departure_long,departure_lat,arrival_long,arrival_lat,occupancy
0,7.0,2013-07-02 19:54:00+00:00,-74.00536,40.728867,-74.008913,40.710907,1
1,5.5,2013-09-28 00:21:31+00:00,-74.014165,40.708941,-74.01631,40.716734,1
2,21.5,2013-06-16 03:18:00+00:00,-73.991075,40.760352,-73.941382,40.713292,1
3,9.5,2013-07-20 13:43:00+00:00,-74.002662,40.72363,-73.991722,40.748905,5
4,15.5,2013-11-05 22:57:17+00:00,-73.962397,40.712705,-73.996834,40.680403,2


In [None]:
# Parse departure_time into datetimes, then derive the hour of the day and the
# day of the week from the parsed values. assign() evaluates its keyword
# arguments in order, so the later lambdas see the converted column.
transport = transport.assign(
    departure_time=lambda frame: pd.to_datetime(frame["departure_time"]),
    hour_of_day=lambda frame: frame["departure_time"].dt.hour,
    day_of_week=lambda frame: frame["departure_time"].dt.dayofweek,
)


In [None]:
from sklearn.model_selection import train_test_split

# Split the first 500,000 rows 80/20 into a training and a test set; the fixed
# random_state makes the shuffle reproducible.
train_set, test_set = train_test_split(
    transport.iloc[:500000],
    test_size=0.2,
    random_state=42,
)

print("Length of train_set:", train_set.shape[0])
print("Length of test_set:", test_set.shape[0])

In [None]:
# Separate the target column (travel_fee) from the predictors of the
# training set.
travelfee_labels = train_set["travel_fee"].copy()
travelfee = train_set.drop(columns=["travel_fee"])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Columns routed through the numeric pipeline.
num_attribs = [
    'departure_long',
    'departure_lat',
    'arrival_long',
    'arrival_lat',
    'hour_of_day',
    'day_of_week',
    'occupancy',
]

# Numeric pipeline: a single standard-scaling step.
num_pipeline = make_pipeline(StandardScaler())

# Apply the numeric pipeline to the numeric attributes under the name "num".
preprocessing = ColumnTransformer(
    transformers=[("num", num_pipeline, num_attribs)],
)



In [None]:
# Learn the scaling statistics from the training split only, so that rows
# outside it (including the held-out test rows) do not leak into the fitted
# scaler. train_set carries the same columns as transport, and the
# ColumnTransformer picks out num_attribs by name.
preprocessing.fit(train_set)

# Apply the fitted transformer to the whole frame; transport_prepared keeps one
# row per row of transport.
transport_prepared = preprocessing.transform(transport)
transport_prepared


array([[-0.51734933, -0.13933522,  0.10789769, ...,  0.83932454,
        -1.04160923, -0.51574494],
       [-0.65604557, -0.14011792,  0.10571586, ..., -2.06546508,
         1.00306776, -0.51574494],
       [ 0.823381  , -0.13806539,  0.11134518, ..., -1.60681409,
         1.51423701, -0.51574494],
       ...,
       [ 2.65694529, -0.12666484,  0.11228498, ...,  0.07490622,
         1.00306776,  0.21130803],
       [-0.47111725, -0.13623989,  0.11150011, ...,  0.99220821,
         1.51423701, -0.51574494],
       [-0.47111725, -0.13712509,  0.11297623, ...,  0.68644088,
        -0.53043998, -0.51574494]])

In [None]:
# List the names of the columns produced by the fitted ColumnTransformer.
preprocessing.get_feature_names_out()

array(['num__travel_fee', 'num__departure_long', 'num__departure_lat',
       'num__arrival_long', 'num__arrival_lat', 'num__hour_of_day',
       'num__day_of_week', 'num__occupancy'], dtype=object)

In [None]:
# Print the index range, column dtypes and memory usage of the frame as it
# stands at this point in the notebook.
transport.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2596492 entries, 0 to 2596557
Data columns (total 9 columns):
 #   Column          Dtype              
---  ------          -----              
 0   travel_fee      float64            
 1   departure_time  datetime64[ns, UTC]
 2   departure_long  float64            
 3   departure_lat   float64            
 4   arrival_long    float64            
 5   arrival_lat     float64            
 6   occupancy       int64              
 7   hour_of_day     int64              
 8   day_of_week     int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(3)
memory usage: 198.1 MB
