In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb
from lightgbm import LGBMRegressor
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sodapy import Socrata


In [3]:
# Socrata client for data.cityofnewyork.us; the second positional argument
# (the app token in sodapy's signature) is None, i.e. no token is supplied.
client = Socrata("data.cityofnewyork.us", None)
# Request at most 100000 records of dataset "djnb-wcxt"
# (presumably the 2021 green taxi trips, per the variable name below - TODO confirm).
results = client.get("djnb-wcxt", limit=100000)
# Turn the returned records into a DataFrame.
green_taxi_2021_api = pd.DataFrame.from_records(results)



In [4]:
# Preview the first rows of the API download to check the columns that came back.
green_taxi_2021_api.head()

Unnamed: 0,vendorid,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,ratecodeid,pulocationid,dolocationid,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-01-01T00:15:56.000,2021-01-01T00:19:52.000,N,1,43,151,1,1.01,5.5,0.5,0.5,0.0,0,0.3,6.8,2,1,0.0
1,2,2021-01-01T00:25:59.000,2021-01-01T00:34:44.000,N,1,166,239,1,2.53,10.0,0.5,0.5,2.81,0,0.3,16.86,1,1,2.75
2,2,2021-01-01T00:45:57.000,2021-01-01T00:51:55.000,N,1,41,42,1,1.12,6.0,0.5,0.5,1.0,0,0.3,8.3,1,1,0.0
3,2,2020-12-31T23:57:51.000,2021-01-01T00:04:56.000,N,1,168,75,1,1.99,8.0,0.5,0.5,0.0,0,0.3,9.3,2,1,0.0
4,2,2021-01-01T00:16:36.000,2021-01-01T00:16:40.000,N,2,265,265,3,0.0,-52.0,0.0,-0.5,0.0,0,-0.3,-52.8,3,1,0.0


In [None]:
class IngestData:
    """Load, label, inspect and filter the yellow and green taxi trip tables.

    The frames live in ``yellow_taxi_data`` / ``green_taxi_data`` and stay
    ``None`` until the matching ``read_data_*`` method has been called.
    ``create_target``, ``dup_and_miss`` and ``outlier_removal`` operate on both
    frames, so both must be loaded first; ``outlier_removal`` filters on the
    ``trip_duration`` column that ``create_target`` writes.
    """

    # Thresholds used by ``outlier_removal``; rows must be strictly below them.
    MAX_TRIP_DURATION_SECONDS = 5600
    MAX_TRIP_DISTANCE = 50000
    MAX_FARE_AMOUNT = 50000
    MAX_TOTAL_AMOUNT = 50000

    # NOTE: the defaults below are evaluated once, when this class statement
    # runs, so `files_green`, `files_yellow` and `green_taxi_2021_api` must
    # already exist in the notebook namespace before this cell is executed.
    def __init__(self, files_green=files_green, files_yellow=files_yellow, green_taxi_api=green_taxi_2021_api, yellow_taxi_api=None):
        """Remember the input sources; no data is read here.

        Args:
            files_green: Parquet paths for the green taxi data.
            files_yellow: Parquet paths for the yellow taxi data.
            green_taxi_api: Green taxi table obtained from the API.
            yellow_taxi_api: Yellow taxi table obtained from the API, if any.
        """
        self.files_green = files_green
        self.files_yellow = files_yellow
        # Previously accepted but discarded; keep them reachable on the instance.
        self.green_taxi_api = green_taxi_api
        self.yellow_taxi_api = yellow_taxi_api
        self.green_taxi_data = None
        self.yellow_taxi_data = None

    def read_data_green(self, files=None):
        """Read and concatenate green taxi parquet files into ``green_taxi_data``.

        Args:
            files: Iterable of parquet paths. When omitted, the ``files_green``
                list given to the constructor is used.
        """
        if files is None:
            files = self.files_green
        self.green_taxi_data = pd.concat([pd.read_parquet(path) for path in files])

    def read_data_yellow(self, files=None):
        """Read and concatenate yellow taxi parquet files into ``yellow_taxi_data``.

        Args:
            files: Iterable of parquet paths. When omitted, the ``files_yellow``
                list given to the constructor is used.
        """
        if files is None:
            files = self.files_yellow
        self.yellow_taxi_data = pd.concat([pd.read_parquet(path) for path in files])

    @staticmethod
    def _add_trip_duration(frame, pickup_col, dropoff_col):
        """Parse both timestamp columns in place and add ``trip_duration`` in seconds."""
        frame[pickup_col] = pd.to_datetime(frame[pickup_col])
        frame[dropoff_col] = pd.to_datetime(frame[dropoff_col])
        elapsed = frame[dropoff_col] - frame[pickup_col]
        frame['trip_duration'] = elapsed.dt.total_seconds()

    def create_target(self):
        """Add the ``trip_duration`` target (drop-off minus pick-up, in seconds) to both frames.

        The pick-up / drop-off columns are converted to datetimes as a side
        effect. Yellow data uses the ``tpep_*`` columns, green data ``lpep_*``.
        """
        self._add_trip_duration(self.yellow_taxi_data, 'tpep_pickup_datetime', 'tpep_dropoff_datetime')
        self._add_trip_duration(self.green_taxi_data, 'lpep_pickup_datetime', 'lpep_dropoff_datetime')

    # NOTE: a `dropping_cols` step (keeping only trip_distance, passenger_count,
    # trip_duration, store_and_fwd_flag and VendorID) was sketched here but is
    # disabled. If it is revived, build the green drop list from the green
    # frame's columns - the old draft derived it from the yellow frame.

    @staticmethod
    def _print_quality_counts(frame, label):
        """Print the duplicate-row count and the total NA-cell count of one frame."""
        print(f"Number of duplicated rows in {label} taxi data: {frame.duplicated().sum()}")
        # isna().sum().sum() adds up missing cells over all columns, so the
        # message says "values" rather than "rows".
        print(f"Number of NA values in {label} taxi data: {frame.isna().sum().sum()}")

    def dup_and_miss(self):
        """Print duplicate-row and missing-value counts for the yellow, then the green frame."""
        self._print_quality_counts(self.yellow_taxi_data, "yellow")
        self._print_quality_counts(self.green_taxi_data, "green")

    @classmethod
    def _drop_outliers(cls, frame):
        """Return the rows of ``frame`` that satisfy every outlier threshold."""
        keep = (
            (frame['trip_duration'] < cls.MAX_TRIP_DURATION_SECONDS)
            & (frame['trip_duration'] > 0)
            & (frame['passenger_count'] > 0)
            & (frame['trip_distance'] < cls.MAX_TRIP_DISTANCE)
            & (frame['fare_amount'] < cls.MAX_FARE_AMOUNT)
            & (frame['total_amount'] < cls.MAX_TOTAL_AMOUNT)
        )
        return frame[keep]

    def outlier_removal(self):
        """Replace both frames with their rows that pass the outlier filters.

        A row is kept when ``0 < trip_duration < MAX_TRIP_DURATION_SECONDS``,
        ``passenger_count > 0`` and ``trip_distance``, ``fare_amount`` and
        ``total_amount`` are each below their class-level maximum. Rows with a
        missing value in any of these columns fail the comparison and are dropped.
        """
        self.yellow_taxi_data = self._drop_outliers(self.yellow_taxi_data)
        self.green_taxi_data = self._drop_outliers(self.green_taxi_data)

