In [18]:
import pandas as pd

In [19]:
class DataPrep:
    """Load the training CSV and apply basic cleaning steps.

    Attributes:
        data: DataFrame read from ``file``; the methods below update or
            replace it.
    """

    def __init__(self, file):
        """Read ``file`` with ``pandas.read_csv`` and keep it as ``self.data``.

        Args:
            file: Anything ``pandas.read_csv`` accepts (path or file-like).
        """
        self.data = pd.read_csv(file)

    def change_to_datetime(self):
        """Convert the pickup (and, when present, dropoff) columns to datetimes.

        Raises:
            KeyError: If ``pickup_datetime`` is missing from the frame.
        """
        self.data['pickup_datetime'] = pd.to_datetime(self.data['pickup_datetime'])
        # The dropoff column is optional: test for it explicitly instead of
        # swallowing AttributeError, which could also hide unrelated failures.
        if 'dropoff_datetime' in self.data.columns:
            self.data['dropoff_datetime'] = pd.to_datetime(self.data['dropoff_datetime'])

    # No target-creation step: ``trip_duration`` is already provided in the
    # Kaggle dataset.

    def dup_and_miss(self):
        """Print the number of fully duplicated rows and of rows containing any NA."""
        print(f"Number of duplicated rows: {self.data.duplicated().sum()}")
        # Count rows with at least one NA. Summing the per-cell mask would
        # count a row once per missing cell and contradict the label.
        print(f"Number of NA rows: {self.data.isna().any(axis=1).sum()}")

    def outlier_removal(self, max_duration=5600, min_duration=0, min_passengers=0):
        """Keep only rows whose duration and passenger count are within bounds.

        All bounds are exclusive; the defaults reproduce the original
        hard-coded thresholds.

        Args:
            max_duration: Rows need ``trip_duration < max_duration``.
            min_duration: Rows need ``trip_duration > min_duration``.
            min_passengers: Rows need ``passenger_count > min_passengers``.
        """
        keep = (
            (self.data['trip_duration'] < max_duration)
            & (self.data['trip_duration'] > min_duration)
            & (self.data['passenger_count'] > min_passengers)
        )
        # Copy so later column assignments act on an independent frame rather
        # than on a slice of the unfiltered data.
        self.data = self.data[keep].copy()

In [20]:
class DataPrepTest:
    """Load the test CSV and apply the cleaning steps that need no target column.

    Attributes:
        data: DataFrame read from ``file``.
    """

    def __init__(self, file):
        """Read ``file`` with ``pandas.read_csv`` and keep it as ``self.data``.

        Args:
            file: Anything ``pandas.read_csv`` accepts (path or file-like).
        """
        self.data = pd.read_csv(file)

    def change_to_datetime(self):
        """Convert the ``pickup_datetime`` column to datetimes.

        Only the pickup column is converted; the dropoff conversion and the
        duration-based outlier filtering are deliberately not applied here.

        Raises:
            KeyError: If ``pickup_datetime`` is missing from the frame.
        """
        self.data['pickup_datetime'] = pd.to_datetime(self.data['pickup_datetime'])

    def dup_and_miss(self):
        """Print the number of fully duplicated rows and of rows containing any NA."""
        print(f"Number of duplicated rows: {self.data.duplicated().sum()}")
        # Count rows with at least one NA. Summing the per-cell mask would
        # count a row once per missing cell and contradict the label.
        print(f"Number of NA rows: {self.data.isna().any(axis=1).sum()}")

In [21]:
# Training data: load the CSV, parse the timestamp columns, print the
# duplicate/NA counts, then filter out rows outside the outlier thresholds.
prep = DataPrep('train.csv')
prep.change_to_datetime()
prep.dup_and_miss()
prep.outlier_removal()

Number of duplicated rows: 0
Number of NA rows: 0


In [22]:
# Test data: load the CSV, parse the pickup timestamp and print the
# duplicate/NA counts. No outlier filtering is applied to the test set.
test_prep = DataPrepTest('test.csv')
test_prep.change_to_datetime()
test_prep.dup_and_miss()


Number of duplicated rows: 0
Number of NA rows: 0


In [23]:
class FeatureEngineering:
    """Derive model features from a prepared frame.

    Attributes:
        data: Working DataFrame; every method below updates or replaces it.
    """

    def __init__(self, prep):
        """Start from a private copy of ``prep.data``.

        Args:
            prep: Object exposing the prepared DataFrame as ``prep.data``.
        """
        # Copy so that columns added here never leak back into ``prep.data``.
        # Otherwise calling ``date_features`` before ``one_hot`` would mutate
        # the caller's frame and make re-running cells order-dependent.
        self.data = prep.data.copy()

    def one_hot(self):
        """Replace ``store_and_fwd_flag`` and ``vendor_id`` with indicator columns.

        The indicator columns are named after the raw category values and are
        appended after the remaining columns (flag indicators first, then
        vendor indicators).
        """
        flag_dummies = pd.get_dummies(self.data['store_and_fwd_flag'])
        vendor_dummies = pd.get_dummies(self.data['vendor_id'])
        remaining = self.data.drop(columns=['store_and_fwd_flag', 'vendor_id'])
        self.data = pd.concat([remaining, flag_dummies, vendor_dummies], axis=1)

    def date_features(self):
        """Add calendar features derived from ``pickup_datetime``.

        Returns:
            The result of ``self.data.info()``, which prints a frame summary.
        """
        pickup = self.data['pickup_datetime'].dt
        self.data['month'] = pickup.month
        self.data['day'] = pickup.day
        self.data['hour'] = pickup.hour
        self.data['minute'] = pickup.minute
        self.data['day_of_week'] = pickup.dayofweek
        # NOTE(review): ``weekday`` holds the same values as ``day_of_week``.
        # It is kept so the feature set stays unchanged.
        self.data['weekday'] = pickup.weekday
        return self.data.info()

    def drop_cols(self):
        """Drop the raw timestamp columns once features have been derived.

        ``dropoff_datetime`` is dropped only when present; ``pickup_datetime``
        must exist. ``id`` and ``passenger_count`` are kept.

        Raises:
            KeyError: If ``pickup_datetime`` is not in the frame.
        """
        self.data = (
            self.data
            .drop(columns=['dropoff_datetime'], errors='ignore')
            .drop(columns=['pickup_datetime'])
        )

    def cols_to_str(self):
        """Cast all column labels to ``str``; the vendor indicator labels are not strings."""
        self.data.columns = self.data.columns.astype(str)
        



In [24]:
# Feature-engineering wrapper around the prepared test frame.
test_fe = FeatureEngineering(test_prep)


In [25]:
# Replace store_and_fwd_flag and vendor_id with indicator columns.
test_fe.one_hot()


In [26]:
# Add the calendar features; this also prints the frame summary below.
test_fe.date_features()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   id                 625134 non-null  object        
 1   pickup_datetime    625134 non-null  datetime64[ns]
 2   passenger_count    625134 non-null  int64         
 3   pickup_longitude   625134 non-null  float64       
 4   pickup_latitude    625134 non-null  float64       
 5   dropoff_longitude  625134 non-null  float64       
 6   dropoff_latitude   625134 non-null  float64       
 7   N                  625134 non-null  uint8         
 8   Y                  625134 non-null  uint8         
 9   1                  625134 non-null  uint8         
 10  2                  625134 non-null  uint8         
 11  month              625134 non-null  int64         
 12  day                625134 non-null  int64         
 13  hour               625134 non-null  int64   

In [27]:
# Drop the raw timestamp column(s), then make every column label a string.
test_fe.drop_cols()
test_fe.cols_to_str()

In [28]:
# Same feature pipeline for the training frame, in the same order as for
# the test frame: indicators, calendar features, drop raw timestamps,
# string column labels.
fe = FeatureEngineering(prep)
fe.one_hot()
fe.date_features()
fe.drop_cols()
fe.cols_to_str()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1455721 entries, 0 to 1458643
Data columns (total 19 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   id                 1455721 non-null  object        
 1   pickup_datetime    1455721 non-null  datetime64[ns]
 2   dropoff_datetime   1455721 non-null  datetime64[ns]
 3   passenger_count    1455721 non-null  int64         
 4   pickup_longitude   1455721 non-null  float64       
 5   pickup_latitude    1455721 non-null  float64       
 6   dropoff_longitude  1455721 non-null  float64       
 7   dropoff_latitude   1455721 non-null  float64       
 8   trip_duration      1455721 non-null  int64         
 9   N                  1455721 non-null  uint8         
 10  Y                  1455721 non-null  uint8         
 11  1                  1455721 non-null  uint8         
 12  2                  1455721 non-null  uint8         
 13  month              1455721 

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

class Model:
    """Fit and evaluate regressors for ``trip_duration`` on the engineered frame.

    The train/validation split is stored on the instance, so the fit and
    evaluation methods no longer depend on notebook-level ``X_train`` /
    ``X_test`` / ``y_train`` / ``y_test`` variables.

    Attributes:
        data: Engineered training DataFrame (``fe.data``).
        random_state: Seed used for the split and the random forest.
        X_train, X_test, y_train, y_test: Populated by ``train_test_split``.
    """

    def __init__(self, fe, random_state=42):
        """
        Args:
            fe: Object exposing the engineered training frame as ``fe.data``.
            random_state: Seed for the split and the random forest. The
                default matches the previously hard-coded split seed.
        """
        self.data = fe.data
        self.random_state = random_state
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self._lgbm = None  # fitted LightGBM model, cached between calls

    def train_test_split(self):
        """Split features/target 80/20, store the parts on ``self`` and return them.

        Returns:
            ``(X_train, X_test, y_train, y_test)``
        """
        y = self.data['trip_duration']
        X = self.data.drop(columns=['trip_duration', 'id'])
        # Inside a method body the bare name resolves to the module-level
        # scikit-learn function, not to this method.
        parts = train_test_split(X, y, test_size=0.2, random_state=self.random_state)
        self.X_train, self.X_test, self.y_train, self.y_test = parts
        self._lgbm = None  # a model fitted on a previous split is stale
        return self.X_train, self.X_test, self.y_train, self.y_test

    def _ensure_split(self):
        """Create the split on first use if the caller has not done so."""
        if self.X_train is None:
            self.train_test_split()

    @staticmethod
    def _rmse(y_true, y_pred):
        """Root-mean-squared error.

        Computed as ``sqrt(MSE)`` so it does not rely on the ``squared=``
        argument of ``mean_squared_error``, which newer scikit-learn releases
        deprecate and remove.
        """
        import numpy as np
        return np.sqrt(mean_squared_error(y_true, y_pred))

    def _fit_lgbm(self):
        """Fit a default ``LGBMRegressor`` on the training split once and cache it."""
        import lightgbm as lgb
        self._ensure_split()
        if self._lgbm is None:
            self._lgbm = lgb.LGBMRegressor()
            self._lgbm.fit(self.X_train, self.y_train)
        return self._lgbm

    def random_forest(self):
        """Fit a random forest and print its validation RMSE."""
        from sklearn.ensemble import RandomForestRegressor
        self._ensure_split()
        # Seeded so that repeated runs print the same score.
        rf = RandomForestRegressor(random_state=self.random_state)
        rf.fit(self.X_train, self.y_train)
        y_pred = rf.predict(self.X_test)
        print(f"Random Forest RMSE: {self._rmse(self.y_test, y_pred)}")

    def light_gbm(self):
        """Fit LightGBM; print train/validation R^2 and the validation RMSE."""
        lgbm = self._fit_lgbm()
        print(lgbm.score(self.X_train, self.y_train), lgbm.score(self.X_test, self.y_test))
        # The printed value is a root-mean-squared error, so it is labelled
        # RMSE (it was previously printed under the label "MSE").
        print(f"LightGBM RMSE: {self._rmse(self.y_test, lgbm.predict(self.X_test))}")

    def light_preds(self, test_data=None):
        """Predict ``trip_duration`` for the test set with the LightGBM model.

        Args:
            test_data: Engineered test DataFrame including an ``id`` column.
                When omitted, the notebook-level ``test_fe.data`` is used so
                existing ``model.light_preds()`` calls keep working.

        Returns:
            Array of predictions, one per test row.
        """
        lgbm = self._fit_lgbm()
        if test_data is None:
            # Backward-compatible fallback to notebook state.
            test_data = test_fe.data
        test_x_data = test_data.drop(columns=['id'])
        preds = lgbm.predict(test_x_data)
        print(preds.shape)
        return preds

    def lrrr(self):
        """Fit a linear regression; print train/validation R^2 and the validation RMSE."""
        from sklearn.linear_model import LinearRegression
        self._ensure_split()
        lr = LinearRegression()
        lr.fit(self.X_train, self.y_train)
        print(lr.score(self.X_train, self.y_train), lr.score(self.X_test, self.y_test))
        print(f"Linear Regression RMSE: {self._rmse(self.y_test, lr.predict(self.X_test))}")


In [30]:
# Build the model wrapper from the engineered training frame.
model = Model(fe)

# Keep the split in notebook-level variables, then fit and score LightGBM.
X_train, X_test, y_train, y_test = model.train_test_split()
model.light_gbm()

0.7534109573529395 0.755279669413204
MSE: 318.92042732078653


In [31]:
# Show which version of scikit-learn is installed, since the metric calls in
# Model depend on the scikit-learn API available in this environment.
import sklearn
print(sklearn.__version__)

1.2.0


In [32]:
# Linear-regression baseline: prints train/validation R^2 and validation RMSE.
model.lrrr()

0.09763574050983537 0.07058539167858668
Linear Regression RMSE: 621.515636199946


In [33]:
# LightGBM predictions for the engineered test frame (one value per test row).
preds = model.light_preds()
preds

(625134,)


array([ 731.19844802,  595.60212724,  545.68000925, ..., 1358.77518918,
       1780.41291636, 1003.33188942])

In [34]:
# Submission frame: test ids paired with the predicted trip durations.
sub = pd.DataFrame({'id': test_fe.data.id, 'trip_duration': preds})
sub.head()

Unnamed: 0,id,trip_duration
0,id3004672,731.198448
1,id3505355,595.602127
2,id1217141,545.680009
3,id2150126,999.437783
4,id1598245,469.48372


In [37]:
# Write the submission frame to sub.csv without the index column.
# NOTE(review): the recorded output of this cell is an ImportError raised from
# pandas internals, presumably an environment/version mismatch; confirm the
# file is actually written before relying on it.
sub.to_csv("sub.csv", index = False)

ImportError: cannot import name 'FilePathOrBuffer' from 'pandas._typing' (c:\Users\rhys-\anaconda3\lib\site-packages\pandas\_typing.py)

In [None]:
# Read the written submission back in as a sanity check on its contents.
sub_test = pd.read_csv('sub.csv')
sub_test.head(20)

Unnamed: 0,id,trip_duration
0,id3004672,731.198448
1,id3505355,595.602127
2,id1217141,545.680009
3,id2150126,999.437783
4,id1598245,469.48372
5,id0668992,786.484782
6,id1765014,1112.852261
7,id0898117,621.744437
8,id3905224,2251.0959
9,id1543102,566.961884
