## Import Modules

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [10]:
import os

# Show the kernel's working directory; the dataset paths used further down
# are relative to it.
current_dir = os.getcwd()
print(current_dir)

c:\Users\ritik\Desktop\DS\fare


## Import Datasets

In [18]:
# Read the ride and weather CSV files (paths are relative to the working
# directory) into rides_df and weather_df respectively.
rides_df, weather_df = (
    pd.read_csv(csv_path)
    for csv_path in ("Datatsets/cab_rides.csv", "Datatsets/weather.csv")
)

## Exploratory Analysis

### Ride Data

In [27]:
# Overview of the ride data: dimensions, per-column dtypes / non-null counts,
# and the number of missing values in each column.
print(rides_df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
rides_df.info()
print(rides_df.isna().sum())

(693071, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693071 entries, 0 to 693070
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   distance          693071 non-null  float64
 1   cab_type          693071 non-null  object 
 2   time_stamp        693071 non-null  int64  
 3   destination       693071 non-null  object 
 4   source            693071 non-null  object 
 5   price             637976 non-null  float64
 6   surge_multiplier  693071 non-null  float64
 7   id                693071 non-null  object 
 8   product_id        693071 non-null  object 
 9   name              693071 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 52.9+ MB
None
distance                0
cab_type                0
time_stamp              0
destination             0
source                  0
price               55095
surge_multiplier        0
id                      0
product_id              0
name                    0
dtype: int64

### Weather Data

In [28]:
# Overview of the weather data: dimensions, per-column dtypes / non-null
# counts, and the number of missing values in each column.
print(weather_df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
weather_df.info()
print(weather_df.isna().sum())

(6276, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6276 entries, 0 to 6275
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   temp        6276 non-null   float64
 1   location    6276 non-null   object 
 2   clouds      6276 non-null   float64
 3   pressure    6276 non-null   float64
 4   rain        894 non-null    float64
 5   time_stamp  6276 non-null   int64  
 6   humidity    6276 non-null   float64
 7   wind        6276 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 392.4+ KB
None
temp             0
location         0
clouds           0
pressure         0
rain          5382
time_stamp       0
humidity         0
wind             0
dtype: int64


## Cleaning

### Cleaning Ride Data

In [58]:
# Discard every row that contains a missing value (in the summary above only
# `price` has any), then renumber the rows from 0.
rides_df = (
    rides_df
    .dropna(axis="index")
    .reset_index(drop=True)
)
rides_df.shape

(637976, 10)

### Cleaning Weather Data

In [30]:
# Replace every missing value with 0 (in the summary above only `rain` has
# any). Presumably a missing rain reading means "no rain" - confirm.
weather_df = weather_df.fillna(value=0)

### Creating a new weather DataFrame by averaging the data per location

In [None]:
# One row per location holding the mean of each weather measurement.
# `time_stamp` is removed up front because its average is not used.
avg_weather_df = (
    weather_df
    .drop(columns='time_stamp')
    .groupby('location', as_index=False)
    .mean()
)

### Creating dataframe for weather at source & destination and merging with ride data

In [54]:
# Build two copies of the per-location averages, one keyed for joining on the
# ride's `source` and one for its `destination`. In each copy `location` is
# renamed to the join key and every measurement gets a `<role>_` prefix
# (e.g. `temp` -> `source_temp` / `destination_temp`).
src_avg_weather, dest_avg_weather = (
    avg_weather_df.rename(columns={
        'location': role,
        **{
            measure: f'{role}_{measure}'
            for measure in ('temp', 'clouds', 'pressure', 'humidity', 'rain', 'wind')
        },
    })
    for role in ('source', 'destination')
)

In [57]:
# Attach the averaged weather of the pickup location, then of the drop-off
# location, to every ride. Inner joins keep only rides whose location is
# present in the weather table.
merged_rides_df = (
    rides_df
    .merge(src_avg_weather, on='source', how='inner')
    .merge(dest_avg_weather, on='destination', how='inner')
)
print(merged_rides_df.shape)

(637976, 22)


## Preprocessing

In [87]:
def onehot_encode(df, column, prefix):
    """Return a new frame with `column` replaced by indicator columns.

    The indicators are named `<prefix>_<value>` and appended after the
    remaining columns. The first category level is dropped
    (`drop_first=True`), so k distinct values produce k-1 indicator columns.
    The input frame is not modified.
    """
    indicator_cols = pd.get_dummies(df[column], prefix=prefix, drop_first=True)
    return pd.concat([df, indicator_cols], axis=1).drop(column, axis=1)


In [99]:
def preprocess_data(df, test_size=0.3, random_state=1):
    """Turn the merged ride/weather frame into scaled train/test splits.

    Steps, in order:
      1. drop the `id`, `product_id` and `time_stamp` columns;
      2. encode `cab_type` as 0 (Lyft) / 1 (Uber);
      3. one-hot encode `destination`, `source` and `name`;
      4. split into features and the `price` target, then into train/test;
      5. standardise the features with a scaler fitted on the training rows
         only, so no test-set statistics leak into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above plus `price`. The input is not
        modified.
    test_size : float, default 0.3
        Fraction of rows held out for testing (passed to train_test_split).
    random_state : int, default 1
        Seed for the shuffled split (passed to train_test_split).

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Scaled features. They keep the row index of the split they came
        from, so they stay label-aligned with `y_train` / `y_test`.
    y_train, y_test : pandas.Series
        The `price` target for the corresponding rows.
    """
    df = df.copy()

    # Drop unneeded columns
    df = df.drop(["id", "product_id", "time_stamp"], axis=1)

    # Binary encoding; any cab_type other than "Lyft"/"Uber" becomes NaN
    df["cab_type"] = df["cab_type"].map({"Lyft": 0, "Uber": 1})

    # One-hot encoding of the categorical columns
    for column, prefix in [('destination', 'dest'), ('source', 'src'), ('name', 'nm')]:
        df = onehot_encode(df, column, prefix)

    # Train/test split
    y = df["price"]
    x = df.drop(columns=["price"])
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, shuffle=True
    )

    # Scale x. The scaler returns bare arrays, so the frames are rebuilt with
    # the original column names AND the original index; without the index
    # the features would get a fresh 0..n-1 index while the targets keep the
    # shuffled one, and the two would no longer match by label.
    scaler = StandardScaler()
    x_train = pd.DataFrame(
        scaler.fit_transform(x_train), columns=x_train.columns, index=x_train.index
    )
    x_test = pd.DataFrame(
        scaler.transform(x_test), columns=x_test.columns, index=x_test.index
    )

    return x_train, x_test, y_train, y_test



In [101]:
# Build the scaled train/test feature frames and price targets from the
# merged ride + weather data.
x_train, x_test, y_train, y_test = preprocess_data(df=merged_rides_df)


## Training

In [None]:
# Fit an ordinary least-squares model on the training split (fit() returns
# the estimator itself) and report R^2 on the held-out test split.
model = LinearRegression().fit(x_train, y_train)
print(
    "Model Test R-Square Score: {:.5f}".format(
        model.score(x_test, y_test)
    )
)

Model Test R-Square Score: 0.92854
