In [None]:
# Importing the required packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
import plotly as px

In [None]:
# Load the data: only the first million rows of the training file are read,
# with `pickup_datetime` parsed into datetimes; the test file is read in full
# and no date parsing is requested for it.
train = pd.read_csv(
    "train.csv",
    nrows=1_000_000,
    parse_dates=["pickup_datetime"],
)
test = pd.read_csv("test.csv")

In [None]:
# Report the (rows, columns) shape of the training sample, then of the test set.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Preview the first rows of the freshly loaded training sample.
train.head()

In [None]:
# Check which dtype pandas assigned to each column on load.
train.dtypes

In [None]:
# Summary statistics of the raw sample, before any rows are filtered out.
train.describe()

# Observations:
### Minimum Fare Amount is negative (not possible)
### Maximum Number of Passengers is 208 (not possible)
### Maximum Fare Amount is $500 (hard to believe)
### Minimum and maximum values of both latitudes and longitudes are unrealistic
####     Latitude range of New York: 40.49° N to 40.91° N
####    Longitude range of New York: 74.03° W to 73.77° W (i.e. -74.03 to -73.77)
### Fare range in New York: $3 – $200


In [None]:
# Dropping those entries whose fare amount falls outside the accepted fare range.
# Both bounds are checked in a single mask; rows with a missing fare compare
# False on both sides and are therefore kept.
# NOTE(review): the lower cut-off applied here is 3.5, while the observations
# cell above quotes a minimum of 3$ — confirm which value is intended.
fare = train['fare_amount']
fare_out_of_range = (fare > 200) | (fare < 3.5)
train = train.drop(index=train.index[fare_out_of_range.to_numpy()])

In [None]:
# Dropping those entries whose pickup or dropoff coordinates fall outside
# New York's range of latitudes and longitudes.
# A single boolean mask collects every row with at least one coordinate outside
# the box; NaN coordinates compare False on both sides, so they are not removed here.
LON_MIN, LON_MAX = -74.03, -73.77
LAT_MIN, LAT_MAX = 40.49, 40.91

bounds_by_column = {
    'dropoff_longitude': (LON_MIN, LON_MAX),
    'pickup_longitude': (LON_MIN, LON_MAX),
    'dropoff_latitude': (LAT_MIN, LAT_MAX),
    'pickup_latitude': (LAT_MIN, LAT_MAX),
}

outside_box = np.zeros(len(train), dtype=bool)
for column, (lower, upper) in bounds_by_column.items():
    outside_box |= ((train[column] > upper) | (train[column] < lower)).to_numpy()

train = train.drop(index=train.index[outside_box])

In [None]:
# Dropping entries with Passenger Count < 0 and > 6.
# Rows with exactly 0 passengers satisfy neither condition and are kept.
passengers = train['passenger_count']
invalid_passengers = (passengers < 0) | (passengers > 6)
train = train.drop(index=train.index[invalid_passengers.to_numpy()])

In [None]:
# Dropping entries with NaNs in Location
#
# Missing values must be detected with `dropna`/`isna`: an equality test such as
# `column == np.NaN` is False for every row (NaN never compares equal to
# anything, itself included), so it removes nothing, and the `np.NaN` alias no
# longer exists in NumPy 2.0.
location_columns = [
    'dropoff_latitude',
    'dropoff_longitude',
    'pickup_longitude',
    'pickup_latitude',
]
train = train.dropna(subset=location_columns)


In [None]:
# Re-check the summary statistics after the filtering cells above.
train.describe()

In [None]:
# Rows and columns remaining after the filtering cells above.
train.shape

In [None]:
# Grouping the Pickup & Dropoff Coordinates
#
# Each new column holds one (latitude, longitude) tuple per ride. The tuples are
# built by zipping the two source columns instead of a row-wise
# `apply(tuple, axis=1)`, which calls back into Python once per row and is very
# slow on a sample of this size.
train['pickup_coords'] = list(zip(train['pickup_latitude'], train['pickup_longitude']))
train['dropoff_coords'] = list(zip(train['dropoff_latitude'], train['dropoff_longitude']))


In [None]:
# Preview the frame with the new `pickup_coords` / `dropoff_coords` columns.
train.head()

In [None]:
# Calculating the Distance using Haversine Formula in Kilometers
import haversine as hs
from haversine import Unit


def _ride_distance_km(ride):
    """Return the haversine distance, in km, between one ride's pickup and dropoff tuples."""
    return hs.haversine(ride.pickup_coords, ride.dropoff_coords, unit=Unit.KILOMETERS)


# Evaluated once per row; the coordinate tuples come from the grouping cell above.
train['haversine distance'] = train.apply(_ride_distance_km, axis=1)

In [None]:
# Preview the frame with the new `haversine distance` column.
train.head()

In [None]:
# Break the pickup timestamp into calendar and clock features.
# `year` is stored as an offset from 2000 (so 2012 becomes 12).
pickup = train['pickup_datetime'].dt
time_parts = {
    "year": pickup.year - 2000,
    "month": pickup.month,
    "week": pickup.isocalendar().week,
    "day of year": pickup.dayofyear,
    "day of month": pickup.day,
    "day of week": pickup.weekday,
    "hour": pickup.hour,
    "minute": pickup.minute,
    "second": pickup.second,
}
for feature_name, feature_values in time_parts.items():
    train[feature_name] = feature_values

In [None]:
# Preview the frame with the date and time feature columns added above.
train.head()

In [None]:
# Fare divided by straight-line distance for every ride.
# NOTE(review): this runs before the rides with identical pickup and dropoff
# points are dropped, so a zero distance here gives a non-finite value rather
# than an error — confirm none survive the later drop.
distance_km = train['haversine distance']
train['Fare per Kilometer'] = train['fare_amount'].div(distance_km)

In [None]:
# Preview the frame with the new `Fare per Kilometer` column.
train.head()

In [None]:
# Dropping those entries which have the same Pickup and Dropoff Coordinates
# (the two tuple columns are compared element by element).
same_point = train['pickup_coords'] == train['dropoff_coords']
train = train.drop(index=train.index[same_point.to_numpy()])

In [None]:
# Rows and columns left after removing rides that start and end at the same point.
train.shape

In [None]:
# Scatter of every pickup location, clipped to the New York bounding box.
# The style sheet is selected BEFORE the figure is created: `plt.style.use`
# only changes the defaults for artists created afterwards, so calling it as the
# last statement of the cell leaves this plot in the default theme.
plt.style.use('dark_background')
fig, ax = plt.subplots()
train.plot(x='pickup_longitude', y='pickup_latitude', kind='scatter',
           alpha=0.7, s=0.01, c='green', ax=ax)
ax.set_ylim(40.49, 40.91)
ax.set_xlim(-74.03, -73.77)
ax.set_title("PICKUP PLOT")
plt.show()

In [None]:
# Scatter of every dropoff location, clipped to the New York bounding box.
# The style sheet is selected before the figure is created so the dark theme
# applies to this plot even when no earlier cell has already set it.
plt.style.use('dark_background')
fig, ax = plt.subplots()
train.plot(x='dropoff_longitude', y='dropoff_latitude', kind='scatter',
           alpha=0.7, s=0.01, c='blue', ax=ax)
ax.set_ylim(40.49, 40.91)
ax.set_xlim(-74.03, -73.77)
ax.set_title("DROPOFF PLOT")
plt.show()

# Center Coordinates of NYC: (40.730,-73.935)

In [None]:
import folium

# Interactive map centred on New York City showing both ends of every long ride
# (haversine distance of at least 10): dropoffs in blue, pickups in green.
# NOTE(review): `fill_opacity` is passed without `fill=True`; depending on the
# folium version the markers may be drawn unfilled — confirm.
drop_map = folium.Map(location=[40.730, -73.935])
long_trips = train[train['haversine distance'] >= 10]
print(long_trips.shape)

ride_endpoints = zip(
    long_trips['dropoff_latitude'],
    long_trips['dropoff_longitude'],
    long_trips['pickup_latitude'],
    long_trips['pickup_longitude'],
)
for drop_lat, drop_lon, pick_lat, pick_lon in ride_endpoints:
    # Dropoff marker first, then pickup marker, for each ride.
    for location, colour in (([drop_lat, drop_lon], 'blue'), ([pick_lat, pick_lon], 'green')):
        folium.CircleMarker(location, radius=3, color=colour, fill_opacity=0.9).add_to(drop_map)
drop_map

In [None]:
import plotly.express as px

# Distribution of fares, with vertical markers at zero, the mean and the median.
fares = train["fare_amount"]
mean_fare = fares.mean()
median_fare = fares.median()

fig = px.histogram(train, x="fare_amount", nbins=0, histfunc="count")
fig.update_xaxes(range=[0, 100])

fig.add_vline(x=0, line={'color': 'white'})
fig.add_vline(
    x=mean_fare,
    line={'color': 'white'},
    annotation_text='Mean fare',
    annotation_position="top right",
)
fig.add_vline(
    x=median_fare,
    line={'color': 'green'},
    annotation_text='Median Fare',
    annotation_position="top left",
)

# Dark theme and labels for the figure.
fare_layout = {
    'paper_bgcolor': "black",
    'plot_bgcolor': "black",
    'title': "FARE AMOUNT DISTRIBUTION",
    'xaxis_title': "Fare Amount",
    'yaxis_title': "Count",
    'font_family': "Arial",
    'font_size': 15,
    'font_color': 'White',
}
fig.update_layout(**fare_layout)

fig.show()

In [None]:
# Distribution of ride distances, with vertical markers at zero, the mean and
# the median. The statistics get their own names so they no longer overwrite
# the `mean_fare` / `median_fare` values computed for the fare histogram.
mean_distance = train["haversine distance"].mean()
median_distance = train["haversine distance"].median()

fig = px.histogram(train, x="haversine distance", nbins=0, histfunc="count")

fig.update_xaxes(range=[0, 30])
fig.add_vline(x=0, line=dict(color='white'))
fig.add_vline(x=mean_distance, line=dict(color='white'),
              annotation_text='Mean H Dist', annotation_position="top right")
fig.add_vline(x=median_distance, line=dict(color='green'),
              annotation_text='Median H Dist', annotation_position="top left")

# Dark theme, fixed canvas size and labels for the figure.
fig.update_layout(
    paper_bgcolor="black",
    width=2100,
    height=1000,
    title="HAVERSINE DISTANCE AMOUNT DISTRIBUTION",
    xaxis_title="Haversine Distance",
    yaxis_title="Count",
    font_family="Arial",
    font_size=15,
    font_color='White',
    plot_bgcolor="black",
)

fig.show()

In [None]:
# How often each passenger count occurs, drawn as white-outlined bars.
fig = px.histogram(train, x="passenger_count", nbins=7, histfunc="count")
fig.update_traces(marker_line_color='white', marker_line_width=2)
fig.add_vline(x=-1, line={'color': 'white'})

# Dark theme, fixed canvas size and labels for the figure.
passenger_layout = {
    'paper_bgcolor': "black",
    'plot_bgcolor': "black",
    'width': 1400,
    'height': 1000,
    'title': "Passenger Count Frequency",
    'xaxis_title': "Passenger Count",
    'yaxis_title': "Frequency",
    'font_family': "Arial",
    'font_size': 15,
    'font_color': 'White',
}
fig.update_layout(**passenger_layout)
fig.show()

In [None]:
# One point per ride: fare against the number of passengers.
fig = px.scatter(train, x='passenger_count', y='fare_amount')
fig.update_traces(marker={'size': 10})

# Dark theme, fixed canvas size and labels for the figure.
scatter_layout = {
    'paper_bgcolor': "black",
    'plot_bgcolor': "black",
    'width': 2100,
    'height': 1000,
    'title': "FARE WITH NUMBER OF PASSENGERS IN THE CAB",
    'xaxis_title': "Number of Passengers",
    'yaxis_title': "Fare",
    'font_family': "Arial",
    'font_size': 15,
    'font_color': 'White',
}
fig.update_layout(**scatter_layout)
fig.show()

In [None]:
# Remove the `key` column. `errors='ignore'` keeps the cell re-runnable: without
# it, executing the cell a second time raises KeyError because `key` is already gone.
train = train.drop(columns='key', errors='ignore')
train.head()

In [None]:
# Column dtypes after the engineered features were added and `key` was removed.
train.dtypes

In [None]:
# Copy of the frame without the timestamp and the two coordinate-tuple columns,
# used as input for the correlation heatmap.
non_feature_columns = ['pickup_datetime', 'pickup_coords', 'dropoff_coords']
new_train = train.drop(columns=non_feature_columns)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `new_train`.
# (The unused `colormap` variable was removed; the plot has always used 'Blues'.)
# NOTE(review): vmin=0 clamps the colour scale at zero, so every negative
# correlation is drawn in the same colour as 0 — confirm this is intended.
plt.figure(figsize=(25, 20))
heatmap = sns.heatmap(
    new_train.corr(),
    linewidths=0.1,
    vmax=1.0,
    vmin=0,
    square=True,
    cmap='Blues',
    linecolor='black',
    annot=True,
)

heatmap.set_title('CORRELATION HEATMAP', fontdict={'fontsize': 30}, pad=12, color='white')
plt.show()


In [None]:

# One point per ride: fare against the pickup year (stored as an offset from 2000).
fig = px.scatter(train, x='year', y='fare_amount')
fig.update_traces(marker={'size': 10})

# Dark theme, fixed canvas size and labels for the figure.
year_layout = {
    'paper_bgcolor': "black",
    'plot_bgcolor': "black",
    'width': 1400,
    'height': 1000,
    'title': "FARE OVER THE YEARS",
    'xaxis_title': "Year in 2000s",
    'yaxis_title': "Fare in $",
    'font_family': "Arial",
    'font_size': 20,
    'font_color': 'White',
}
fig.update_layout(**year_layout)
fig.show()

In [None]:
# One point per ride: fare against the pickup month.
fig = px.scatter(train, x='month', y='fare_amount')
fig.update_traces(marker={'size': 10})

# Dark theme, fixed canvas size and labels for the figure.
month_layout = {
    'paper_bgcolor': "black",
    'plot_bgcolor': "black",
    'width': 1400,
    'height': 1000,
    'title': "FARE OVER THE MONTHS",
    'xaxis_title': "Months of the Year",
    'yaxis_title': "Fare in $",
    'font_family': "Arial",
    'font_size': 15,
    'font_color': 'White',
}
fig.update_layout(**month_layout)
fig.show()

In [None]:
# One point per ride: fare against the day of the month of the pickup.
fig = px.scatter(train, x='day of month', y='fare_amount')
fig.update_traces(marker={'size': 10})

# Dark theme, fixed canvas size and labels for the figure.
day_layout = {
    'paper_bgcolor': "black",
    'plot_bgcolor': "black",
    'width': 1400,
    'height': 1000,
    'title': "FARE OVER THE DAYS OF MONTH",
    'xaxis_title': "Day of the Month",
    'yaxis_title': "Fare in $",
    'font_family': "Arial",
    'font_size': 15,
    'font_color': 'White',
}
fig.update_layout(**day_layout)
fig.show()

In [None]:
# Number of rides starting in each hour of the day, as white-outlined bars.
fig = px.histogram(train, x="hour", nbins=24, histfunc="count")
fig.add_vline(x=-1, line={'color': 'white'})
fig.update_traces(marker_line_color='white', marker_line_width=2)

# Dark theme, fixed canvas size and labels for the figure.
hour_hist_layout = {
    'paper_bgcolor': "black",
    'plot_bgcolor': "black",
    'width': 1400,
    'height': 1000,
    'title': "Number of Rides over the Hours of the Day",
    'xaxis_title': "Hour",
    'yaxis_title': "Frequency",
    'font_family': "Arial",
    'font_size': 15,
    'font_color': 'White',
}
fig.update_layout(**hour_hist_layout)
fig.show()

In [None]:
# One point per ride: fare against the hour of the day of the pickup.
fig = px.scatter(train, x='hour', y='fare_amount')
fig.update_traces(marker={'size': 10})

# Dark theme, fixed canvas size and labels for the figure.
hour_layout = {
    'paper_bgcolor': "black",
    'plot_bgcolor': "black",
    'width': 1400,
    'height': 1000,
    'title': "FARE OVER THE HOURS OF THE DAY",
    'xaxis_title': "Hour of the Day",
    'yaxis_title': "Fare in $",
    'font_family': "Arial",
    'font_size': 15,
    'font_color': 'White',
}
fig.update_layout(**hour_layout)
fig.show()

In [None]:
def time_slicer(df, timeframes, value, color="purple"):
    """Plot the mean of ``value`` grouped by each time feature, one panel per feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``value`` and every column named in ``timeframes``.
    timeframes : sequence of str
        Column names to group by; one subplot is stacked vertically per entry.
    value : str
        Name of the column whose per-group mean is drawn.
    color : str, default "purple"
        Line colour passed to the pandas plot call.

    Raises
    ------
    ValueError
        If ``timeframes`` is empty.
    """
    if len(timeframes) == 0:
        raise ValueError("timeframes must contain at least one column name")

    # Select the style before the figure exists so that it applies to this figure.
    plt.style.use('dark_background')
    # squeeze=False always yields a 2-D array of axes; without it a single
    # timeframe returns a bare Axes object and indexing it fails.
    _, axes = plt.subplots(len(timeframes), figsize=[12, 12], squeeze=False)
    for panel, timeframe in zip(axes[:, 0], timeframes):
        df.loc[:, [timeframe, value]].groupby([timeframe]).mean().plot(ax=panel, color=color)
        panel.set_ylabel(value.title(), color='white')
        panel.set_title("{} Variation by {}".format(value.title(), timeframe.title()), color='white')
        panel.set_xlabel("")
        panel.tick_params(colors='white')
        panel.grid(color='white', linestyle='--', linewidth=0.5, which='both', alpha=0.7)
    plt.tight_layout(pad=0)

In [None]:
# Confirm the column dtypes before plotting the per-timeframe averages.
train.dtypes

In [None]:
# Average fare for each value of every time feature, drawn in blue.
time_features = ['hour', 'day of week', 'day of month', 'week', 'month', 'year']
time_slicer(
    df=train,
    timeframes=time_features,
    value="fare_amount",
    color="blue",
)

In [None]:
# Average haversine distance for each value of every time feature, drawn in red.
time_features = ['hour', 'day of week', 'day of month', 'week', 'month', 'year']
time_slicer(
    df=train,
    timeframes=time_features,
    value="haversine distance",
    color="red",
)

In [None]:
# Average fare per kilometre for each value of every time feature, drawn in green.
time_features = ['hour', 'day of week', 'day of month', 'week', 'month', 'year']
time_slicer(
    df=train,
    timeframes=time_features,
    value="Fare per Kilometer",
    color="green",
)

In [None]:
# Fare against distance, side by side: every ride on the left, and on the right
# only rides with distance below 30 and fare below 90.
fig, subplt = plt.subplots(1, 2, figsize=(18, 9))

idx = (train['haversine distance'] < 30) & (train['fare_amount'] < 90)

panels = (
    (subplt[0], train, 'ALL DATA'),
    (subplt[1], train[idx], 'ZOOMED GRAPH'),
)
for axis, rides, heading in panels:
    axis.scatter(rides['haversine distance'], rides['fare_amount'], alpha=0.2, color='blue')
    axis.set_xlabel('H Distance in Kms')
    axis.set_ylabel('Fare in $USD')
    axis.set_title(heading)
    axis.grid(color='white', linestyle='--', linewidth=0.5, which='both', alpha=0.7)


In [None]:
# Column dtypes of the frame used for the remaining plots.
train.dtypes

In [None]:
# Average fare for each passenger count, as a bar chart.
fig, ax = plt.subplots(figsize=(18, 9))
# Only the fare is plotted, so only that column is averaged; averaging every
# non-object column (the datetime column included) just to read one of them is
# wasted work. The double brackets keep `passenger_fare` a DataFrame.
passenger_fare = train.groupby('passenger_count')[['fare_amount']].mean()
# Draw on the axes created above rather than relying on the implicit current axes.
sns.barplot(x=passenger_fare.index, y=passenger_fare['fare_amount'], palette="Set3", ax=ax)
ax.set_xlabel('Number of Passengers')
ax.set_ylabel('Average Fare Price')
ax.set_title('AVERAGE FARE PRICE VERSUS NUMBER OF PASSENGERS')
ax.grid(color='white', linestyle='--', linewidth=0.5, alpha=0.7)
plt.show()

In [None]:
# Final preview of the cleaned and feature-engineered training frame.
train.head()