In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw request log into `df` and preview the first five rows
df = pd.read_csv(
    "../input/uber-request-data/Uber Request Data.csv"
)
df.head()

In [None]:
# Inspect the dtype pandas inferred for each column of the loaded frame
df.dtypes

In [None]:
# Convert both timestamp columns from their raw form to pandas datetimes.
# NOTE(review): no `format`/`dayfirst` is passed, so pandas infers the layout
# on its own -- confirm the raw strings are unambiguous (day/month order).
for timestamp_col in ("Request timestamp", "Drop timestamp"):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

In [None]:
# Replace spaces in column names with underscores (so columns can be accessed
# as attributes), then discard the columns this analysis does not use.
df = (
    df.rename(columns=lambda name: name.replace(' ', '_'))
      .drop(columns=['Request_id', 'Driver_id', 'Drop_timestamp'])
)
df.head()

In [None]:
# List the distinct values present in the Status column
df['Status'].unique()

### Session Details
    Late Night: Midnight to 4AM
    Early Morning: 4AM-8AM
    Late Morning: 8AM-Noon
    Afternoon: Noon-4PM
    Evening: 4PM-8PM
    Night: 8PM-Midnight

In [None]:
# Bucket every request into one of six four-hour sessions based on the hour of
# the request timestamp, following the "Session Details" table:
#   Late Night 00-04, Early Morning 04-08, Late Morning 08-12,
#   Afternoon 12-16, Evening 16-20, Night 20-24.
Session_lebels = ['Late Night', 'Early Morning', 'Late Morning', 'Afternoon', 'Evening', 'Night']

# Bin edges in hours. With right=False each bin is [start, end), so a boundary
# hour opens its session: hour 4 (04:00-04:59) is "Early Morning". The default
# right-closed bins would push hours 4, 8, 12, 16 and 20 into the *previous*
# session, contradicting the table above.
session_edges = [0, 4, 8, 12, 16, 20, 24]

df = df.assign(
    session=pd.cut(
        df.Request_timestamp.dt.hour,
        bins=session_edges,
        right=False,
        labels=Session_lebels,
    )
)
df.head()

In [None]:
# Bar chart of request counts per session, split by trip Status, to show
# where in the day the problem statuses concentrate.
plt.style.use('ggplot')
colors = ["#CC2529", "#8E8D8D", "#008000"]  # one colour per Status column, in column order

# Rows: session, columns: Status, values: number of requests
status_counts = df.groupby(['session', 'Status'])['Status'].count().unstack()

ax = status_counts.plot.bar(legend=True, figsize=(15, 10), color=colors)
ax.set(
    title='Total Count of all Trip Status',
    xlabel='Sessions',
    ylabel='Total Count of Trip Status',
)
plt.show()

## Observations:

#### 1. The maximum number of "No Cars Available" requests occurs in the Evening session (4PM to 8PM), followed by the Night session (8PM to Midnight).
#### 2. The maximum number of "Cancelled" requests occurs in the Early Morning session (4AM to 8AM), followed by the Late Morning session (8AM to Noon).
#### 3. Let's drill down further into each category for a deeper analysis.

In [None]:
# Keep only the requests whose Status contains "Cancelled", renumbering the
# rows from zero. Missing statuses never match.
is_cancelled = df["Status"].str.contains('Cancelled', na=False)
df_tripscancelled = df.loc[is_cancelled].reset_index(drop=True)
df_tripscancelled.head()

In [None]:
# Bar chart of cancelled requests per session, split by pick-up point, to
# show where in the day cancellations concentrate.
plt.style.use('ggplot')
colors = ["#20B2AA", "#9400D3"]  # one colour per Pickup_point column, in column order

# Rows: session, columns: Pickup_point, values: number of cancelled requests
cancelled_counts = (
    df_tripscancelled.groupby(['session', 'Pickup_point'])['Pickup_point'].count().unstack()
)

ax = cancelled_counts.plot.bar(legend=True, figsize=(15, 10), color=colors)
ax.set(
    title='Count and Distribution of all "Cancelled" Trips over the day',
    xlabel='Sessions',
    ylabel='Total Count of "Cancelled" Trips',
)
plt.show()

##### Observation
###### Maximum Cancellations happen in "Early Morning" (4AM-8AM) session where the pick up is "City", followed by Late Morning session (8AM-Noon)
##### Inference:
###### Most number of cancellations happen in the Morning hours (4AM-8AM) in the City to Airport route

In [None]:
# Keep only the requests whose Status contains "No Car" (i.e. no car was
# available), renumbering the rows from zero. Missing statuses never match.
is_no_car = df["Status"].str.contains('No Car', na=False)
df_nocar = df.loc[is_no_car].reset_index(drop=True)
df_nocar.head()

In [None]:
# Bar chart of "No Cars Available" requests per session, split by pick-up
# point, to show where in the day the shortage of cars concentrates.
plt.style.use('ggplot')
colors = ["#20B2AA", "#9400D3"]  # one colour per Pickup_point column, in column order

# Rows: session, columns: Pickup_point, values: number of unserved requests
nocar_counts = df_nocar.groupby(['session', 'Pickup_point'])['Pickup_point'].count().unstack()

ax = nocar_counts.plot.bar(legend=True, figsize=(15, 10), color=colors)
# Labels use the status name as it is written elsewhere in this notebook
# ("No Cars Available"); the previous y label was misspelled ("Availble").
ax.set(
    title='Count and Distribution of all "No Cars Available" Trips over the day',
    xlabel='Sessions',
    ylabel='Total Count of "No Cars Available" Trips',
)
plt.show()

## Observation :
##### 1. Maximum "No Cars Available" trips happen in "Evening" (4PM-8PM) session where the pick up is "Airport", followed by Night session (8PM-Midnight)
## Inference
#### 1. Most "No Cars Available" requests happen in the Evening hours (4PM-8PM) on the Airport to City route

## Let's drill down further by Pick-Up Point

In [None]:
# Restrict to requests picked up in the City (the City to Airport route) and
# chart how many requests of each Status fall in each session.
from_city = df["Pickup_point"].str.contains('City', na=False)
df_citytoairport = df[from_city]

plt.style.use('ggplot')
colors = ["#CC2529", "#8E8D8D", "#008000"]  # one colour per Status column, in column order

# Rows: session, columns: Status, values: number of requests
city_status_counts = df_citytoairport.groupby(['session', 'Status'])['Status'].count().unstack()

ax = city_status_counts.plot.bar(legend=True, figsize=(15, 10), color=colors)
ax.set(
    title='Total count of all Trip Statuses over the day for City to Airport route',
    xlabel='Sessions',
    ylabel='Total Count of Trips',
)
plt.show()

###### We see that maximum trips are cancelled from City to airport in the Early Morning session followed by Late Morning Session.

In [None]:
# Tag each request: completed trips are labelled "supply", every other
# status (anything that is not "Trip Completed") is labelled "Demand".
trip_completed = df["Status"] == "Trip Completed"
df["Supply_demand"] = np.where(trip_completed, "supply", "Demand")
df.head()

In [None]:
# Supply vs. demand per session for requests picked up in the City
# (City to Airport route), drawn as one line per Supply_demand label.
picked_up_in_city = df["Pickup_point"].str.contains('City', na=False)
df_city_to_airport_supply_demand = df[picked_up_in_city]

plt.style.use('ggplot')

# Rows: session, columns: Supply_demand label, values: number of requests
city_supply_demand = (
    df_city_to_airport_supply_demand
    .groupby(['session', 'Supply_demand'])['Supply_demand']
    .count()
    .unstack()
)

ax = city_supply_demand.plot.line(legend=True, figsize=(15, 10))
ax.set(
    title='Supply-Demand curve for City to Airport Route',
    xlabel='Sessions',
    ylabel='Supply/Demand',
)
plt.show()

##### We observe that the Demand is very high in the morning from City to Airport route but the supply is very low

In [None]:
# Supply vs. demand per session for requests picked up at the Airport
# (Airport to City route), drawn as one line per Supply_demand label.
picked_up_at_airport = df["Pickup_point"].str.contains('Airport', na=False)
df_airport_to_city_supply_demand = df[picked_up_at_airport]

plt.style.use('ggplot')

# Rows: session, columns: Supply_demand label, values: number of requests
airport_supply_demand = (
    df_airport_to_city_supply_demand
    .groupby(['session', 'Supply_demand'])['Supply_demand']
    .count()
    .unstack()
)

ax = airport_supply_demand.plot.line(legend=True, figsize=(15, 10))
ax.set(
    title='Supply-Demand curve for Airport to City Route',
    xlabel='Sessions',
    ylabel='Supply/Demand',
)
plt.show()

**We observe that demand is very high in the evening on the Airport to City route, but supply is very low.**


**Correlating these two plots, a possible hypothesis could be:**


The demand during morning and afternoon hours from the Airport to the City is quite low, which means that a driver who completed a City to Airport trip in the morning may have to wait a very long time to get a ride back to the City, or may have to come back without a passenger. Neither situation is ideal for drivers, which might be the reason for the highest number of "Cancelled" trips on the City to Airport route in the morning hours.
The reverse happens in the evening, when many flights are arriving at the airport: demand for cabs is high and supply is insufficient, which is why the highest number of "No Cars Available" requests was observed in the evening on the Airport to City route.






**Possible Solution:**

1. Give incentives/surge pricing/bonuses for trips from City to Airport during Morning hours.
2. Give incentives/surge pricing/bonuses for trips from Airport to City during Evening hours.
3. Uber can cover gas expenses for drivers when they return from the Airport without a ride or go to the Airport for a pick-up without a ride.
4. Uber can increase market share through marketing campaigns and offers to customers when demand is low.