# Uber New York Data Analysis

## Importing Libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os   

## Collecting Data for Analysis

In [None]:
# Pick the raw trip files by name instead of by position: os.listdir() returns
# entries in arbitrary order, so slicing the listing with [-8:-1] is fragile.
data_dir = r'C:\Users\vikra\Desktop\Data Science and Machine Learning\Uber New York Data Analysis'
files = sorted(
    name for name in os.listdir(data_dir)
    if name.startswith('uber-raw-data-') and name.endswith('.csv')
)
files

In [None]:
# Leave out the Jan-June 2015 file; it is read on its own later in the notebook.
# Filtering (rather than list.remove) keeps this cell safe to re-run:
# remove() raises ValueError once the name is no longer in the list.
files = [name for name in files if name != 'uber-raw-data-janjune-15.csv']
files

In [None]:
path=r'C:\Users\vikra\Desktop\Data Science and Machine Learning\Uber New York Data Analysis'

# Read every selected CSV first, then concatenate once. Calling pd.concat
# inside the loop re-copies all previously loaded rows on each iteration.
frames = [
    pd.read_csv(os.path.join(path, file), encoding='utf-8')
    for file in files
]

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating every file's own row numbers. Rows now follow the order of `files`.
# An empty `files` list still yields an empty DataFrame, as before.
final = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
final.head()

In [None]:
final.shape

## Data Preparation for Analysis

In [None]:
# Work on a copy so that `final` keeps the concatenated data as loaded.
df = final.copy()

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
# Parse the 'Date/Time' column (month/day/year hour:minute:second) into datetimes
raw_stamps = df['Date/Time']
df['Date/Time'] = pd.to_datetime(raw_stamps, format="%m/%d/%Y %H:%M:%S")

In [None]:
df.dtypes

In [None]:
# Derive the calendar fields used by the later plots from the timestamp:
# weekday name first, then day, minute, month and hour (same column order as before).
stamps = df['Date/Time'].dt
df['weekday'] = stamps.day_name()
for field in ('day', 'minute', 'month', 'hour'):
    df[field] = getattr(stamps, field)

In [None]:
df.head()

In [None]:
df.dtypes

## Analyzing trips by week-day

In [None]:
df['weekday'].unique()

In [None]:
import plotly.express as px

In [None]:
# Rides per weekday, counted once and reused for both axes
weekday_counts = df['weekday'].value_counts()
px.bar(
    x=weekday_counts.index,
    y=weekday_counts.values,
    title ='Num of rides per each weekday',
)

We can see that the highest number of trips happened on Thursday and the lowest on Sunday.

## Analyzing trips by hour

In [None]:
# One bin per hour (0-23), centred on the integer values. The default of 10
# bins merges 2 or 3 hours into each bar, which distorts the hourly profile.
plt.hist(df['hour'], bins=24, range=(-0.5, 23.5), rwidth=0.8)

The number of trips peaks in the evening, mostly when people finish work and go home.

In [None]:
# List the months present in the data, in order of first appearance
for month in df['month'].unique():
    print(month)

In [None]:
# Hourly ride histogram for each month, laid out on a two-column grid.
months = df['month'].unique()
# Enough rows for every month (ceil division); a fixed 3x2 grid would fail
# as soon as the data covered more than six months.
n_rows = -(-len(months) // 2)
plt.figure(figsize=(70,40))
for i, month in enumerate(months, start=1):
    plt.subplot(n_rows, 2, i)
    # One bin per hour so every bar covers exactly one hour.
    df[df['month']==month]['hour'].hist(bins=24, range=(-0.5, 23.5))
    plt.title('month {}'.format(month))

## Analyzing Monthly rides

In [None]:
# One separate figure per month showing that month's rides by hour.
for i in df['month'].unique():
    plt.figure(figsize=(10,5))
    # One bin per hour; the default 10 bins would merge 2 or 3 hours per bar.
    df[df['month']==i]['hour'].hist(bins=24, range=(-0.5, 23.5))
    # Label the figure so the months can be told apart.
    plt.title('month {}'.format(i))

### Analysis of which month has maximum rides

In [None]:
df.head()

In [None]:
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
# Number of rides per month. Each row of df is one ride, so the group size is
# the ride count. The earlier version summed the 'hour' values, which measures
# the total of pickup hours rather than how many rides took place.
rides_per_month = df.groupby('month').size()
trace1 = go.Bar(
        x = rides_per_month.index,
        y = rides_per_month,
        name= 'Priority')
iplot([trace1])

From this we can say that September (month 9) has the maximum number of rides.

### Analysis of Journey for each day

In [None]:
plt.figure(figsize=(15,8))
# One bin per calendar day, 1 through 31. The previous range stopped at 30.5,
# which silently dropped every ride that happened on the 31st.
plt.hist(df['day'], bins=31, rwidth=.8, range=(0.5, 31.5))
plt.xlabel('date of the month')
plt.ylabel('Total Journeys')
plt.title('Journeys by Month Day')

In [None]:
# Distribution of rides over the day of the month.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot - confirm against the installed seaborn version.
sns.distplot(df['day'])

### Analyzing Demand for Ubers

In [None]:
# Rides per day of the month, one subplot per month on a two-column grid.
months = df['month'].unique()
# Enough rows for every month (ceil division) instead of a fixed 3x2 grid.
n_rows = -(-len(months) // 2)
plt.figure(figsize=(30,15))
for i,month in enumerate(months,1):
    plt.subplot(n_rows,2,i)
    df_out=df[df['month']==month]
    # One bin per calendar day. With the default 10 bins the last bar of a
    # 31-day month covers four days while the others cover three.
    plt.hist(df_out['day'], bins=31, range=(0.5, 31.5))
    # The label previously called .format() on a string with no placeholder,
    # so the month never appeared in it.
    plt.xlabel('days in month {}'.format(month))
    plt.ylabel('total rides')

From the graphs above, we can tell that demand is generally higher in the last days of every month

### Analyzing Rush by hour

In [None]:
# Point plot of the 'Lat' column against the pickup hour on a white grid
sns.set_style('whitegrid')
sns.pointplot(data=df, x="hour", y="Lat")

In [None]:
# Point plot of latitude against pickup hour, one line per weekday.
plt.figure(figsize=(10,6))
ax=sns.pointplot(x="hour",y="Lat", hue="weekday",data=df)
# Title spelling corrected (was 'hoursoffday vs latiitide of passenger').
ax.set_title('Hour of day vs latitude of passenger')

### Analyzing base number popularity by month

In [None]:
df.head()

In [None]:
df['Base'].head()

In [None]:
df.groupby(['Base','month'])['Date/Time'].count()

In [None]:
# Ride count per (Base, month); the count ends up in the 'Date/Time' column
rides_by_base_month = df.groupby(['Base','month'])['Date/Time'].count()
base = rides_by_base_month.reset_index()
base

In [None]:
# Monthly ride counts, one line per base
plt.figure(figsize=(10,6))
sns.lineplot(data=base, x='month', y='Date/Time', hue='Base')

### Performing cross analysis

#### Heatmap by Hour and Weekday.

In [None]:
def count_rows(rows):
    """Return the number of items in ``rows`` (its ``len``)."""
    n_rows = len(rows)
    return n_rows

In [None]:
# Rows per (weekday, hour). size() is the built-in group row count and gives
# the same Series as apply(count_rows) without a Python call per group.
by_cross = df.groupby(['weekday','hour']).size()
by_cross

In [None]:
# Move the innermost index level (hour) into columns: rows are weekdays
pivot = by_cross.unstack(level=-1)
pivot

In [None]:
# Heatmap of the weekday x hour counts, without cell annotations
plt.figure(figsize=(12,6))
sns.heatmap(data=pivot, annot=False)

In [None]:
df.head()

In [None]:
def heatmap(col1,col2):
    """Draw a heatmap of row counts in the notebook-level ``df``.

    Rows of ``df`` are counted for every (col1, col2) combination; col1 values
    become the heatmap rows and col2 values the heatmap columns.

    Parameters:
        col1: name of the df column to place on the heatmap rows.
        col2: name of the df column to place on the heatmap columns.

    Returns:
        Whatever ``sns.heatmap`` returns for the pivoted counts.
    """
    # size() counts the rows of each group directly; it replaces
    # apply(lambda x: len(x)), which called Python once per group.
    by_cross = df.groupby([col1,col2]).size()
    pivot=by_cross.unstack()
    plt.figure(figsize=(12,8))
    return sns.heatmap(pivot,annot=False)

#### Heatmap by Hour and Day.

In [None]:
heatmap('day','hour')

#### Heatmap by Month and Day.

In [None]:
heatmap('day','month')

#### Heatmap by Month and weekDay.

In [None]:
heatmap('weekday','month')

### Analysis of Location Data Points

In [None]:
df.head()

In [None]:
# Scatter of pickup coordinates as small red '+' markers
plt.figure(figsize=(10,6))

plt.plot(df['Lon'], df['Lat'], 'r+', markersize=0.5)
# Restrict the view to a fixed longitude/latitude window
plt.xlim(-74.2, -73.7)
plt.ylim(40.6, 41)

### Performing Spatial Analysis to get clear idea on rush

In [None]:
df.head()

In [None]:
# Keep only the rides that happened on a Sunday
is_sunday = df['weekday']=='Sunday'
df1 = df[is_sunday]
df1.head()

In [None]:
# Sunday ride count per (Lat, Lon) pair; the count is stored in 'weekday'
sunday_counts = df1.groupby(['Lat','Lon'])['weekday'].count()
rush = sunday_counts.reset_index()
rush

In [None]:
import folium
from folium.plugins import HeatMap  # was imported twice; once is enough

# Empty map with folium's default settings; heat layers are added to it below.
basemap=folium.Map()

In [None]:
# Add a heat layer built from `rush` (Lat, Lon, Sunday ride count) to the
# shared `basemap`, then display the map.
# NOTE(review): `zoom` does not look like a documented HeatMap argument -
# confirm it has any effect with the installed folium version.
HeatMap(rush,zoom=20,radius=15).add_to(basemap)
basemap

In [None]:
def plot(df,day):
    """Return a folium map with a heat layer of the rides on one weekday.

    Parameters:
        df: DataFrame with 'weekday', 'Lat' and 'Lon' columns.
        day: weekday name to select, compared against df['weekday'].

    Returns:
        A new folium.Map holding a single HeatMap layer for that day.

    Note: this name shadows ``plot`` imported from plotly.offline earlier in
    the notebook.
    """
    df_out=df[df['weekday']==day]
    # Ride count per (Lat, Lon) pair. It is computed once; the earlier version
    # ran the same groupby twice and threw the first result away.
    counts=df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()
    # Draw on a fresh map. Adding to the shared `basemap` stacked each new
    # layer on top of all earlier ones (including the Sunday layer), so the
    # result showed several days mixed together.
    day_map=folium.Map()
    HeatMap(counts,zoom=20,radius=15).add_to(day_map)
    return day_map

In [None]:
# Uncomment one of the calls below to render the heatmap for another weekday.
#plot(df,'Monday')
#plot(df,'Tuesday')
#plot(df,'Wednesday')
#plot(df,'Thursday')
#plot(df,'Friday')
plot(df,'Saturday')

From the above visualizations we can say that the brightest areas have more trips than any other. We could therefore offer discounts to anyone booking from these zones to increase sales.

### Analyzing Uber pickups each month

#### Data Preparation

In [None]:
# Load the January-June 2015 pickups file and preview the first rows
janjune_path = r'C:\Users\vikra\Desktop\Data Science and Machine Learning\Uber New York Data Analysis/uber-raw-data-janjune-15.csv'
uber = pd.read_csv(janjune_path, encoding='utf-8')
uber.head()

In [None]:
uber.shape

In [None]:
# Smallest 'Pickup_date' value in `uber`.
# NOTE(review): presumably still raw strings here, since the datetime
# conversion only happens in a later cell - confirm.
uber['Pickup_date'].min()

In [None]:
# Largest 'Pickup_date' value in `uber`.
# NOTE(review): presumably still raw strings here, since the datetime
# conversion only happens in a later cell - confirm.
uber['Pickup_date'].max()

In [None]:
# Parse 'Pickup_date' (year-month-day hour:minute:second) into datetimes
raw_pickups = uber['Pickup_date']
uber['Pickup_date'] = pd.to_datetime(raw_pickups, format='%Y-%m-%d %H:%M:%S')

In [None]:
# Derive weekday name, day, minute, month and hour from the pickup timestamp
# (same column order as before).
pickups = uber['Pickup_date'].dt
uber['weekday'] = pickups.day_name()
for field in ('day', 'minute', 'month', 'hour'):
    uber[field] = getattr(pickups, field)

In [None]:
uber.head()

In [None]:
uber.dtypes

#### Uber pickups by each month in NYC

In [None]:
# Pickups per month, counted once and reused for both axes
month_counts = uber['month'].value_counts()
px.bar(
    x=month_counts.index,
    y=month_counts.values,
)

We can see that the number of Uber pickups increased steadily throughout the first half of 2015 in NYC.


### Analyzing rush in NYC

In [None]:
plt.figure(figsize=(12,6))
# Pass the column as keyword `x`. Newer seaborn releases accept only `data`
# positionally, so a positional Series is deprecated or misinterpreted there.
ax=sns.countplot(x=uber['hour'])
#ax.yaxis.set_major_formatter(tick.FormatStrFormatter('%.0f'))

Interestingly, after the morning rush, the number of Uber pickups doesn't dip much throughout the rest of the morning and early afternoon. There is significantly more demand in the evening than the daytime. Let's investigate to see if there's a difference in hourly pattern for different days of the week.

### In-depth Analysis of rush in NYC and hour wise

In [None]:
uber.groupby(['weekday', 'hour'])['Pickup_date'].count()

In [None]:
# Pickups per (weekday, hour) as a flat table with the count in 'Counts'
summary = (
    uber.groupby(['weekday', 'hour'])['Pickup_date']
    .count()
    .reset_index()
    .rename(columns={'Pickup_date': 'Counts'})
)
summary

In [None]:
# Hourly pickup counts, one line per weekday
plt.figure(figsize=(12,6))
sns.pointplot(data=summary, x="hour", y="Counts", hue="weekday")

In [None]:
# Load the Jan-Feb FOIL file
foil_path = r'C:\Users\vikra\Desktop\Data Science and Machine Learning\Uber New York Data Analysis/Uber-Jan-Feb-FOIL.csv'
uber_foil = pd.read_csv(foil_path)

In [None]:
uber_foil.head()

In [None]:
uber_foil['dispatching_base_number'].unique()

In [None]:
# Spread of 'active_vehicles' for each dispatching base
sns.boxplot(data=uber_foil, x='dispatching_base_number', y='active_vehicles')

There are more active vehicles in B02764.

In [None]:
# Spread of 'trips' for each dispatching base
sns.boxplot(data=uber_foil, x='dispatching_base_number', y='trips')

There are more trips in B02764.

In [None]:
# Trips per active vehicle for every row
trips = uber_foil['trips']
vehicles = uber_foil['active_vehicles']
uber_foil['trips/vehicle'] = trips.div(vehicles)

In [None]:
uber_foil.head()

In [None]:
uber_foil.set_index('date')

#### How average trips/vehicle increases or decreases over time for each base number

In [None]:
# Line plot of 'trips/vehicle' with 'date' as the index, one line per
# dispatching base, all drawn on the same figure.
plt.figure(figsize=(18,8))
# The groupby .plot() call draws each base's series in turn.
# NOTE(review): 'date' is used as read from the CSV; it is not parsed to
# datetime anywhere in the visible cells - confirm the x-axis ordering.
uber_foil.set_index('date').groupby(['dispatching_base_number'])['trips/vehicle'].plot()
plt.ylabel('Average trips/vehicle')
plt.title('Demand vs Supply chart (Date-wise)')
plt.legend()

From the above visualization we can say that:

    B02598 and B02764 have performed better than the others

    B02512 performed worse than the others

## Conclusion

From the above analysis we can say that:

    The number of trips peaks in the evening, mostly when people finish work and go home
    September has the maximum number of rides
    The number of trips is highest on Thursday and lowest on Sunday
    Demand is generally higher in the last days of every month
    The number of Uber pickups increased steadily throughout the first half of 2015 in NYC
    The number of Uber pickups doesn't dip much throughout the rest of the morning and early afternoon.
    There is significantly more demand in the evening than in the daytime.
    Base B02764 has the most trips and active vehicles
    B02598 and B02764 have performed better than the others
    B02512 performed worse than the others