In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the read-only Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This analysis answers, among others, the following questions:

Which day of the week witnesses maximum rush for pickups, and at what time?

At what time during the day is the rush maximum?

Which places in and around New York have high pickup rates?

What does the demand-versus-supply chart of Uber pickups in NYC look like?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load and combine the monthly 2014 pickup files.
# The files are listed chronologically (April-September) so that the combined
# frame, and anything derived from its row order such as df14['month'].unique(),
# comes out in calendar order.
files_2014 = ['/kaggle/input/uberdata/uber-raw-data-apr14.csv',
              '/kaggle/input/uberdata/uber-raw-data-may14.csv',
              '/kaggle/input/uberdata/uber-raw-data-jun14.csv',
              '/kaggle/input/uberdata/uber-raw-data-jul14.csv',
              '/kaggle/input/uberdata/uber-raw-data-aug14.csv',
              '/kaggle/input/uberdata/uber-raw-data-sep14.csv']

# Read each file once and concatenate in a single call: growing df14 with
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating every file's own row numbers.
monthly_frames = [pd.read_csv(file, encoding='utf-8') for file in files_2014]
df14 = pd.concat(monthly_frames, ignore_index=True)

In [None]:
# Preview the first rows of the combined 2014 frame.
df14.head()

In [None]:
# (number of rows, number of columns) of the combined frame.
df14.shape

In [None]:
# Column dtypes as loaded, before 'Date/Time' is converted.
df14.dtypes

***Preprocessing***

In [None]:
# Convert 'Date/Time' to pandas datetimes using its explicit
# month/day/year hour:minute:second layout.
timestamp_format = '%m/%d/%Y %H:%M:%S'
df14['Date/Time'] = pd.to_datetime(df14['Date/Time'], format=timestamp_format)

In [None]:
# Re-check the dtypes after the 'Date/Time' conversion.
df14.dtypes

In [None]:
# Preview the frame with the parsed 'Date/Time' column.
df14.head()

In [None]:
# Split the pickup timestamp into the calendar parts used by the later
# group-bys and plots. Columns are added in the order month, weekday, day,
# hour, minute.
pickup_time = df14['Date/Time'].dt
df14 = df14.assign(
    month=pickup_time.month,
    weekday=pickup_time.day_name(),
    day=pickup_time.day,
    hour=pickup_time.hour,
    minute=pickup_time.minute,
)

In [None]:
# Preview the frame with the new month/weekday/day/hour/minute columns.
df14.head()

In [None]:
# Number of rows that exactly repeat an earlier row across all columns.
df14.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): this assumes identical rows are recording artefacts rather than
# distinct pickups at the same minute and place - confirm.
df14 = df14.drop_duplicates()

In [None]:
# Number of rows per weekday name, most frequent first.
df14['weekday'].value_counts()

In [None]:
plt.figure(figsize = (8,8))
# Pin the bars to calendar-week order. Without `order`, string categories are
# drawn in the order they first appear in the data, which is arbitrary here.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
sns.countplot(data = df14, x = 'weekday', order = weekday_order)
plt.xlabel('Weekday')
plt.ylabel('Total Rides')
plt.show()

Analysing trips by hour

In [None]:
# One histogram of pickup hour per month, laid out two panels per row.
plt.figure(figsize=(30,20))
# sorted() puts the panels in calendar order regardless of row order in df14.
months = sorted(df14['month'].unique())
# Ceil-divide so the grid always has room for every month (3 rows for 6 months).
n_rows = max(1, -(-len(months) // 2))
for i,month in enumerate(months):
    plt.subplot(n_rows,2,i+1)
    # Give seaborn the filtered frame and a column name, so `data` and `x`
    # describe the same rows instead of pairing the full frame with a subset.
    sns.histplot(data=df14[df14['month'] == month],x='hour',bins=24)
    plt.xlabel('Hours in month {}'.format(month))
    plt.ylabel('Total Rides')
plt.show()

In [None]:
# Number of non-null 'hour' values (i.e. rows) in each month.
df14.groupby('month')['hour'].count()

In [None]:
plt.figure(figsize=(10,10))
# Aggregate once and reuse the result for both axes.
rides_per_month = df14.groupby('month')['hour'].count()
# Derive each label from its month number so bars can never be mislabelled if
# the set of months changes. The year is only a placeholder for the name lookup.
month_labels = [pd.Timestamp(year=2014, month=int(m), day=1).month_name()
                for m in rides_per_month.index]
sns.barplot(x=month_labels,y=rides_per_month.values)
plt.xlabel('Month')
# The bar heights are ride counts, not hours, so label the axis explicitly.
plt.ylabel('Total Rides')
plt.show()

Analysing trips by day of the month

In [None]:
# Number of rows for each day of the month. The call parentheses are required:
# without them the cell displays the bound method instead of the counts.
df14.groupby('day')['hour'].count()

In [None]:
plt.figure(figsize=(20,10))
# Aggregate once and reuse the result for both axes.
rides_per_day = df14.groupby('day')['hour'].count()
sns.barplot(x=rides_per_day.index,y=rides_per_day.values)
plt.xlabel('Day of month')
# The bar heights are ride counts, not hours, so label the axis explicitly.
plt.ylabel('Total Rides')
plt.show()

In [None]:
plt.figure(figsize=(10,8))
# sns.distplot is deprecated; histplot with stat='density' and kde=True draws
# the same kind of figure (normalised histogram plus a KDE curve).
# discrete=True centres one bar on each integer day.
sns.histplot(data=df14,x='day',stat='density',discrete=True,kde=True)
plt.xlabel('Day of month')
plt.show()


Analysing rides per day in each month

In [None]:
# One histogram of day-of-month per month, laid out two panels per row.
plt.figure(figsize=(30,20))
# sorted() puts the panels in calendar order regardless of row order in df14.
months = sorted(df14['month'].unique())
# Ceil-divide so the grid always has room for every month (3 rows for 6 months).
n_rows = max(1, -(-len(months) // 2))
for i,month in enumerate(months):
    plt.subplot(n_rows,2,i+1)
    # Give seaborn the filtered frame and a column name, so `data` and `x`
    # describe the same rows instead of pairing the full frame with a subset.
    sns.histplot(data=df14[df14['month'] == month],x='day',bins=31)
    plt.xlabel('Days in month {}'.format(month))
    plt.ylabel('Total Rides')
plt.show()

Traffic by hour

In [None]:
plt.figure(figsize=(15,10))
# One line per weekday showing the point estimate of pickup latitude at each hour.
sns.pointplot(data=df14,x='hour',y='Lat',hue='weekday')
# Title typo fixed ("HoursoffDay" -> "Hour of Day").
plt.title('Hour of Day vs. Latitude of Passenger')
plt.show()

Analysing which base number is most popular in each month

In [None]:
# Reminder of the available columns before grouping by base.
df14.head()

In [None]:
# Count rows for every (Base, month) pair, then flatten the group keys back
# into columns; the count ends up in the 'Date/Time' column.
rides_by_base_month = df14.groupby(['Base','month'])['Date/Time'].count()
base = rides_by_base_month.reset_index()
base

In [None]:
# One line per base: month on the x axis, the per-month row count
# (stored in the 'Date/Time' column of `base`) on the y axis.
plt.figure(figsize=(10,8))
sns.lineplot(data=base,x='month',y='Date/Time',hue='Base')

Perform Cross Analysis

In this section I will visualize:

1) HeatMap by Hour and Weekday

2) HeatMap by Hour and day

3) HeatMap by month and day

4) HeatMap by month and weekday

In [None]:
def count_rows(row):
    """Return the length of ``row`` as reported by ``len``."""
    n_entries = len(row)
    return n_entries


def heatmap(col1,col2):
    """Draw a heatmap of how many rows of ``df14`` fall in each (col1, col2) pair.

    Parameters
    ----------
    col1 : str
        Column of ``df14`` whose values become the heatmap rows.
    col2 : str
        Column of ``df14`` whose values become the heatmap columns.

    A new 12x8 figure is created and drawn on; nothing is returned.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday']
    # size() counts the rows of each group directly, without calling a Python
    # function once per group as apply() does.
    by_cross = df14.groupby([col1,col2]).size().unstack()
    # Day names would otherwise sort alphabetically (Friday, Monday, ...).
    # Reorder them to the calendar week, keeping only names actually present.
    if col1 == 'weekday':
        by_cross = by_cross.reindex(
            index=[d for d in weekday_order if d in by_cross.index])
    if col2 == 'weekday':
        by_cross = by_cross.reindex(
            columns=[d for d in weekday_order if d in by_cross.columns])
    plt.figure(figsize=(12,8))
    sns.heatmap(by_cross)

In [None]:
# Weekday (rows) by hour (columns).
heatmap('weekday','hour')

In [None]:
# Day of month (rows) by hour (columns), with explicit axis labels.
heatmap('day','hour')
plt.xlabel('Number of hours a day')
plt.ylabel('Number of days a month')

In [None]:
# Month (rows) by day of month (columns).
heatmap('month','day')
plt.xlabel('Number of days a month')

In [None]:
# Month (rows) by weekday (columns).
heatmap('month','weekday')

Analysis of location data points

In [None]:
# Reminder of the 'Lat' / 'Lon' columns used for the location plots.
df14.head()

In [None]:
# Scatter every row by longitude (x) and latitude (y); alpha=0.4 lets
# overlapping points show density. The axis limits crop the view to
# longitudes -75..-72.5 and latitudes 40.0..41.2.
plt.figure(figsize=(20,10))
sns.scatterplot(data=df14,y='Lat',x='Lon',alpha=0.4)
plt.xlim(-75,-72.5)
plt.ylim(40.0,41.2)

Perform spatial analysis using a HeatMap to get a clear picture of the rush

In [None]:
from folium.plugins import HeatMap
import folium

def plot(day):
    """Build a folium heat map of pickup locations for one weekday.

    Parameters
    ----------
    day : str
        Weekday name to select, compared for equality with ``df14['weekday']``
        (e.g. ``'Sunday'``).

    Returns
    -------
    folium.Map
        Map carrying a HeatMap layer built from one row per distinct
        (Lat, Lon) pair, with that pair's row count as the third column.
        If no rows match ``day``, a plain map with no heat layer is returned.
    """
    df_out = df14[df14['weekday']==day]
    rush = df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()
    if rush.empty:
        # Nothing to centre on or draw; avoid passing NaN coordinates to folium.
        return folium.Map()
    # Centre the map on the selected pickups. A bare folium.Map() opens on a
    # world view in which the heat layer is barely visible.
    centre = [float(rush['Lat'].mean()), float(rush['Lon'].mean())]
    basemap = folium.Map(location=centre, zoom_start=10)
    # NOTE(review): `zoom` does not appear to be a documented HeatMap option
    # (`max_zoom` is) - confirm the intent. It is kept unchanged here.
    HeatMap(rush,zoom=20,radius=15).add_to(basemap)
    return basemap

In [None]:
# Heat map of the rows whose weekday is 'Sunday'.
plot('Sunday')

In [None]:
# Heat map of the rows whose weekday is 'Wednesday'.
plot('Wednesday')

Analysing which base number has the most active vehicles in January and February 2015

In [None]:
# Load the Jan-Feb FOIL file and preview its first rows.
foil_csv = '/kaggle/input/uberdata/Uber-Jan-Feb-FOIL.csv'
df_foil = pd.read_csv(foil_csv)
df_foil.head()

In [None]:
# Column dtypes of the FOIL frame as loaded.
df_foil.dtypes

In [None]:
# Convert 'date' to pandas datetimes using its month/day/year layout.
foil_date_format = '%m/%d/%Y'
df_foil['date'] = pd.to_datetime(df_foil['date'], format=foil_date_format)

In [None]:
# Distinct dispatching base numbers present in the FOIL data.
df_foil['dispatching_base_number'].unique()

In [None]:
# Distribution of 'active_vehicles' across rows, one box per dispatching base.
sns.boxplot(data=df_foil,x='dispatching_base_number',y='active_vehicles')

Analysing which base number has the most trips in January and February 2015

In [None]:
# Distribution of 'trips' across rows, one box per dispatching base.
sns.boxplot(data=df_foil,x='dispatching_base_number',y='trips')

How the average number of trips per vehicle increases or decreases over time for each base number in January and February 2015

In [None]:
# Trips per active vehicle for every row, stored as a new 'trips/vehicles' column.
trips_per_vehicle = df_foil['trips'].div(df_foil['active_vehicles'])
df_foil['trips/vehicles'] = trips_per_vehicle
df_foil

In [None]:
plt.figure(figsize=(10,6))
# Index by date so the x axis is time, then plot the trips/vehicles series of
# each dispatching base via the grouped selection.
ratio_by_base = df_foil.set_index('date').groupby(['dispatching_base_number'])['trips/vehicles']
ratio_by_base.plot()
plt.legend()
plt.title('Demand vs. Supply Chart')
plt.ylabel('Avg Trips/Vehicles')
plt.show()