In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
### Brief overview of the project:

The project involves analyzing a dataset containing data on multiple trips taken by Uber drivers. The file includes the columns 'START_DATE*', 'END_DATE*', 'CATEGORY*', 'START*', 'STOP*', 'MILES*', and 'PURPOSE*'.

### Objectives and Goals:

The primary goal of the project is to extract valuable insights and identify patterns or trends within the dataset that can provide meaningful information.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv('/kaggle/input/uber-dataset/My Uber Drives - 2016.csv')

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.head(10)

### Renaming the column names

In [None]:
# Map the raw, asterisk-suffixed CSV headers to short lower-case names
column_aliases = {
    'START_DATE*': 'start_date',
    'END_DATE*': 'end_date',
    'CATEGORY*': 'category',
    'START*': 'start',
    'STOP*': 'stop',
    'MILES*': 'miles',
    'PURPOSE*': 'purpose',
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
df.info()

### Handling the missing Values

In [None]:
# Percentage of missing values in each column of the dataset
(df.isna().sum()/df.shape[0])*100

<strong>OBSERVATIONS:</strong>

The purpose column has 43.46% missing values, meaning that nearly half of the rows have no recorded purpose.<br>
Hence we cannot simply delete the rows with null values.<br>
We can fill them with the mode of the column or with 'Unknown', or drop the column altogether.<br>
Here I fill them with the value 'Unknown'.

In [None]:
# Fill missing trip purposes with an explicit "Unknown" label.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['purpose'] selection: in-place modification
# through a chained selection is deprecated in pandas 2.x and does not update
# the frame when Copy-on-Write is enabled.
df['purpose'] = df['purpose'].fillna("Unknown")

In [None]:

(df.isna().sum()/df.shape[0])*100

As we can see, the percentage of missing values is the same in each of the four remaining columns, so the null values are probably located in the same rows.
So, let's first try removing the null values using just one column.

In [None]:
df.dropna(subset=['category'],inplace=True)

In [None]:
(df.isna().sum()/df.shape[0])*100

Hence we have removed all the null values from our dataset.

### Looking for duplicate data

In [None]:
df.duplicated().sum()

In [None]:
# Dropping duplicated rows based on all columns
df = df.drop_duplicates()

In [None]:
df.shape

### Fixing the datatypes of the columns

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Convert start_date and end_date to datetime dtype so that the date and time parts can be extracted.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
df['start_date'] = pd.to_datetime(df['start_date'],errors='coerce')
df['end_date'] = pd.to_datetime(df['end_date'],errors='coerce')

In [None]:
# extracting time from datetime format column for further analysis
df['start_time'] = df['start_date'].dt.time
df['end_time'] = df['end_date'].dt.time


In [None]:
df['month'] = df['start_date'].dt.month
df['year'] = df['start_date'].dt.year
df['hour'] = df['start_date'].dt.hour

In [None]:
# Re-parse start_time/end_time with the '%H:%M:%S' format and keep only the time-of-day component.
# NOTE(review): both columns were already produced with `.dt.time` in an earlier cell,
# so this step looks redundant — confirm before removing it.
df['start_time'] = pd.to_datetime(df['start_time'], format='%H:%M:%S').dt.time
df['end_time'] = pd.to_datetime(df['end_time'], format='%H:%M:%S').dt.time

In [None]:
def calculate_duration_minutes(start, end):
    """Return the minutes elapsed between two times of day.

    Parameters
    ----------
    start, end : datetime.time
        Clock times at which the trip started and ended. Only ``hour`` and
        ``minute`` are read; seconds are ignored.

    Returns
    -------
    Minutes from ``start`` to ``end``. For ``datetime.time`` inputs this is
    an integer in the range 0..1439.

    Notes
    -----
    Only the time of day is available here, so an ``end`` that is earlier
    than ``start`` is treated as a trip that crossed midnight (23:50 -> 00:10
    is 20 minutes) instead of producing a negative duration. Trips lasting
    24 hours or more cannot be represented by two times of day.
    """
    minutes_per_day = 24 * 60
    start_minutes = start.hour * 60 + start.minute
    end_minutes = end.hour * 60 + end.minute
    # Modulo wraps a negative difference around midnight and leaves
    # same-day differences unchanged.
    return (end_minutes - start_minutes) % minutes_per_day

In [None]:
# Derive each trip's duration (in minutes) from its start/end times and store it in a new column
def _trip_duration(trip):
    """Duration in minutes for one trip row, delegated to calculate_duration_minutes."""
    return calculate_duration_minutes(trip['start_time'], trip['end_time'])

df['duration_minutes'] = df.apply(_trip_duration, axis=1)

In [None]:
df.describe()

In [None]:
df=df[df['duration_minutes']>0]

In [None]:
df['start_date']=df['start_date'].dt.date
df['end_date']=df['end_date'].dt.date

In [None]:
df.head()

In [None]:
df.rename(columns={'start':'source','stop':'destination','miles':'distance'},inplace=True)

In [None]:
# Add a column that flags round trips, i.e. trips whose source and destination are the same place
def round_trip_label(trip):
    """Return 'Yes' when the trip's source equals its destination, otherwise 'No'.

    Deliberately not named ``round``: defining a function with that name
    would shadow Python's built-in ``round`` for every later cell.
    """
    return 'Yes' if trip['source'] == trip['destination'] else 'No'

df['round_trip'] = df.apply(round_trip_label, axis=1)
df.head()

### Looking into the column values

In [None]:
# Categorical columns whose value counts are inspected in the next cell
cat_cols = ['category', 'source', 'destination', 'purpose', 'month', 'year', 'round_trip']
# Remaining date/time and numeric columns. 'time_day_diff' was dropped from this
# list because no column with that name is created in the cells shown here.
other_col = ['start_date', 'end_date', 'distance', 'start_time', 'end_time', 'hour', 'duration_minutes']

In [None]:
for col in cat_cols:
    print(f"Value counts for column '{col}':")
    print(df[col].value_counts())
    print()

In [None]:
df.drop('year', axis=1, inplace=True)

### Trying to look for outliers

In [None]:
df.describe()

In [None]:
# Inspect the longest trip(s). Comparing against the column maximum avoids
# hard-coding the literal 310.3 and relying on an exact float match against it.
df[df['distance'] == df['distance'].max()]

<strong>OBSERVATIONS:</strong>

Distribution of Distances: The data shows a wide range of distances covered by trips, ranging from very short distances (minimum of 0.5 miles) to longer journeys (maximum of 310.3 miles). The mean distance is far lower than the maximum value, which indicates the presence of extreme outliers.<br>

Average Distance: The average distance covered by trips is approximately 10.57 miles, indicating that, on average, trips tend to be of moderate length.<br>

Variability: The standard deviation of approximately 21.58 suggests a considerable variability in trip distances around the mean. This variability could indicate diverse trip lengths or outliers in the dataset.<br>

Quartiles: The quartile values (25%, 50%, 75%) provide insights into the distribution of trip lengths. For example, 25% of trips have a distance of 2.9 miles or less (25th percentile), while 75% have a distance of 10.4 miles or less (75th percentile).<br>

Distribution of Duration: The duration_minutes variable also has high variability, but it seems to follow a relatively more predictable distribution based on the median (16 minutes) and the 75th percentile (28 minutes).<br>

Both columns have relatively large standard deviations, indicating a spread in the values and suggesting the presence of both short and long events in the dataset.

In [None]:
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(18, 5))

# Left panel: histogram of trip distance with a KDE overlay
sns.histplot(df['distance'], color='green', kde=True, ax=ax_hist)
ax_hist.set(xlabel='distance-travelled(miles)', ylabel='Frequency', title='Distribution Plot')

# Right panel: box plot of the same column. A horizontal box plot has no
# frequency axis, so the y-label is left empty instead of 'Frequency'.
sns.boxplot(x=df['distance'], color='red', ax=ax_box)
ax_box.set(xlabel='distance-travelled(miles)', ylabel='', title='Box Plot')

plt.show()




In [None]:
percentile1=df['distance'].quantile(0.90)

In [None]:
df[df['distance']<percentile1].shape

In [None]:
# Distances below the percentile1 cutoff, shown as a histogram (left) and a box plot (right)
trimmed_distance = df.loc[df['distance'] < percentile1, 'distance']

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.histplot(trimmed_distance, kde=True, color='green')
plt.subplot(1, 2, 2)
sns.boxplot(x=trimmed_distance)
plt.show()

In [None]:
df=df[df['distance']<percentile1]

In [None]:
df.describe()

### **_`OBSERVATIONS:`_**

**A. Miles:**

- Average Trip Distance: The average distance covered per trip is approximately 6.08 miles. This indicates that, on average, trips are of moderate length.
- Variability: The standard deviation of 4.10 suggests that trip distances vary considerably around the mean. This variability could be due to different trip purposes or routes taken.
- Range of Distances: The range of trip distances is from 0.5 miles to 17 miles. This wide range implies that there are both short-distance and long-distance trips in the dataset.
- Distribution: The quartile values show that 50% of trips have a distance of 5.2 miles or less, while 75% have a distance of 8.8 miles or less. This distribution indicates that a significant portion of trips are relatively short in distance.

**B. Insights on Duration Minutes:**

- Average Trip Duration: The average duration of trips is approximately 18 minutes. This suggests that, on average, trips last for a moderate duration.
- Variability: The standard deviation of 13.15 indicates variability in trip durations around the mean. This variability could be influenced by factors such as traffic conditions or trip purposes.
- Range of Durations: Trip durations range from 1 minute to 154 minutes. This wide range reflects the diversity in trip durations, with some trips being very short and others much longer.
- Distribution: The quartile values reveal that 50% of trips have a duration of 15 minutes or less, while 75% have a duration of 23 minutes or less. This distribution shows that a significant proportion of trips are relatively short in duration.

In [None]:
plt.figure(figsize=(18,5))
plt.subplot(1,2,1)
sns.kdeplot(x=df.duration_minutes,color='green',multiple='stack')
plt.subplot(1,2,2)
sns.boxplot(x=df.duration_minutes,color='red')
plt.show()

In [None]:
percentile2=df['duration_minutes'].quantile(0.98)
percentile2

In [None]:
df[df['duration_minutes']<percentile2].shape

In [None]:
df=df[df['duration_minutes']<percentile2]

In [None]:
plt.figure(figsize=(18,5))
plt.subplot(1,2,1)
sns.kdeplot(x=df.duration_minutes,color='green')
plt.subplot(1,2,2)
sns.boxplot(x=df.duration_minutes,color='red')
plt.show()

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.kdeplot(df[df['distance']<percentile1]['distance'],color='green')
plt.subplot(1,2,2)
sns.boxplot(x=df[df['distance']<percentile1]['distance'])
plt.show()

# `DATA VISUALIZATION`

In [None]:
df.columns

In [None]:
# Numeric columns to correlate. The list is named `numeric_cols` rather than
# `col` because `col` is also the loop variable of the `for col in ...` cells,
# which overwrite it with a single column name and break a re-run of this cell.
numeric_cols = ['distance', 'month', 'hour', 'duration_minutes']
corr_matrix = df[numeric_cols].corr()
corr_matrix

In [None]:
sns.heatmap(corr_matrix)

### **_`OBSERVATIONS:`_**

**A. Miles and Duration Minutes:**

- There is a moderate positive correlation (0.65) between the distance covered in miles and the duration of trips in minutes. This suggests that longer trips tend to take more time, which is a logical relationship.

**B.Miles and Month:**

- There is a weak negative correlation (-0.09) between trip distance in miles and the month in which the trip occurs. This implies that there is no significant relationship between the distance of trips and the month of the year.

**C. Miles and Hour:**

- There is a weak negative correlation (-0.06) between trip distance in miles and the hour of the day when the trip starts. This suggests that there is no strong relationship between trip distance and the time of day.

**D. Duration Minutes and Month:**

- There is a weak positive correlation (0.05) between trip duration in minutes and the month of the year. This implies that there is no significant relationship between trip duration and the month in which the trip occurs.

**E. Duration Minutes and Hour:**

- There is a weak negative correlation (-0.04) between trip duration in minutes and the hour of the day when the trip starts. This suggests that there is no strong relationship between trip duration and the time of day.

In [None]:
cols = ['category','purpose','month','hour','round_trip']

# For every column draw a count plot (left) and a pie chart of its value shares (right)
for col in cols:
    fig, (ax_count, ax_pie) = plt.subplots(1, 2, figsize=(20, 5))
    sns.countplot(x=col, data=df, palette='flare', ax=ax_count)
    col_counts = df[col].value_counts()
    col_counts.plot(
        kind='pie',
        labels=col_counts.index,
        autopct='%1.1f%%',
        startangle=30,
        colors=sns.color_palette('flare'),
        ax=ax_pie,
    )
    plt.tight_layout()
    plt.show()

### **_`OBSERVATIONS / INSIGHTS`_**

**A. Category Distribution:**

- 93.5% of trips are categorized as Business trips, while 6.5% are Personal trips. This indicates that the majority of trips in the dataset are for business purposes.

**B. Purpose Distribution:** 

- 15.2% of trips are for Meal/Entertainment purposes, making it one of the top purposes for trips.
- Meetings account for 15% of trips, indicating that business-related activities are significant reasons for travel.
- Errands make up 12.3% of trips, suggesting that personal tasks and activities are also common reasons for travel.

**C. Month Distribution:**

- December has the highest percentage of total rides booked, with 13.75% of trips occurring in this month. This could be due to year-end activities, holidays, or seasonal factors.
- August follows closely with 11.45% of trips, indicating a busy period during the summer months.
- November contributes 11.2% of trips, suggesting a consistent level of travel throughout the year.
- September and April have the fewest trips, accounting for 2.6% and 4.2% of the total, respectively.
- July and February both contribute 10% of trips, showing moderate travel activity during these months.

**D. Type of Trips:**

- One-way trips are the most common, making up 74.8% of total trips, indicating that most trips do not involve return journeys.
- Other trips, which could include round trips or specific types of travel arrangements, account for 25.2% of total trips.

In [None]:
# Share of all trips (in %) that start at each location
start_point = (df['source'].value_counts() / len(df['source'])) * 100

# Keep only the locations that account for more than 1% of all trips
filtered_start_point = start_point[start_point > 1]
start_shown_share = filtered_start_point.sum()

# Plotting the pie chart. autopct receives each wedge's percentage of the
# plotted (filtered) total, so it is rescaled to label every wedge with its
# share of ALL trips rather than its share of the locations shown.
plt.figure(figsize=(10, 10))
filtered_start_point.plot(
    kind='pie',
    colors=sns.color_palette('flare'),
    autopct=lambda pct: f'{pct * start_shown_share / 100:.1f}%',
    fontsize=14,
)

# Add title and show the plot
plt.title('Distribution of Starting Points', fontsize=16)
plt.show()

In [None]:
# Share of all trips (in %) that end at each location
stop_point = (df['destination'].value_counts() / len(df['destination'])) * 100

# Keep only the locations that account for more than 1% of all trips
filtered_stop_point = stop_point[stop_point > 1]
stop_shown_share = filtered_stop_point.sum()

# Plotting the pie chart. autopct receives each wedge's percentage of the
# plotted (filtered) total, so it is rescaled to label every wedge with its
# share of ALL trips rather than its share of the locations shown.
plt.figure(figsize=(10, 10))
filtered_stop_point.plot(
    kind='pie',
    colors=sns.color_palette('flare'),
    autopct=lambda pct: f'{pct * stop_shown_share / 100:.1f}%',
    fontsize=14,
)

# Add title and show the plot
plt.title('Distribution of Stopping Points', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(18,5))

# Left panel: trip duration with its mean and median marked
plt.subplot(1,2,1)
duration_mean = df.duration_minutes.mean()
duration_median = df.duration_minutes.median()
sns.histplot(x=df.duration_minutes,color='purple',bins=20)
plt.axvline(duration_mean, color='green', linestyle='-', label=f'Mean: {duration_mean:.1f}')
plt.axvline(duration_median, color='red', linestyle='--', label=f'Median: {duration_median:.1f}')
plt.title("Displaying the Histogram of Trip Duration, Including Mean and Median")
# Without a legend the labels given to axvline are never drawn
plt.legend()

# Right panel: trip distance with its mean and median marked
plt.subplot(1,2,2)
distance_mean = df.distance.mean()
distance_median = df.distance.median()
sns.histplot(x=df.distance,color='purple',bins=15)
plt.axvline(distance_mean, color='green', linestyle='-', label=f'Mean: {distance_mean:.1f}')
plt.axvline(distance_median, color='red', linestyle='--', label=f'Median: {distance_median:.1f}')
plt.title("Displaying the Histogram of Trip Miles, Including Mean and Median")
plt.legend()
sns.despine()
plt.show()

In [None]:
df.describe()

### **_`OBSERVATIONS / INSIGHTS:`_**

**A.Distance:**
- The mean distance covered in miles is approximately 6 miles, with a standard deviation of around 4 miles. This indicates that the average trip distance varies moderately around the mean.
- The median distance, which represents the middle value of the dataset when arranged in ascending order, is 5.2 miles. This suggests that the distribution of trip distances is slightly skewed towards shorter trips, as the median is lower than the mean.

**B. Duration Minutes:**

- The mean duration of trips in minutes is approximately 17 minutes, with a standard deviation of around 9.8 minutes. This indicates that the average trip duration varies considerably around the mean.
- The median duration, which represents the middle value of the dataset when arranged in ascending order, is 15 minutes. Similar to distance, this suggests that the distribution of trip durations is slightly skewed towards shorter durations, as the median is lower than the mean.

In [None]:
plt.figure(figsize=(20,12))
plt.subplot(2,2,1)
sns.lineplot(x=df.distance,y=df.duration_minutes,hue=df['category'],palette=['red','green'])
plt.title("Miles vs Duration with category",fontsize=20)
plt.subplot(2,2,2)
sns.lineplot(x=df.distance,y=df.hour,hue=df['category'],palette=['red','green'])
plt.title("Miles vs Hours of the day with category",fontsize=20)
plt.subplot(2,2,3)
sns.scatterplot(x=df.distance,y=df.duration_minutes,hue=df['round_trip'],palette=['red','navy'])
plt.title("Miles vs Duration with Type of Trip",fontsize=20)
plt.subplot(2,2,4)
sns.scatterplot(x=df.distance,y=df.hour,hue=df['round_trip'],palette=['red','navy'])
plt.title("Miles vs Hours of the day with Type of Trips",fontsize=20)
sns.despine()
plt.show()

### **_`OBSERVATIONS / INSIGHTS:`_**

**A. Lineplots:**

- There is an upward trend in the relationship between miles and duration, indicating that longer distances generally result in longer trip durations.
- The dominance of the Business category in the dataset is reflected in the lineplot, where Business trips tend to have longer durations compared to Personal trips.
- The analysis of miles versus hour does not show significant fluctuations for Business trips, whereas Personal trips exhibit more variability in duration across different hours.

**B. Scatterplots:**

- The positive correlation (0.65 in the correlation matrix) between miles and durations is evident in the scatterplot, showing that trips with longer distances tend to have longer durations.
- Round trips, represented by shorter distances, generally have shorter durations compared to single trips that cover a wider range of distances and durations.

In [None]:
plt.figure(figsize=(15,5))

# Left panel: distribution of trip distance per purpose
plt.subplot(1,2,1)
sns.violinplot(y=df.distance,x=df.purpose,palette='flare')
plt.title("Exploring Trip Purposes and Miles Covered ",fontsize=20)
plt.xticks(rotation=45)

# Right panel: distribution of trip duration per purpose
# (title corrected: this panel plots duration_minutes, not miles)
plt.subplot(1,2,2)
sns.violinplot(y=df.duration_minutes,x=df.purpose,palette='flare')
plt.title("Exploring Trip Purposes and Trip Duration",fontsize=20)
plt.xticks(rotation=45)
sns.despine()
plt.show()

### **_`OBSERVATIONS / INSIGHTS:`_**

**A. Airport/Travel:**

Trips for Airport/Travel purposes have a relatively consistent median distance, indicating that these trips are typically of similar lengths.

**B.Between Offices:** 

Between Offices trips show a significantly higher median distance compared to other purposes, suggesting that these trips often involve longer distances between office locations.

**C.Charity ($):**

The Charity trips category has a high median distance, indicating that these trips tend to cover substantial distances, likely related to charitable activities or events.

**D.Customer Visit:**

Trips for Customer Visits have a moderate median distance, suggesting that these trips are typically of moderate length when visiting customers.

**E.Errand/Supplies:**

Errand/Supplies trips have a relatively low median distance, indicating that these trips are often short and may involve nearby locations for errands or supply runs.

**F.Meal/Entertain:**

Trips for Meal/Entertainment purposes have a moderate median distance, suggesting that these trips cover a moderate distance for dining or entertainment activities.

**G.Meeting:**

Meeting trips exhibit a high median distance, indicating that these trips typically involve longer distances for business or professional meetings.

**H.Moving:**

Moving trips have a moderate median distance, suggesting that these trips cover a moderate distance when moving from one location to another.

**I.Temporary Site:**

Trips to Temporary Sites have a moderate median distance, indicating that these trips typically cover a moderate distance for temporary work or assignments.

In [None]:
plt.figure(figsize=(15,5))
sns.countplot(x='hour',data=df,hue='round_trip',palette='flare')
plt.show()

In [None]:

grouped_counts = df.groupby(['category', 'purpose']).size().reset_index(name='frequency')
plt.figure(figsize=(15,5))
sns.barplot(x=grouped_counts.purpose,y=grouped_counts.frequency,hue=grouped_counts.category,palette='flare')
plt.show()

In [None]:
# Number of trips for every (purpose, round_trip) combination
group_trips = df.groupby(['purpose','round_trip']).size().reset_index(name='frequency')
plt.figure(figsize=(15,5))
# hue splits each purpose into its round-trip / non-round-trip bar; without it
# barplot collapses the two rows per purpose into a single averaged bar.
sns.barplot(x=group_trips.purpose,y=group_trips.frequency,hue=group_trips.round_trip,palette='flare')
plt.show()

In [None]:
# catplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
grid = sns.catplot(x='category', y='duration_minutes', hue='month', col='round_trip',
                   data=df, kind='box', palette='Spectral')

# Label the facets through the returned FacetGrid; plt.xlabel/plt.ylabel
# would only affect the current (last drawn) axes.
grid.set_axis_labels("Category", "Duration (Minutes)")

plt.show()

### **_`OBSERVATIONS / INSIGHTS:`_**

- There were no Business round trips recorded during June.
- Personal round trips occurred for only 4 months and were relatively shorter in duration.
- September witnessed the highest number of Business rides without round trips, while other months had consistent average rides. In contrast, Personal rides without round trips were observed mainly in January, February, and June.

In [None]:
# catplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
grid = sns.catplot(x='category', y='distance', hue='month', col='round_trip',
                   data=df, kind='box', palette='Spectral')

# The y variable is 'distance', so the axis is labelled "Miles" (it was
# previously mislabelled "Duration (Minutes)"). Labels are set through the
# FacetGrid because plt.xlabel/plt.ylabel only affect the current axes.
grid.set_axis_labels("Category", "Miles")

plt.show()

### **_`OBSERVATIONS / INSIGHTS:`_**

- Business rides (round trips) indicate that the most distance was covered during the third and fourth quarters of the year, while Personal rides (round trips) typically cover average distances, primarily observed in January, February, March, and June.
- Business rides (not round trips) cover distances ranging from the minimum to almost the maximum for nearly every month, whereas Personal rides (not round trips) cover moderate distances, especially noticeable in January, February, and June.

In [None]:
# catplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
grid = sns.catplot(x='hour', y='duration_minutes', col='round_trip',
                   data=df, kind='box', palette='Spectral')

# Label the facets through the returned FacetGrid; plt.xlabel/plt.ylabel
# would only affect the current (last drawn) axes.
grid.set_axis_labels("Hour", "Duration (Minutes)")

plt.show()

In [None]:
# catplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
grid = sns.catplot(x='hour', y='distance', col='round_trip',
                   data=df, kind='box', palette='Spectral')

# Label the facets through the returned FacetGrid; plt.xlabel/plt.ylabel
# would only affect the current (last drawn) axes.
grid.set_axis_labels("Hour", "Miles")

plt.show()

# `NON-GRAPHICAL REPRESENTATION`

In [None]:
df.groupby('purpose')['distance'].describe()

In [None]:
df.groupby('purpose')['duration_minutes'].describe()

In [None]:
df.groupby('round_trip')['duration_minutes'].describe()

In [None]:
df.groupby('round_trip')['distance'].describe()

In [None]:
df.groupby('month')['distance'].describe()

In [None]:
df.groupby('month')['duration_minutes'].describe()

In [None]:
df.groupby('category')['duration_minutes'].describe()

In [None]:
df.groupby('category')['distance'].describe()

# `CONCLUSION:`

- Consider adjusting fare charges to be slightly higher during low hours, from midnight to 12 PM, as peak hours from 12 PM to midnight experience a higher volume of rides.
- The Business rides category is predominant, covering longer distances and durations. Implementing schemes to enhance the profitability of these trips could be beneficial.
- For trips where the start and end points are the same (25% of trips), charges should include waiting time if applicable.