In [27]:
%matplotlib
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Relative paths to the two input CSV files (adjust if the data folder moves)
city_data_to_load = "data/city_data.csv"
ride_data_to_load = "data/ride_data.csv"

# Load the per-city table and the per-ride table
city_data = pd.read_csv(city_data_to_load)
ride_data = pd.read_csv(ride_data_to_load)

# Attach each ride's city attributes to the ride row.
# A left join keeps every ride; the join key only needs to be named once.
pyber_data = pd.merge(ride_data, city_data, how="left", on="city")

# Display the data table for preview
pyber_data.head()

Using matplotlib backend: Qt5Agg


Unnamed: 0,city,date,fare,ride_id,driver_count,type,color
0,Lake Jonathanshire,2018-01-14 10:14:22,13.83,5739410935873,5,Urban,red
1,South Michelleport,2018-03-04 18:24:09,30.24,2343912425577,72,Urban,red
2,Port Samanthamouth,2018-02-24 04:29:00,33.44,2005065760003,57,Urban,red
3,Rodneyfort,2018-02-10 23:22:03,23.44,5149245426178,34,Urban,red
4,South Jack,2018-03-06 04:28:35,34.58,3908451377344,46,Urban,red


In [28]:
# Count the non-missing entries in every column; equal counts across all
# columns mean there are no gaps in the merged table.
pyber_data.notna().sum()

# Nothing to worry about - all values represented

city            2375
date            2375
fare            2375
ride_id         2375
driver_count    2375
type            2375
color           2375
dtype: int64

In [29]:
# Let's confirm the data statistics
# pyber_data.describe()

# Average fare = $26.75, Avg. Driver count = 30

In [30]:
# Let's confirm data types
# pyber_data.dtypes

In [31]:
# Keep only the columns the charts below need (drops date and ride_id)
keep_columns = ["type", "city", "fare", "driver_count", "color"]
pdr_df = pyber_data[keep_columns]
pdr_df.head(10)

Unnamed: 0,type,city,fare,driver_count,color
0,Urban,Lake Jonathanshire,13.83,5,red
1,Urban,South Michelleport,30.24,72,red
2,Urban,Port Samanthamouth,33.44,57,red
3,Urban,Rodneyfort,23.44,34,red
4,Urban,South Jack,34.58,46,red
5,Urban,South Latoya,9.52,10,red
6,Urban,New Paulville,43.25,44,red
7,Urban,Simpsonburgh,35.98,21,red
8,Urban,South Karenland,35.09,4,red
9,Urban,North Jasmine,42.81,33,red


In [32]:
# Rides per city: count the fare entries recorded for each city
total_rides = pdr_df.groupby("city")["fare"].count()

In [33]:
# Average fare per city.
# Select the numeric "fare" column *before* aggregating: pdr_df also holds the
# string columns "type" and "color", and averaging those raises a TypeError on
# pandas >= 2.0 (older versions silently dropped them).
avg_fair = pdr_df.groupby("city")["fare"].mean()

In [34]:
# Drivers per city: the value is repeated on every ride row of a city,
# so taking the per-city maximum collapses it to one number per city
driver_count = pdr_df.groupby("city")["driver_count"].max()

In [35]:
# City type (one label per city).
# NOTE: this name shadows Python's built-in type() for the rest of the
# notebook; it is kept because later cells read it under this name.
type = pdr_df.groupby("city")["type"].max()

In [36]:
# Plot colour assigned to each city (one value per city)
color = pdr_df.groupby("city")["color"].max()

In [37]:
# Assemble the per-city summary table from the five per-city Series
summary_columns = {
    "Total Rides": total_rides,
    "Average Fair": avg_fair,
    "Driver Count": driver_count,
    "Community": type,
    "Color": color,
}
scat_df = pd.DataFrame(summary_columns).rename_axis(None)  # drop the "city" index label

# Pin the column order explicitly
scat_df = scat_df.loc[:, ["Total Rides", "Average Fair", "Driver Count", "Community", "Color"]]

scat_df.head()
    

Unnamed: 0,Total Rides,Average Fair,Driver Count,Community,Color
Amandaburgh,18,24.641667,12,Urban,red
Barajasview,22,25.332273,26,Urban,red
Barronchester,16,36.4225,11,Suburban,green
Bethanyland,18,32.956111,22,Suburban,green
Bradshawfurt,10,40.064,7,Rural,blue


In [38]:
# Sorted array of the distinct colour codes used across all cities
city_colors = color.to_numpy()
color_unique = np.unique(city_colors)
color_unique

array(['blue', 'green', 'red'], dtype=object)

## Bubble Plot of Ride Sharing Data

In [39]:
# Bubble plot: one point per city, x = total rides, y = average fare,
# bubble area proportional to the city's driver count.
#
# Each city type is drawn as its own scatter series so that it is plotted
# exactly once and gets a single legend entry labelled with the type name.
fig, ax = plt.subplots()
for city_type, cities in scat_df.groupby("Community"):
    ax.scatter(x=cities["Total Rides"],
               y=cities["Average Fair"],
               s=cities["Driver Count"] * 20,   # Use to change scale of bubbles
               color=cities["Color"],
               label=city_type,
               alpha=0.4,
               edgecolors="white",
               linewidths=1.5)
ax.grid(True)

# Titles, axis labels and legend
ax.set_title("Pyber Ride Sharing Data (2018)")
ax.set_xlabel('Total Number of Rides')
ax.set_ylabel('Average Fare($)')
ax.legend(loc='best', title="City Types")
plt.show()

In [40]:
# Save the bubble plot to file.
# Saving through the figure handle (rather than pyplot's "current figure")
# writes the bubble chart even if another figure has become current or the
# window was closed after plt.show().
fig.savefig("../Images/PyberPlot")

## Total Fares by City Type

In [41]:
# Share of total fares contributed by each city type.
# Wedge sizes, labels and colours are all grouped by the same key ("type"),
# so they line up by construction instead of relying on the colour names and
# the type names happening to sort in the same order.
avg_fair_pie = pdr_df.groupby("type")["fare"].sum()   # total (not average) fares per type
labels = avg_fair_pie.index
color_pie = pdr_df.groupby("type")["color"].max().reindex(labels)
explode = [0.1 if city_type == "Urban" else 0 for city_type in labels]  # pull out the Urban wedge

fig1, ax1 = plt.subplots()
ax1.pie(avg_fair_pie, explode=explode, labels=labels, colors=color_pie, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

ax1.set_title("% Total Fares by City Type")
plt.show()

In [42]:
# Save the fares pie chart.
# Use the figure handle so the right figure is written regardless of which
# figure pyplot currently considers "current".
fig1.savefig("../Images/PiesFare")

## Total Rides by City Type

In [43]:
# Share of all rides taken in each city type.
# Wedge sizes, labels and colours are all grouped by the same key ("type"),
# so they line up by construction.
ttl_rides_pie = pdr_df.groupby("type")["fare"].count()   # one fare entry per ride
labels = ttl_rides_pie.index
color_pie = pdr_df.groupby("type")["color"].max().reindex(labels)
explode = [0.1 if city_type == "Urban" else 0 for city_type in labels]  # pull out the Urban wedge

# Build Pie Chart
fig1, ax1 = plt.subplots()
ax1.pie(ttl_rides_pie, explode=explode, labels=labels, colors=color_pie, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title("% Total Rides by City Type")
plt.show()


In [44]:
# Save the rides pie chart.
# Use the figure handle so the right figure is written regardless of which
# figure pyplot currently considers "current".
fig1.savefig("../Images/PiesTotalRides")

## Total Drivers by City Type

In [46]:
# Share of all drivers working in each city type.
# pdr_df has one row per ride, so a city's driver_count is repeated on every
# one of its rides. Reduce to one row per city before summing; otherwise each
# city's drivers are counted once per ride and busy cities are over-weighted.
drivers_per_city = pdr_df.drop_duplicates(subset="city")
ttl_drivers_pie = drivers_per_city.groupby("type")["driver_count"].sum()
labels = ttl_drivers_pie.index
color_pie = drivers_per_city.groupby("type")["color"].max().reindex(labels)
explode = [0.15 if city_type == "Urban" else 0 for city_type in labels]  # pull out the Urban wedge

fig1, ax1 = plt.subplots()
ax1.pie(ttl_drivers_pie, explode=explode, labels=labels, colors=color_pie, autopct='%1.1f%%',
        shadow=True, startangle=120)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title("% Total Drivers by City Type")

# Save before showing: once plt.show() returns the figure may already be
# closed, and a later pyplot-level savefig would write an empty image.
fig1.savefig("../Images/PiesTotalDrivers")
plt.show()

In [47]:
# Save
#plt.savefig("../Images/PiesTotalDrivers")