# Google Capstone Project

In [None]:
#Load the necessary libraries that will be utilized for the project
library(tidyverse)
library(lubridate)
library(janitor)
library(dplyr)
library(ggplot2)

### Load all the data, as well as combine every dataset

In [None]:
# Read the twelve monthly Divvy trip CSV files (202101 through 202112),
# one data frame per month.
trip20_Jan <- read.csv(file = "../input/divvytrips2021/202101-divvy-tripdata.csv")
trip20_Feb <- read.csv(file = "../input/divvytrips2021/202102-divvy-tripdata.csv")
trip20_Mar <- read.csv(file = "../input/divvytrips2021/202103-divvy-tripdata.csv")
trip20_Apr <- read.csv(file = "../input/divvytrips2021/202104-divvy-tripdata.csv")
trip20_May <- read.csv(file = "../input/divvytrips2021/202105-divvy-tripdata.csv")
trip20_Jun <- read.csv(file = "../input/divvytrips2021/202106-divvy-tripdata.csv")
trip20_Jul <- read.csv(file = "../input/divvytrips2021/202107-divvy-tripdata.csv")
trip20_Aug <- read.csv(file = "../input/divvytrips2021/202108-divvy-tripdata.csv")
trip20_Sep <- read.csv(file = "../input/divvytrips2021/202109-divvy-tripdata.csv")
trip20_Oct <- read.csv(file = "../input/divvytrips2021/202110-divvy-tripdata.csv")
trip20_Nov <- read.csv(file = "../input/divvytrips2021/202111-divvy-tripdata.csv")
trip20_Dec <- read.csv(file = "../input/divvytrips2021/202112-divvy-tripdata.csv")

### Combine every dataset to consolidate analysis 


In [None]:
# Stack the twelve monthly data frames, January first, into one data frame.
trips20fill <- do.call(
  rbind,
  list(
    trip20_Jan, trip20_Feb, trip20_Mar, trip20_Apr,
    trip20_May, trip20_Jun, trip20_Jul, trip20_Aug,
    trip20_Sep, trip20_Oct, trip20_Nov, trip20_Dec
  )
)


### View newly created dataset

In [None]:
View(trips20fill) # inspect the combined data frame

### First, remove all the irrelevant columns that won't be used for analysis

In [None]:
# Drop the coordinate, station-id and end-station-name columns, which are
# removed here as irrelevant to the analysis.
trips20fill <- select(
  trips20fill,
  -c(
    start_lat, start_lng, end_lat, end_lng,
    start_station_id, end_station_id, end_station_name
  )
)

### Review of the data and its parameters. 

In [None]:
colnames(trips20fill)  # list of column names
nrow(trips20fill)  # number of rows in the data frame
dim(trips20fill)  # dimensions (rows, columns) of the data frame
head(trips20fill, 6)  # first 6 rows; tail(trips20fill) shows the last rows
str(trips20fill)  # columns and their data types (numeric, character, etc.)
summary(trips20fill) # per-column summary; inspect the data before moving on to cleaning

### Additional columns must be created for date and time.

In [None]:
# Derive calendar columns from the ride start timestamp.
# as.Date() expects the default yyyy-mm-dd format at the start of the string.
trips20fill$date <- as.Date(trips20fill$started_at)
trips20fill$month <- format(trips20fill$date, "%m")
trips20fill$day <- format(trips20fill$date, "%d")
trips20fill$year <- format(trips20fill$date, "%Y")
trips20fill$day_of_week <- format(trips20fill$date, "%A")
# started_at is still text at this point (read.csv() does not parse
# timestamps), so it has to be converted to a date-time before the clock time
# can be extracted; calling format() directly on the text would leave the full
# timestamp string in place and the "%H:%M" parse below would yield only NA.
# A fixed time zone keeps the conversion independent of the machine's locale.
trips20fill$time <- format(
  as.POSIXct(trips20fill$started_at, tz = "UTC"),
  format = "%H:%M"
)
# Stored as POSIXct: the clock time is attached to the current date, as
# as.POSIXct() does when the format carries no date part.
trips20fill$time <- as.POSIXct(trips20fill$time, format = "%H:%M", tz = "UTC")

### Calculated field that shows the duration of each ride

In [None]:
# Create a calculated field holding the duration of every ride, in minutes.
# The units are requested explicitly: with the default units = "auto",
# difftime() picks secs/mins/hours/days from the data, so dividing the result
# by 60 only gives minutes when it happens to pick seconds.
trips20fill$ride_length <- as.double(
  difftime(trips20fill$ended_at, trips20fill$started_at, units = "mins")
)

### Check data structure. Confirm data types for time/date 

In [None]:
str(trips20fill) # confirm the column types; ride_length should be listed as num (double)

### Alter data type for time 

In [None]:
# Make sure ride_length is numeric for the analysis below.
# The value is already a double, so it is converted directly; a round trip
# through as.character() would only risk losing precision in the text form.
trips20fill$ride_length <- as.numeric(trips20fill$ride_length)

### Remove rides that start at "HQ QR" or have a negative ride length

In [None]:
# Drop rides that start at the "HQ QR" station or have a negative duration.
# NOTE(review): rows with an empty start_station_name are not removed here.
is_bad_ride <- trips20fill$start_station_name == "HQ QR" |
  trips20fill$ride_length < 0
# An NA in a logical row index makes `[` return a row filled with NA, so only
# rows where the condition is known to be TRUE are dropped; rows where it
# cannot be evaluated (e.g. missing ride_length) are kept unchanged.
trips20fill <- trips20fill[!(is_bad_ride %in% TRUE), ]

### Summarize the newly created ride_length column of the cleaned dataset

In [None]:
# Summary statistics of ride_length after the filtering step.
summary(trips20fill$ride_length)

## Analyze data 




### Calculating the mean, median, max and min ride length to determine the statistical spread for each membership type

In [None]:
# Mean, median, maximum and minimum ride_length for each membership type.
# The columns are named in the formula and the data frame is passed via
# `data =`, so the result columns are labelled member_casual / ride_length
# instead of carrying the "trips20fill$" prefix.
aggregate(ride_length ~ member_casual, data = trips20fill, FUN = mean)
aggregate(ride_length ~ member_casual, data = trips20fill, FUN = median)
aggregate(ride_length ~ member_casual, data = trips20fill, FUN = max)
aggregate(ride_length ~ member_casual, data = trips20fill, FUN = min)


### Order the days of the week within the dataset for future use

In [None]:
# Turn day_of_week into an ordered factor running Sunday through Saturday.
trips20fill$day_of_week <- factor(
  trips20fill$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  ),
  ordered = TRUE
)

### Create a weekday field as well as view column specifics 

In [None]:
# Count rides per membership type and weekday. The result is only printed;
# trips20fill itself is not modified.
trips20fill %>% 
  mutate(day_of_week = wday(started_at, label = TRUE)) %>%  # weekday label derived from started_at with wday()
  group_by(member_casual, day_of_week ) %>%  # one group per membership type and weekday
  summarise(number_of_rides = n())  # number of rows (rides) in each group

# Data Visualizations

In [None]:
# Total rides broken down by weekday and membership type.
# day_of_week is rebuilt with wday() as an ordered factor so the x axis follows
# the calendar order of the week; plain weekday-name strings would be plotted
# alphabetically.
trips20fill$day_of_week <- wday(
  as.Date(trips20fill$date),
  label = TRUE,
  abbr = FALSE
)
# The membership column is member_casual (the column used by every other
# summary in this file); there is no customer_type column to group by.
trips20fill %>%
  group_by(member_casual, day_of_week) %>%
  summarise(number_of_rides = n()) %>%
  arrange(member_casual, day_of_week) %>%
  ggplot(aes(x = day_of_week, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    x = 'Day of Week',
    y = 'Total Number of Rides',
    title = 'Rides per Day of Week',
    fill = 'Type of Membership'
  ) +
  scale_y_continuous(
    breaks = c(250000, 400000, 550000),
    labels = c("250K", "400K", "550K")
  )

#### The rides per day of week show casual riders peak on the Saturday and Sunday while members peak Monday through Friday. This indicates members mainly use the bikes for their commutes and not leisure. 

In [None]:
# Total rides per month, split by membership type, as a dodged bar chart.
trips20fill %>%
  group_by(member_casual, month) %>%
  summarise(
    total_rides = n(),
    `average_duration_(mins)` = mean(ride_length)
  ) %>%
  arrange(member_casual) %>%
  ggplot(aes(x = month, y = total_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    x = "Month",
    y = "Total Number of Rides",
    title = "Rides per Month",
    fill = "Type of Membership"
  ) +
  scale_y_continuous(
    breaks = c(100000, 200000, 300000, 400000),
    labels = c("100K", "200K", "300K", "400K")
  ) +
  theme(axis.text.x = element_text(angle = 45))

#### The rides per month show that casual riders were a lot more active during the summer months than the long-term members. Conversely, the winter months show very little activity on the part of the casual users. The long-term users are more active in the winter and spring months.

In [None]:
# Number of rentals per bike type, split by membership type.
trips20fill %>%
  ggplot(aes(x = rideable_type, fill = member_casual)) +
  geom_bar(position = "dodge") +
  labs(
    x = "Type of Bike",
    y = "Number of Rentals",
    title = "Which bike works the most",
    fill = "Type of Membership"
  ) +
  scale_y_continuous(
    breaks = c(500000, 1000000, 1500000),
    labels = c("500K", "1Mil", "1.5Mil")
  )

#### This chart breaks down which type of bike is most popular with each type of user. Of the two main bike types, classic and electric, both membership types prefer the classic bike over the electric bike. Of the two membership types, the long-term members are the ones who most favour the classic bike.

In [None]:
# Average time spent riding by each membership type on each day of the week.
# ride_length is computed in minutes earlier in this file, so the y axis is
# labelled in minutes rather than hours, and the title names the grouping
# actually plotted (day of the week).
trips20fill %>%
  mutate(day_of_week = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, day_of_week) %>%
  ggplot(aes(x = day_of_week, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    x = 'Days of the week',
    y = 'Average duration - Mins',
    title = 'Average ride time per day of week',
    fill = 'Type of Membership'
  )

#### The average ride time shows a stark difference between the casuals and members. Casuals overall spend more time using the service than their full time member counter-parts.

# what does the data tell us?

## key takeaways
- Casual users tended to ride more in the warmer months in Chicago, namely June–August. Their participation exceeded that of the long-term members.
- Furthermore, casual users spent on average a lot longer per ride than their long-term counterparts.
- The days of the week also show that casual riders prefer to use the service during the weekends, as their usage peaked then. The long-term members conversely utilised the service more throughout the typical work week, i.e. Monday–Friday.
- Long term riders tended to stick more so to classic bikes as opposed to the docked or electric bikes.


 