# Cyclistic Bikeshare Case Study

## Import the necessary libraries

In [1]:
import pandas as pd               # data manipulation and analysis
import numpy as np                # efficient data types
import matplotlib.pyplot as plt   # plotting visuals
import seaborn as sns             # visualization module
import glob                       # pathnames matching
import os                         # data file path handling
import datetime                   # For date manipulation
import math                       # For math functions
import h3                         # To calculate distance

In [None]:
# Import all csv files and concatenate them into one dataframe.

# Folder holding the trip CSV exports (raw string keeps the Windows backslashes literal).
path = r'C:\Users\Victor Muange\Desktop\Cyclistic Case Study\Bikeshare dataset Apr-2020 to Mar-2021'
# glob returns matches in arbitrary order; sorting makes the concatenated row order reproducible.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # Without this check pd.concat fails with an opaque "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {path!r}")
# Generator: each file is read only when pd.concat iterates over it.
df_from_each_file = (pd.read_csv(f) for f in all_files)
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
Main_df = pd.concat(df_from_each_file, ignore_index = True)

## Exploring the data for cleaning.

In [None]:
# Preview the first five rows of the combined data
Main_df.head(n=5)

In [None]:
# Check the number of rows in the dataframe.
Main_df.shape[0]

In [None]:
# Get information from our dataframe (number of records, memory use and data types)
# memory_usage='deep' asks pandas to inspect object (string) columns so the reported memory is the actual footprint
Main_df.info(memory_usage = 'deep')

In [None]:
# Summary statistics for every numeric column, rendered with three decimal places
Main_df.describe().apply(lambda column: column.map('{0:.3f}'.format))

# A. Data cleaning

## Checking for duplicate values

In [None]:
# Report how many distinct (non-null) values each column holds
print("Unique values per column")
for column_name, n_unique in Main_df.nunique().items():
    print(f"{column_name}: {n_unique}")

### Finding the duplicates for ride_id

In [None]:
def get_duplicates(field: str, df: "pd.DataFrame | None" = None) -> "pd.DataFrame | None":
    """Report values of ``field`` that occur more than once and return sample rows.

    Prints how many distinct values of ``field`` appear on more than one row.
    Missing values are not counted, because ``value_counts`` skips them.

    Args:
        field: Name of the column to check for repeated values.
        df: Frame to inspect. Defaults to the notebook-level ``Main_df``.

    Returns:
        Up to five rows whose ``field`` value is duplicated, sorted by
        ``field``, or ``None`` when no value is duplicated.
    """
    frame = Main_df if df is None else df
    counts = frame[field].value_counts()
    duplicated_values = counts[counts > 1].index.to_list()
    n_duplicates = len(duplicated_values)
    print(f"Number of duplicates for {field}: {n_duplicates}")
    if n_duplicates == 0:
        # Nothing to show; an explicit None keeps the notebook cell output empty.
        return None
    print("Sample rows:")
    mask = frame[field].isin(duplicated_values)
    return frame[mask].sort_values(by=field).head()

# Show how many ride_id values repeat, plus a few example rows
get_duplicates(field='ride_id')

## Drop duplicate ride_id

In [None]:
# Keep only the first row seen for each ride_id
Main_df = Main_df.drop_duplicates(subset=['ride_id'], keep='first')

In [None]:
# Re-run the duplicate check; the reported count should now be zero
get_duplicates(field='ride_id')

# Check for NaN values

In [None]:
# Number of rows with a missing start_station_id
Main_df['start_station_id'].isnull().sum()

## Drop NaN values in start_station_id

In [None]:
# Remove every row whose start_station_id is missing
Main_df = Main_df.dropna(axis=0, subset=['start_station_id'])

In [None]:
# Confirm that no missing start_station_id values remain.
Main_df['start_station_id'].isnull().sum()

In [None]:
# Number of rows with a missing end_station_id
Main_df['end_station_id'].isnull().sum()

## Drop NaN values in end_station_id

In [None]:
# Remove every row whose end_station_id is missing
Main_df = Main_df.dropna(axis=0, subset=['end_station_id'])

In [None]:
# Confirm that no missing end_station_id values remain.
Main_df['end_station_id'].isnull().sum()

# B. Data Manipulation.
### Adding the necessary columns

## 1. Add a ride_duration column 

In [None]:
# Cast both ride timestamp columns from their loaded type to datetime64[ns]
Main_df = Main_df.astype({
    'started_at': 'datetime64[ns]',
    'ended_at': 'datetime64[ns]',
})

In [None]:
# Ride length in minutes: elapsed time (end minus start) divided by a one-minute Timedelta
Main_df['ride_duration_mins'] = Main_df['ended_at'].sub(Main_df['started_at']) / pd.Timedelta(minutes=1)

### 2. Add a day column to show which day the ride starts

In [None]:
# Name of the weekday on which each ride started, as returned by day_name()
Main_df['day'] = pd.to_datetime(Main_df['started_at']).dt.day_name()

### 3. Add a month column (month name and year)

In [None]:
# Label each ride with its start month formatted as '%B-%Y' (full month name, a hyphen, four-digit year)
Main_df['month'] = pd.to_datetime(Main_df['started_at']).dt.strftime('%B-%Y')

### 4. Add a ride_distance column
#### Compute the great-circle distance (km) between the start and end coordinates

In [None]:
def _haversine_km(lat1, lng1, lat2, lng2):
    """Return great-circle distances in kilometres between two sets of points.

    All four arguments are array-likes of equal length holding degrees. The
    haversine formula is evaluated on whole arrays at once, which avoids a
    Python-level call per row.
    """
    earth_radius_km = 6371.007180918475  # same Earth radius constant the H3 library uses
    lat1, lng1, lat2, lng2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (lat1, lng1, lat2, lng2)
    )
    sin_dlat = np.sin((lat2 - lat1) / 2.0)
    sin_dlng = np.sin((lng2 - lng1) / 2.0)
    a = sin_dlat ** 2 + np.cos(lat1) * np.cos(lat2) * sin_dlng ** 2
    # Rounding can push `a` marginally outside [0, 1]; clip so the square roots stay real.
    a = np.clip(a, 0.0, 1.0)
    return 2.0 * earth_radius_km * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))


# Straight-line (great-circle) distance in km between each ride's start and end coordinates
Main_df['ride_distance'] = _haversine_km(
    Main_df['start_lat'], Main_df['start_lng'],
    Main_df['end_lat'], Main_df['end_lng'],
)

In [None]:
# Count number of rows where ride_distance is equal to or less than zero
# (summing the boolean mask counts its True entries)

(Main_df['ride_distance'] <= 0).sum()

In [None]:
# Total number of rows currently in Main_df
Main_df.shape[0]

In [None]:
# Drop the zero and negative values in ride_distance
Main_df = Main_df.drop(index=Main_df.index[(Main_df['ride_distance'] <= 0).to_numpy()])

In [None]:
# Check that there are no more zero and negative values in ride_distance
(Main_df['ride_distance'] <= 0).sum()

In [None]:
# Total number of rows currently in Main_df
Main_df.shape[0]

### 5. Add a start_hour column
It helps determine what time of day most riders get the bikes

In [None]:
# Hour component of each ride's start timestamp
Main_df['start_hour'] = Main_df['started_at'].dt.hour

In [None]:
# Preview the first five rows with the newly added columns
Main_df.head(n=5)

### Extract the seasons from started_at

In [None]:
# Season code -> season name
seasons = {
    1: 'Winter',
    2: 'Spring',
    3: 'Summer',
    4: 'Autumn',
}

# (month % 12 + 3) // 3 folds the month number into a code 1-4:
# Dec/Jan/Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4; the code is then mapped to its name.
Main_df['season'] = (
    Main_df['started_at'].dt.month
    .mod(12)
    .add(3)
    .floordiv(3)
    .map(seasons)
)

In [None]:
# Preview the first five rows, now including the season column
Main_df.head(n=5)

## Now the data is clean, we have all the columns we need, and it is ready for analysis.

### Note: At this stage, you can visualize the data using Python, or you can export your data as csv and visualize using Tableau.

In [None]:
# Number of rides (non-null ride_id values) per rider type
Viz_1 = Main_df.groupby(['member_casual'])['ride_id'].count()

In [None]:
# Plot a pie chart that shows the percentage of total rides between casual riders and members.
# Print the ride totals straight from Viz_1 so the figures always match the data being plotted.
for rider_type, ride_count in Viz_1.items():
    print(f"{rider_type}: {int(ride_count):,}")
plot = Viz_1.plot.pie(autopct='%1.0f%%', figsize=(5, 5))

In [None]:
# Number of rides (non-null ride_id values) per weekday and rider type
Viz_2 = Main_df.groupby(['day', 'member_casual'])['ride_id'].count()

In [None]:
#Viz_3 = Main_df.groupby(['start_hour']).count()['ride_id']
#Viz_3

In [None]:
#plt.plot(Viz_3['ride_id'].count(), Viz_3['start_hour'])

In [None]:
#Viz_4 = Main_df.groupby(['day']).count()['ride_id']

In [None]:
#x = Viz_4['day']
#y = Viz_4['day'])
#plt.bar(Viz_4['day']), )
#plt.show()

In [None]:
#Main_df.head()