# Getting started with Project

## Import libraries

In [2]:
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import seaborn as sns
import matplotlib.pyplot as plt

### Data

In [3]:
CITY_DATA = {'chicago': 'chicago.csv',
      'new york city': 'new_york_city.csv',
      'washington': 'washington.csv'}

In [4]:
# Initializing dictionary for storing cleaned data
cleaned_city_data={}

### Cleaning Data

In [18]:
# Function to process and clean data
def cleanData():
    """
    Cleans and processes raw bikeshare data for each city.

    - Removes unnecessary columns.
    - Converts time columns to datetime format.
    - Adds new columns like Month, Day, Start time, End time, and Age.
    - Handles missing values for Gender and Birth Year.
    - Saves cleaned data into new CSV files.

    Returns: A dictionary with city names as keys and paths to their cleaned CSV files as values.
    """

    for city, filepath in CITY_DATA.items():
        print(f"\nProcessing data for: {city.title()} ({filepath})")

        # Loading file
        df = pd.read_csv(filepath) 
 
         # Dropping unwanted file i.e: Unnamed: 0
        df.drop(columns='Unnamed: 0', inplace=True) 

        #  Converting time data from objects to standard time type
        df['Start Time'] = pd.to_datetime(df['Start Time'])  
        df['End Time'] = pd.to_datetime(df['End Time'])

        # Adding new features to data i.e: month, day, start time and end time
        df['Month'] = df['Start Time'].dt.month_name().str.lower()
        df['Day'] = df['Start Time'].dt.day_name().str.lower()
        df['Start time'] = df['Start Time'].dt.time
        df['End time'] = df['End Time'].dt.time

        # Making day col Starndard
        df['Day'] = df['Day'].apply(lambda x: "monday" if x == 0 else "tuesday" if x == 1 else "wednesday" if x == 2 else 
                                    "thursday" if x == 3 else "friday" if x == 4 else "saturday" if x == 5 else "sunday" if x == 6 else x)

        # Converting columns gender and birth year to string object
        if('Gender' not in df and 'Birth Year' not in df):
            df['Gender'] = 'N/A'
            df['Birth Year'] = 'N/A'
        else:
            df['Gender'] = df['Gender'].astype(str).str.lower()
            df['Birth Year'] = df['Birth Year'].astype(str).str.lower()

        # Dropping unwanted file since, it has no further use
        df.drop(columns='Start Time', inplace=True)
        df.drop(columns='End Time', inplace=True)

        # Adding a new column i.e: Age
        df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce')
        df['Age'] = df['Birth Year'].apply(lambda x: 0 if pd.isnull(x) else 2017-int(x))
            
        # Save cleaned data to new CSV file
        cleaned_filepath = f"{city.replace(' ', '_')}_cleaned.csv"
        df.to_csv(cleaned_filepath, index=False)

        # Add city and cleaned CSV path to dictionary
        cleaned_city_data[city] = cleaned_filepath
            
    return cleaned_city_data

### Filtering and Loading Data

In [16]:
# Filtering data as per user need
def get_filters():
        """
        Asks user to specify a city, month, and day to analyze.
    
        Returns:
            (str) city - name of the city to analyze
            (str) month - name of the month to filter by, or "all" to apply no month filter
            (str) day - name of the day of week to filter by, or "all" to apply no day filter
        """
    
        print('Hello! Let\'s explore some US bikeshare data!')
        
        # Getting user input for city i.e: chicago, new_york_city_cleaned and washington
        cities = ['chicago', 'new york city', 'washington']
        while True:
            city = input("\nEnter city name (chicago, new york city, washington): ").strip().lower()
            if city in cities:
                break
            else:
                print("Invalid input. Please enter a valid city name.")
    
        # Getting user input for month (all, january, february, ... , june)
        months = ['january', 'february', 'march', 'april', 'may', 'june', 'all']
        while True:
            month = input("\nEnter month (january, february, ....., june or 'all'): ").strip().lower()
            if month in months:
                break
            else:
                print("Invalid input. Please enter a valid month.")
    
    
        # Getting user input for day of week (all, monday, tuesday, ... sunday)
        days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'all']
        while True:
            day = input("\nEnter day of week (monday, tuesday, ...., sunday or 'all'): ").strip().lower()
            if day in days:
                break
            else:
                print("Invalid input. Please enter a valid day.")
    
        print('-'*40)
        return city, month, day

In [19]:
# Loading cleaned data for EDA(Exploratory Data Analysis)
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    """
    
    # Loading data based on city
    df = pd.read_csv(cleaned_city_data[city])
    select_df = df
    
    # loading data based on month & day
    if(month!='all'):
        select_df = df[df['Month'].isin([month])]

    if(day!='all'):
        select_df = select_df[select_df['Day'].isin([day])]
    
    return select_df

### Calculating and Printing Stats

In [21]:
# Caculating and printing time stats
def time_stats(df):
    """Displays statistics on the most frequent times of travel."""

    print('\nCalculating The Most Frequent Times of Travel...\n')
    # start time of the function/calculation
    start_time = time.time()
    formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
    print(f"Code started executing at: {formatted_time}")

    select_df = df
    
    # display the most common month
    common_month = select_df['Month'].value_counts().idxmax()
    
    # display the most common day of week
    common_week = select_df['Day'].value_counts().idxmax()
    
    # display the most common start hour
    common_starthr = select_df['Start time'].value_counts().idxmax()

    # display most common end hour
    common_endhr = select_df['End time'].value_counts().idxmax()

    # Formatting 
    s_hours, s_minutes, s_seconds = common_starthr.split(':')
    e_hours, e_minutes, e_seconds = common_endhr.split(':')
    
    s_formatted_time = f"{s_hours}hr" + \
                 (f" {s_minutes}min" if int(s_minutes) > 0 else "") + \
                 (f" {s_seconds}sec" if int(s_seconds) > 0 else "")
    
    e_formatted_time = f"{e_hours}hr" + \
                 (f" {e_minutes}min" if int(e_minutes) > 0 else "") + \
                 (f" {e_seconds}sec" if int(e_seconds) > 0 else "")
    
    print(f"Most Common Month        : {common_month.title()}")
    print(f"Most Common Day of Week  : {common_week.title()}")
    print(f"Most Common Start Hour   : {s_formatted_time}")
    print(f"Most Common End Hour     : {e_formatted_time}")


    duration = start_time-time.time()
    print(f"Code took: {duration:.2f} seconds")
    print('-'*40)

In [22]:
# Calculating and printing time stats
def station_stats(df):
    """Displays statistics on the most popular stations and trip."""

    print('\nCalculating The Most Popular Stations and Trip...\n')
    # Start time of function/calculation
    start_time = time.time()
    print(start_time)
    
    select_df = df
    
    # display most commonly used start station
    common_startSt = select_df['Start Station'].value_counts().idxmax()
    
    # display most commonly used end station
    common_EndSt = select_df['End Station'].value_counts().idxmax()
    
    # display most frequent combination of start station and end station trip
    common_trip = df[['Start Station', 'End Station']].value_counts().idxmax()

    print(f"Most Common Start Station        : {common_startSt}")
    print(f"Most Common End Station          : {common_EndSt}")
    print(f"Most Frequent Trip Combination   : {common_trip[0]}  TO  {common_trip[1]}")

    duration = start_time-time.time()
    print(f"Code took: {duration:.2f} seconds")
    print('-'*40)

In [23]:
# Calculating and printing trip_duration_stats
def trip_duration_stats(df):
    """Displays statistics on the total and average trip duration."""
    
    print('\nCalculating Trip Duration...\n')

    select_df = df

    # Total and average duration
    total_duration = select_df['Trip Duration'].sum()
    mean_duration = select_df['Trip Duration'].mean()

    # Min and max durations
    min_duration = select_df['Trip Duration'].min()
    max_duration = select_df['Trip Duration'].max()

    # Average duration by user type
    avg_trip_user_type = df.groupby('User Type')['Trip Duration'].mean()

    print(f"Total Travel Time     : {format_duration(total_duration)}")
    print(f"Average Travel Time   : {format_duration(mean_duration)}")
    print(f"Minimum Travel Time   : {format_duration(min_duration)}")
    print(f"Maximum Travel Time   : {format_duration(max_duration)}")

    if 'Subscriber' in avg_trip_user_type:
        print(f"Average Trip by Subscriber : {format_duration(avg_trip_user_type['Subscriber'])}")
    if 'Customer' in avg_trip_user_type:
        print(f"Average Trip by Customer   : {format_duration(avg_trip_user_type['Customer'])}")

In [24]:
# Helper function to format duration in seconds into 'X days Y hr Z min'
def format_duration(seconds):
    """This function help in formatting time into standard format"""
    
    duration = timedelta(seconds=int(seconds))
    days = duration.days
    hours = (seconds % 86400) // 3600
    minutes = (seconds % 3600) // 60

    time_str = (f"{days} days" if days > 0 else "") + \
               (f" {hours:.0f} hr" if hours > 0 else "") + \
               (f" {minutes:.0f} min" if minutes > 0 else "")
    
    return time_str.strip()

In [25]:
# Calculating and printing User stats
def user_stats(df):
    """Displays statistics on bikeshare users."""
    
    print('\nCalculating User Stats...\n')
    start_time = time.time()
    # print(start_time)
    select_df=df
    # Display earliest, most recent, and most common year of birth
    
    # Display counts of user types
    user_types = select_df['User Type'].value_counts()
        
    # Display counts of gender
    gender_count = select_df['Gender'].value_counts()

    # Display avg age
    avg_age = select_df['Age'].mean()
        
    print(f"User Types: Subscriber: {user_types.values[0] if len(user_types) > 0 else 'No Data'}, Customer: {user_types.values[1] if len(user_types) > 1 else 'No Data'}")
    
    print(f"Gender Counts: {'No Data Available' if gender_count.empty else f'Male: {gender_count[0]}, Female: {gender_count[1]}'}")

    print(f"Average age: {avg_age:.0f}")

    if(select_df['Birth Year'].isnull().all()):
        
         print("\nBirth Year Statistics: Not Availabel")
    else:
         # Most earliest Job
        earliest_yob = select_df['Birth Year'].min()

        # Most recent year of birth
        most_recent_yob = select_df['Birth Year'].max()
    
        # Most common year of birth
        most_common_yob = select_df['Birth Year'].mode()[0]

        print("\nBirth Year Statistics:")
        print(f"Earliest Year of Birth      : {int(earliest_yob)}")
        print(f"Most Recent Year of Birth   : {int(most_recent_yob)}")
        print(f"Most Common Year of Birth   : {int(most_common_yob)}")

    duration = start_time-time.time()
    print(f"Code took: {duration:.2f} seconds")
    print('-'*40)

### Data Visualization for more Insights

In [26]:
# Visual stats for a city and any particular month and particular day
def showStats(city, month, day):
    """
    Displays visual statistical insights for a selected city, month, and day.

    - Shows top 10 most frequent start-end station pairs.
    - Compares trip duration by user type (Subscribers vs Customers).

    Args:
        city (str): Name of the city.
        month (str): Selected month.
        day (str): Selected day.
    """
    df = load_data(city, month, day)

    print(f"\n Showing Statistical Insights for:\n City: {city.title()}, Month: {month.title()}, Day: {day.title()}\n")
    
    print('_'*50)
    # Plot top N busiest stations on that day
    top_stations = df[['Start Station', 'End Station']].value_counts().head(10)
    
    station_labels = [f"{start} → {end}" for start, end in top_stations.index]
    
    sns.barplot(x=top_stations.values, y=station_labels, palette='viridis')
    plt.title('Top 10 Most Frequent Station Pairs\n')
    plt.xlabel('Trip Count')
    plt.ylabel('Station Pair')
    plt.show()

    # Trip duration vs User type
    sns.barplot(x=df['User Type'], y=df['Trip Duration'], palette='viridis')
    plt.title('How many subscribers spend time riding vs customers\n')
    plt.xlabel('User type')
    plt.ylabel('Trip Duration(sec)')
    plt.show()

    print('_'*50)
    

In [27]:
# Visual stats for a city and a month wrt all day
def showStatsDay(city, month, day):
    """
    Displays visual statistical insights for a selected city and month across all days.

    - Shows average trip duration per day.
    - Displays a comparison of user types (Subscribers vs Customers) per day.

    Args:
        city (str): Name of the city.
        month (str): Selected month.
        day (str): Selected day (used for filtering but the plots consider all days).
    """
    
    df = load_data(city,month,day)
    print(f"\n Showing Statistical Insights for:\n City: {city.title()}, Month: {month.title()}, Day: {day.title()}\n")

    print('_'*50)
    sns.barplot(x=df['Day'], y=df['Trip Duration'], palette='viridis')
    plt.title('Usage according to day\n')
    plt.xlabel('Days')
    plt.ylabel('Trip Duration(sec)')
    plt.show()

        # Plot grouped bar plot
    sns.countplot(data=df, x='Day', hue='User Type')
    
    # Title and labels
    plt.title('User Type vs Day')
    plt.xlabel('Day of the Week')
    plt.ylabel('Number of Trips')
    plt.legend(title='User Type')
    plt.xticks(rotation=0)
    plt.show()

    print('_'*50)

In [28]:
# Visual stats for a city and a day wrt all month
def showStatsMonth(city, month, day):
    """
    Displays visual statistical insights for a selected city and day across all months.

    - Shows the number of trips per month to highlight seasonal usage patterns.
    - Compares user types (Subscribers vs Customers) for each month.

    Args:
        city (str): Name of the city.
        month (str): Selected month (used for filtering but the plots consider all months).
        day (str): Selected day.
    """
    
    df = load_data(city,month,day)
    print(f"\n Showing Statistical Insights for:\n City: {city.title()}, Month: {month.title()}, Day: {day.title()}\n")
    
    print('_'*50)
    # Maximum isage of bikeshare system wrt month
    sns.countplot(data=df, x='Month')
    plt.title('Seasonal Patterns – Trips per Month')
    plt.xlabel('Month')
    plt.ylabel('Trip Count')
    plt.xticks(rotation=45)
    plt.show()

    # User type trend
    sns.countplot(data=df, x='Month', hue='User Type')
    plt.title('User Type Trends Over Months')
    plt.xlabel('Month')
    plt.ylabel('Trip Count')
    plt.legend(title='User Type')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    print('_'*50)

In [29]:
# main function
def main():
    """
    Main function to run the US Bikeshare data analysis program.

    - Cleans raw bikeshare data for all cities.
    - Asks user to filter data by city, month, and day.
    - Shows various statistics like time, station usage, trip duration, and user demographics.
    - Optionally displays visual statistical plots for deeper insights.
    - Optionally displays raw data in chunks of 5 rows.
    - Offers the option to restart the analysis.

    Final output: Insightful statistics and optional visualizations for bikeshare data and raw data veiw.
    """
    cleanData()
    
    while True:
        city, month, day = get_filters()
        df = load_data(city, month, day)
    
        time_stats(df)
        station_stats(df)
        trip_duration_stats(df)
        user_stats(df)
        
        while True:
            want_VisualStats = input('\nWould you like to see some Statistical plots for more Insights? Enter yes or no:\n').strip().lower()
        
            if want_VisualStats == 'yes':
                city, month, day = get_filters()
        
                if day == 'all' and month == 'all':
                    print("Please select either a city or a month or both — not all for both.")
                else:
                    if day != 'all' and month != 'all':
                        showStats(city, month, day)
                    elif day == 'all' and month != 'all':
                        showStatsDay(city, month, day)
                    elif day != 'all' and month == 'all':
                        showStatsMonth(city, month, day)
                    else:
                        print("**Available data is insufficient**")
        
            elif want_VisualStats == 'no':
                break
        
            else:
                print("Invalid input! Please enter 'yes' or 'no'. Try again.")

            
        while True:
            i=5
            raw_data = input('\nWould you like to see rew data? Enter yes or no.\n')
            if raw_data.lower() == "yes":
                print(df.head(i))
                i+=5
            elif raw_data.lower() == "no":
                print("Thank You")
                break
        
        restart = input('\nWould you like to restart? Enter yes or no.\n')
        if restart.lower() == "yes":
            continue
        elif restart.lower() == "no":
            print("Thank You")
            break
        

In [30]:
if __name__ == "__main__":
	main()


Processing data for: Chicago (chicago.csv)

Processing data for: New York City (new_york_city.csv)

Processing data for: Washington (washington.csv)
Hello! Let's explore some US bikeshare data!



Enter city name (chicago, new york city, washington):  chicago

Enter month (january, february, ....., june or 'all'):  janaury


Invalid input. Please enter a valid month.



Enter month (january, february, ....., june or 'all'):  june

Enter day of week (monday, tuesday, ...., sunday or 'all'):  all


----------------------------------------


  print(f"Gender Counts: {'No Data Available' if gender_count.empty else f'Male: {gender_count[0]}, Female: {gender_count[1]}'}")



Calculating The Most Frequent Times of Travel...

Code started executing at: 2025-03-20 15:36:44
Most Common Month        : June
Most Common Day of Week  : Friday
Most Common Start Hour   : 17hr 12min 38sec
Most Common End Hour     : 17hr 36min 41sec
Code took: -0.08 seconds
----------------------------------------

Calculating The Most Popular Stations and Trip...

1742465204.9017122
Most Common Start Station        : Streeter Dr & Grand Ave
Most Common End Station          : Streeter Dr & Grand Ave
Most Frequent Trip Combination   : Lake Shore Dr & Monroe St  TO  Streeter Dr & Grand Ave
Code took: -0.03 seconds
----------------------------------------

Calculating Trip Duration...

Total Travel Time     : 1188 days 16 hr 33 min
Average Travel Time   : 17 min
Minimum Travel Time   : 1 min
Maximum Travel Time   : 23 hr 57 min
Average Trip by Subscriber : 12 min
Average Trip by Customer   : 31 min

Calculating User Stats...

User Types: Subscriber: 72436, Customer: 25645
Gender Counts:


Would you like to see some Statistical plots for more Insights? Enter yes or no:
 no

Would you like to see rew data? Enter yes or no.
 no


Thank You



Would you like to restart? Enter yes or no.
 no


Thank You
