In [None]:
import pandas as pd

In [None]:
# Load the 2022-2023 e-scooter trip records from a CSV in the working directory.
escooter_df = pd.read_csv('2022-2023_escooter_LimeLyftSpine.csv')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-specific helper).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load iris.csv from the mounted Drive folder into `df`.
# NOTE(review): `df` is not referenced by any other cell visible here — confirm it is still needed.
df = pd.read_csv('/content/drive/MyDrive/dataset/iris.csv')

In [None]:
# Show the column names of the trip frame as loaded from the CSV.
escooter_df.columns

Index(['Trip ID', 'Start Time', 'End Time', 'Vendor',
       'Start Community Area Number', 'End Community Area Number',
       'Start Community Area Name', 'End Community Area Name',
       'Start Centroid Latitude', 'Start Centroid Longitude',
       'Start Centroid Location', 'End Centroid Latitude',
       'End Centroid Longitude', 'End Centroid Location',
       'Trip Distance (miles)', 'Trip Duration (minutes)', 'hour_of_day',
       'season', 'trip_duration', 'trip_duration_minutes'],
      dtype='object')

In [None]:
# Parse both trip timestamp columns to datetime; format="mixed" lets pandas
# infer the format of each value individually.
for time_col in ('Start Time', 'End Time'):
    escooter_df[time_col] = pd.to_datetime(escooter_df[time_col], format="mixed")

In [None]:
# Earliest and latest trip start timestamps in the data set.
trip_starts = escooter_df['Start Time']
start_date, end_date = trip_starts.min(), trip_starts.max()

print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")

Start Date: 2022-05-10 09:00:00
End Date: 2023-09-30 19:00:00


In [None]:
# Calendar date on which each trip started.
ride_dates = escooter_df['Start Time'].dt.date

# Number of trips per calendar date, as a two-column frame ('Start Time', 'count').
df_day = escooter_df.groupby(ride_dates).size().reset_index(name='count')

# Broadcast each date's total back onto every trip that started on that date.
rides_per_date = df_day.set_index('Start Time')['count']
escooter_df['daily_ride_count'] = ride_dates.map(rides_per_date)

In [None]:
# Map a month number to its season label
def get_season(month):
    """Return the season name for a month number.

    Months 3-5 are 'Spring', 6-8 'Summer', 9-11 'Fall'; every other value
    (including 12, 1 and 2) is reported as 'Winter'.
    """
    season_bounds = (
        (3, 5, 'Spring'),
        (6, 8, 'Summer'),
        (9, 11, 'Fall'),
    )
    for first_month, last_month, label in season_bounds:
        if first_month <= month <= last_month:
            return label
    return 'Winter'

# Label every trip with the season of its start month.
start_months = escooter_df['Start Time'].dt.month
escooter_df['season'] = start_months.map(get_season)

In [None]:
def get_day_of_week(timestamp):
    """Return the full weekday name of ``timestamp`` (its ``%A`` strftime field)."""
    weekday_format = "%A"
    return timestamp.strftime(weekday_format)

# Weekday name ("Monday" ... "Sunday") of each trip's start time.
# Uses the vectorized .dt accessor rather than a Python-level call per row;
# it always yields English names and leaves missing timestamps as missing
# values instead of raising.
escooter_df['day_of_week'] = escooter_df['Start Time'].dt.day_name()

In [None]:
# Re-list the columns to confirm the derived ones were added.
escooter_df.columns

Index(['Trip ID', 'Start Time', 'End Time', 'Vendor',
       'Start Community Area Number', 'End Community Area Number',
       'Start Community Area Name', 'End Community Area Name',
       'Start Centroid Latitude', 'Start Centroid Longitude',
       'Start Centroid Location', 'End Centroid Latitude',
       'End Centroid Longitude', 'End Centroid Location',
       'Trip Distance (miles)', 'Trip Duration (minutes)', 'hour_of_day',
       'season', 'trip_duration', 'trip_duration_minutes', 'daily_ride_count',
       'day_of_week'],
      dtype='object')

In [None]:
# Write the enriched trip frame to a new CSV in the working directory, omitting the row index.
escooter_df.to_csv('2022-2023_escooter_LimeLyftSpine_season_day_of_week.csv',index=False)

In [None]:
# Number of trips per season label.
escooter_df['season'].value_counts()

season
Summer    403471
Fall      215384
Spring    106217
Winter     11432
Name: count, dtype: int64

In [None]:
# Columns used by the regression cells. escooter_df stores the per-day trip
# total as 'daily_ride_count' (it has no 'count' column), so select that
# column and rename it to 'count', the name the modelling code reads.
# Column selection and rename each return a new frame, so escooter_df is untouched.
independent_var_df = escooter_df[
    [
        'Vendor',
        'Trip Distance (miles)',
        'Trip Duration (minutes)',
        'hour_of_day',
        'season',
        'daily_ride_count',
        'day_of_week',
    ]
].rename(columns={'daily_ride_count': 'count'})

In [None]:
# Split the modelling frame into one subset per season label.
season_of_trip = independent_var_df['season']
escooter_Summer = independent_var_df[season_of_trip == 'Summer']
escooter_Winter = independent_var_df[season_of_trip == 'Winter']
escooter_Fall = independent_var_df[season_of_trip == 'Fall']
escooter_Spring = independent_var_df[season_of_trip == 'Spring']

In [None]:
# (rows, columns) of the Summer subset.
escooter_Summer.shape

(403471, 7)

In [None]:
# (rows, columns) of the Winter subset.
escooter_Winter.shape

(11432, 7)

In [None]:
# (rows, columns) of the Fall subset.
escooter_Fall.shape

(215384, 7)

In [None]:
# (rows, columns) of the Spring subset.
escooter_Spring.shape

(106217, 7)

In [None]:
import matplotlib
matplotlib.use('Agg')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels
from statsmodels.genmod.families import Poisson
from statsmodels.tools.eval_measures import rmse

#from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides warnings raised during the model
# fits below — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Display the installed statsmodels version.
statsmodels.__version__

'0.14.2'

In [None]:
# Largest value in the 'Trip Distance (miles)' column.
escooter_df['Trip Distance (miles)'].max()

20.139838408949657

In [None]:
# Smallest value in the 'Trip Distance (miles)' column.
escooter_df['Trip Distance (miles)'].min()

0.0

In [None]:
# poisson regression
def _significance_stars(p_val):
    """Return the star marker for a p-value: '***' < 0.001, '**' < 0.01, '*' < 0.05, else ''."""
    if p_val < 0.001:
        return '***'
    if p_val < 0.01:
        return '**'
    if p_val < 0.05:
        return '*'
    return ''


def poission_regression(df,season_type):
    """Fit a Poisson GLM of ``count`` on trip features and print a report.

    The frame is extended with one-hot columns for ``day_of_week`` and
    ``Vendor``; the regressors are the Lime/Lyft/Spin vendor dummies, trip
    distance, trip duration and the seven weekday dummies, plus a constant.
    The function prints the LaTeX model summary, each coefficient with its
    significance marker and p-value, the coefficient table and the in-sample
    RMSE, then passes the design matrix and response to ``pr_predict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'count', 'Vendor', 'day_of_week',
        'Trip Distance (miles)' and 'Trip Duration (minutes)'. The caller's
        frame is not modified (``pd.concat`` builds a new one).
    season_type : str
        Label used in the printed headings and forwarded to ``pr_predict``.

    Raises
    ------
    KeyError
        If any listed vendor or weekday does not occur in ``df``, because its
        dummy column is then absent.
    """
    # Backslashes are written as "\\" because "\-" and "\p" are invalid escape
    # sequences; the printed text is unchanged.
    print("\\--------------------------------------")
    print("\\poisson regression with constant")
    print("\\--------------------------------------")
    # Side effect: changes the global pandas display option for the session.
    pd.set_option('display.max_columns', None)
    df = pd.concat((df, pd.get_dummies(df['day_of_week'])), axis=1)
    df = pd.concat((df, pd.get_dummies(df['Vendor'])), axis=1)
    print(df.columns)

    y = df['count']
    # NOTE(review): every weekday dummy (and every listed vendor dummy) is used
    # together with a constant. If these cover all categories present, the
    # design matrix is rank-deficient (dummy-variable trap) and individual
    # coefficients are not identifiable — consider dropping one reference
    # level per factor.
    x = df[['Lime', 'Lyft', 'Spin','Trip Distance (miles)','Trip Duration (minutes)', 'Friday', 'Monday', 'Saturday', 'Sunday','Thursday', 'Tuesday', 'Wednesday']]
    x = sm.add_constant(x)
    # get_dummies may produce boolean columns; the GLM needs a numeric matrix.
    x = x.astype(float)

    pm = sm.GLM(y, x, family=sm.families.Poisson()).fit()
    print(pm.summary().as_latex())
    print("Pm params values and coefficients for {}".format(season_type))
    coefficients = pm.params
    p_values = pm.pvalues
    for coef, p_val in zip(coefficients, p_values):
        print(f"{coef}{_significance_stars(p_val)} (p-value: {p_val})")
    # Display coefficient values
    print("Coefficient Values for {}:".format(season_type))
    print(coefficients)
    print("poisson regression's rmse value")
    print(sm.tools.eval_measures.rmse(y, pm.fittedvalues, axis=0))

    pr_predict(x, y, season_type)

In [None]:
# poisson regression prediction
def pr_predict(x, y, weather, test_size=0.2, random_state=42):
    """Evaluate the Poisson GLM on a hold-out split and plot actual vs. predicted.

    The data is split into train and test parts, a Poisson GLM is fitted on
    the training part, and R² and RMSE are printed for both parts using that
    same fitted model. A line plot of actual and predicted test values
    (ordered by row index) is saved to ``escooter_prediction_<weather>.png``.

    Parameters
    ----------
    x : pandas.DataFrame
        Design matrix, used as given (no constant is added here).
    y : pandas.Series
        Response values aligned with ``x``.
    weather : str
        Label inserted into the plot title and the output file name.
    test_size : float, default 0.2
        Fraction of rows held out for testing (80:20 split by default).
    random_state : int or None, default 42
        Seed for the split so repeated runs report the same metrics; pass
        ``None`` for a different random split on every call.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    pm_train = sm.GLM(y_train, x_train, family=sm.families.Poisson()).fit()

    #train r2, rmse
    print("PR train r2 and rmse")
    train_pred = pm_train.predict(x_train)
    print(metrics.r2_score(y_train, train_pred))
    print(np.sqrt(metrics.mean_squared_error(y_train, train_pred)))

    #test r2, rmse
    print("PR test r2 and rmse")
    # Score the held-out rows with the model fitted above, so train and test
    # metrics come from the same estimator rather than a second refit.
    y_pred = pm_train.predict(x_test)

    print("\n********************************")
    print(metrics.r2_score(y_test, y_pred))
    print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

    # Order the test rows by their original index before plotting.
    comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).sort_index()

    fig, ax = plt.subplots()
    ax.plot(comparison['Actual'], c="blue", label="actual", linewidth=2)
    ax.plot(comparison['Predicted'], c="red", label="predicted", linewidth=2)
    ax.legend()
    ax.set_title('Daily Escooter usage  during {}'.format(weather))
    fig.savefig('escooter_prediction_{}.png'.format(weather))
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# Run the Poisson regression report on the Winter subset.
poission_regression(escooter_Winter,"Winter")

\--------------------------------------
\poisson regression with constant
\--------------------------------------
Index(['Vendor', 'Trip Distance (miles)', 'Trip Duration (minutes)',
       'hour_of_day', 'season', 'count', 'day_of_week', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'Lime',
       'Lyft', 'Spin'],
      dtype='object')
\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}          &      count       & \textbf{  No. Observations:  } &     11432    \\
\textbf{Model:}                  &       GLM        & \textbf{  Df Residuals:      } &     11421    \\
\textbf{Model Family:}           &     Poisson      & \textbf{  Df Model:          } &        10    \\
\textbf{Link Function:}          &       Log        & \textbf{  Scale:             } &     1.0000   \\
\textbf{Method:}                 &       IRLS       & \textbf{  Log-Likelihood:    } & -4.1205e+05  \\
\textbf{Date:}                   & Mon, 22 Apr 2024 & \textbf

In [None]:
# (rows, columns) of the Summer subset.
escooter_Summer.shape

(403471, 7)