In [1]:
# Initial imports
import pandas as pd
from datetime import datetime
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# Loading data
file_path = Path("202111-citibike-tripdata-cleaned.csv")
new_bikedata_df = pd.read_csv(file_path)
new_bikedata_df.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
0,1,1,2021-11-08 07:34:45,2021-11-08 07:52:14,Franklin Ave & St Marks Ave,Carroll St & Smith St,40.675832,-73.956168,40.680611,-73.994758,member
1,2,1,2021-11-24 10:27:06,2021-11-24 10:34:40,Pleasant Ave & E 120 St,Willis Ave & E 137 St,40.797477,-73.931185,40.808384,-73.923604,member
2,3,1,2021-11-27 18:11:36,2021-11-27 19:19:16,West St & Liberty St,W 45 St & 8 Ave,40.711444,-74.014847,40.759291,-73.988597,member
3,4,1,2021-11-22 14:52:20,2021-11-22 15:30:40,W 67 St & Broadway,E 84 St & 1 Ave,40.774925,-73.982666,40.775655,-73.950686,member
4,5,1,2021-11-10 14:32:56,2021-11-10 14:36:19,W 53 St & 10 Ave,W 45 St & 8 Ave,40.766697,-73.990617,40.759291,-73.988597,member


## START PRE-PROCESSING OF DATA


In [3]:
#dt = '2015-05-13 23:53:00'
#date = dt.split()[0].replace('-','')
#time = dt.split()[1].replace(':','')
#fl = float(date+ '.' + time)

new_bikedata_df['started_at'] = pd.to_datetime(new_bikedata_df['started_at'])
new_bikedata_df['start_date'] = new_bikedata_df['started_at'].dt.date
new_bikedata_df['start_time'] = new_bikedata_df['started_at'].dt.time
new_bikedata_df['ended_at'] = pd.to_datetime(new_bikedata_df['ended_at'])
new_bikedata_df['end_date'] = new_bikedata_df['ended_at'].dt.date
new_bikedata_df['end_time'] = new_bikedata_df['ended_at'].dt.time

new_bikedata_df.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time
0,1,1,2021-11-08 07:34:45,2021-11-08 07:52:14,Franklin Ave & St Marks Ave,Carroll St & Smith St,40.675832,-73.956168,40.680611,-73.994758,member,2021-11-08,07:34:45,2021-11-08,07:52:14
1,2,1,2021-11-24 10:27:06,2021-11-24 10:34:40,Pleasant Ave & E 120 St,Willis Ave & E 137 St,40.797477,-73.931185,40.808384,-73.923604,member,2021-11-24,10:27:06,2021-11-24,10:34:40
2,3,1,2021-11-27 18:11:36,2021-11-27 19:19:16,West St & Liberty St,W 45 St & 8 Ave,40.711444,-74.014847,40.759291,-73.988597,member,2021-11-27,18:11:36,2021-11-27,19:19:16
3,4,1,2021-11-22 14:52:20,2021-11-22 15:30:40,W 67 St & Broadway,E 84 St & 1 Ave,40.774925,-73.982666,40.775655,-73.950686,member,2021-11-22,14:52:20,2021-11-22,15:30:40
4,5,1,2021-11-10 14:32:56,2021-11-10 14:36:19,W 53 St & 10 Ave,W 45 St & 8 Ave,40.766697,-73.990617,40.759291,-73.988597,member,2021-11-10,14:32:56,2021-11-10,14:36:19


In [4]:
new_bikedata_df.drop(["started_at", "ended_at", "start_station_name", "end_station_name"], inplace=True, axis=1)
new_bikedata_df.head()


Unnamed: 0,ride_id,rideable_type,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time
0,1,1,40.675832,-73.956168,40.680611,-73.994758,member,2021-11-08,07:34:45,2021-11-08,07:52:14
1,2,1,40.797477,-73.931185,40.808384,-73.923604,member,2021-11-24,10:27:06,2021-11-24,10:34:40
2,3,1,40.711444,-74.014847,40.759291,-73.988597,member,2021-11-27,18:11:36,2021-11-27,19:19:16
3,4,1,40.774925,-73.982666,40.775655,-73.950686,member,2021-11-22,14:52:20,2021-11-22,15:30:40
4,5,1,40.766697,-73.990617,40.759291,-73.988597,member,2021-11-10,14:32:56,2021-11-10,14:36:19


In [5]:
new_bikedata_df.dtypes


ride_id            int64
rideable_type      int64
start_lat        float64
start_lng        float64
end_lat          float64
end_lng          float64
member_casual     object
start_date        object
start_time        object
end_date          object
end_time          object
dtype: object

In [6]:
# CHANGE THE DATE FORMAT FOR start_date
new_bikedata_df['start_date'] = new_bikedata_df['start_date'].apply(lambda x: pd.Timestamp(x).strftime('%Y%m%d'))
new_bikedata_df.head()


Unnamed: 0,ride_id,rideable_type,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time
0,1,1,40.675832,-73.956168,40.680611,-73.994758,member,20211108,07:34:45,2021-11-08,07:52:14
1,2,1,40.797477,-73.931185,40.808384,-73.923604,member,20211124,10:27:06,2021-11-24,10:34:40
2,3,1,40.711444,-74.014847,40.759291,-73.988597,member,20211127,18:11:36,2021-11-27,19:19:16
3,4,1,40.774925,-73.982666,40.775655,-73.950686,member,20211122,14:52:20,2021-11-22,15:30:40
4,5,1,40.766697,-73.990617,40.759291,-73.988597,member,20211110,14:32:56,2021-11-10,14:36:19


In [7]:
# CHANGE THE DATE FORMAT FOR start_date
new_bikedata_df['end_date'] = new_bikedata_df['end_date'].apply(lambda x: pd.Timestamp(x).strftime('%Y%m%d'))
new_bikedata_df.head()


Unnamed: 0,ride_id,rideable_type,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time
0,1,1,40.675832,-73.956168,40.680611,-73.994758,member,20211108,07:34:45,20211108,07:52:14
1,2,1,40.797477,-73.931185,40.808384,-73.923604,member,20211124,10:27:06,20211124,10:34:40
2,3,1,40.711444,-74.014847,40.759291,-73.988597,member,20211127,18:11:36,20211127,19:19:16
3,4,1,40.774925,-73.982666,40.775655,-73.950686,member,20211122,14:52:20,20211122,15:30:40
4,5,1,40.766697,-73.990617,40.759291,-73.988597,member,20211110,14:32:56,20211110,14:36:19


In [8]:
# TRANSFORM STRING COLUMN
#def change_string(member_casual):
    #if member_casual == "member":
        #return 1
    #else:
        #return 0
    
#new_bikedata_df["member_casual"] = new_bikedata_df["member_casual"].apply(change_string)
#new_bikedata_df.head(20)


In [9]:
new_bikedata_df['start_time'] = new_bikedata_df['start_time'].astype(str).str.replace(':', ' ')
new_bikedata_df


Unnamed: 0,ride_id,rideable_type,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time
0,1,1,40.675832,-73.956168,40.680611,-73.994758,member,20211108,07 34 45,20211108,07:52:14
1,2,1,40.797477,-73.931185,40.808384,-73.923604,member,20211124,10 27 06,20211124,10:34:40
2,3,1,40.711444,-74.014847,40.759291,-73.988597,member,20211127,18 11 36,20211127,19:19:16
3,4,1,40.774925,-73.982666,40.775655,-73.950686,member,20211122,14 52 20,20211122,15:30:40
4,5,1,40.766697,-73.990617,40.759291,-73.988597,member,20211110,14 32 56,20211110,14:36:19
...,...,...,...,...,...,...,...,...,...,...,...
2151489,-11198,0,40.771361,-73.924615,40.776700,-73.927631,member,20211121,11 59 01,20211121,12:02:54
2151490,-11197,0,40.825125,-73.941616,40.808442,-73.945209,member,20211112,09 29 41,20211112,09:40:20
2151491,-11196,0,40.714852,-74.011223,40.754557,-73.965930,member,20211118,17 36 02,20211118,18:10:08
2151492,-11195,1,40.821111,-73.935971,40.808442,-73.945209,casual,20211113,12 06 53,20211113,12:14:42


In [10]:
new_bikedata_df['end_time'] = new_bikedata_df['end_time'].astype(str).str.replace(':', ' ')
new_bikedata_df


Unnamed: 0,ride_id,rideable_type,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time
0,1,1,40.675832,-73.956168,40.680611,-73.994758,member,20211108,07 34 45,20211108,07 52 14
1,2,1,40.797477,-73.931185,40.808384,-73.923604,member,20211124,10 27 06,20211124,10 34 40
2,3,1,40.711444,-74.014847,40.759291,-73.988597,member,20211127,18 11 36,20211127,19 19 16
3,4,1,40.774925,-73.982666,40.775655,-73.950686,member,20211122,14 52 20,20211122,15 30 40
4,5,1,40.766697,-73.990617,40.759291,-73.988597,member,20211110,14 32 56,20211110,14 36 19
...,...,...,...,...,...,...,...,...,...,...,...
2151489,-11198,0,40.771361,-73.924615,40.776700,-73.927631,member,20211121,11 59 01,20211121,12 02 54
2151490,-11197,0,40.825125,-73.941616,40.808442,-73.945209,member,20211112,09 29 41,20211112,09 40 20
2151491,-11196,0,40.714852,-74.011223,40.754557,-73.965930,member,20211118,17 36 02,20211118,18 10 08
2151492,-11195,1,40.821111,-73.935971,40.808442,-73.945209,casual,20211113,12 06 53,20211113,12 14 42


In [11]:
new_bikedata_df[['start_hours', 'start_minutes', 'start_seconds']] = new_bikedata_df['start_time'].str.split(' ', expand=True)
new_bikedata_df[['end_hours', 'end_minutes', 'end_seconds']] = new_bikedata_df['end_time'].str.split(' ', expand=True)


In [12]:
new_bikedata_df.drop(["start_time", "end_time"], inplace=True, axis=1)
new_bikedata_df.head()


Unnamed: 0,ride_id,rideable_type,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,end_date,start_hours,start_minutes,start_seconds,end_hours,end_minutes,end_seconds
0,1,1,40.675832,-73.956168,40.680611,-73.994758,member,20211108,20211108,7,34,45,7,52,14
1,2,1,40.797477,-73.931185,40.808384,-73.923604,member,20211124,20211124,10,27,6,10,34,40
2,3,1,40.711444,-74.014847,40.759291,-73.988597,member,20211127,20211127,18,11,36,19,19,16
3,4,1,40.774925,-73.982666,40.775655,-73.950686,member,20211122,20211122,14,52,20,15,30,40
4,5,1,40.766697,-73.990617,40.759291,-73.988597,member,20211110,20211110,14,32,56,14,36,19


## START ML MODEL PROCESS


In [14]:
# SPLIT THE DATA INTO TRAINING AND TESTING SETS
# CREATE OUR FEATURES
X = new_bikedata_df.drop("member_casual", axis=1)
X = pd.get_dummies(X)

# CREATE OUR TARGET
y = new_bikedata_df["rideable_type"]


In [15]:
X.describe()


Unnamed: 0,ride_id,rideable_type,start_lat,start_lng,end_lat,end_lng,start_date_20211101,start_date_20211102,start_date_20211103,start_date_20211104,...,end_seconds_50,end_seconds_51,end_seconds_52,end_seconds_53,end_seconds_54,end_seconds_55,end_seconds_56,end_seconds_57,end_seconds_58,end_seconds_59
count,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,...,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0,2151494.0
mean,28.61541,0.3345591,40.74078,-73.97591,40.7406,-73.97606,0.04154323,0.03835102,0.04038078,0.03973425,...,0.01672233,0.01663216,0.01659963,0.01676788,0.01657825,0.01662426,0.01681204,0.01667957,0.01665029,0.01675394
std,18962.02,0.4718362,0.03750914,0.02439358,0.03736021,0.02438703,0.199543,0.1920423,0.1968507,0.1953342,...,0.1282291,0.1278888,0.1277658,0.1284007,0.1276848,0.1278589,0.1285667,0.1280678,0.1279573,0.1283482
min,-32768.0,0.0,40.60402,-74.03571,40.60402,-74.06762,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-16469.0,0.0,40.7174,-73.99301,40.7174,-73.99316,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,169.0,0.0,40.74026,-73.98169,40.73989,-73.98169,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16468.0,1.0,40.76313,-73.96088,40.7627,-73.96088,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,32767.0,1.0,40.88226,-73.88145,40.88226,-73.88145,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
y.value_counts()


0    1431692
1     719802
Name: rideable_type, dtype: int64

In [19]:
from collections import Counter
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

print(Counter(y_train))
print(Counter(y_test))


Counter({0: 1073769, 1: 539851})
Counter({0: 357923, 1: 179951})


In [20]:
# NAIVE RANDOM OVERSAMPLING
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)


Counter({1: 1073769, 0: 1073769})

In [21]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)


LogisticRegression(random_state=1)

In [22]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score


1.0

In [27]:
# Display the confusion matrix

from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_test, y_pred)
cm1_df = pd.DataFrame(cm1, index=["ELECTRIC BIKE", "CLASSIC BIKE"], columns=["MEMBER", "CASUAL"])
cm1_df


Unnamed: 0,MEMBER,CASUAL
ELECTRIC BIKE,357923,0
CLASSIC BIKE,0,179951


In [28]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))


                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00    357923
          1       1.00      1.00      1.00      1.00      1.00      1.00    179951

avg / total       1.00      1.00      1.00      1.00      1.00      1.00    537874

