In [74]:
import pandas as pd
import sklearn as sk
import geopy.distance
from sklearn.cluster import KMeans
import numpy as np

In [3]:
trips_df = pd.read_csv("citibike-trips.csv")

In [4]:
trips_df.head()

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,customer_plan
0,2319,2016-03-09 13:08:21,2016-03-09 13:47:01,520,W 52 St & 5 Ave,40.759923,-73.976485,363,West Thames St,40.708347,-74.017134,23062,Subscriber,1972.0,male,
1,313,2015-07-09 15:42:30,2015-07-09 15:47:44,520,W 52 St & 5 Ave,40.759923,-73.976485,493,W 45 St & 6 Ave,40.7568,-73.982912,16909,Subscriber,1968.0,female,
2,798,2017-04-20 18:43:59,2017-04-20 18:57:17,520,W 52 St & 5 Ave,40.759923,-73.976485,3258,W 27 St & 10 Ave,40.750182,-74.002184,15555,Subscriber,1991.0,male,
3,3100,2017-04-23 15:23:46,2017-04-23 16:15:26,520,W 52 St & 5 Ave,40.759923,-73.976485,281,Grand Army Plaza & Central Park S,40.764397,-73.973715,20648,Customer,,unknown,
4,906,2016-01-11 18:32:33,2016-01-11 18:47:39,520,W 52 St & 5 Ave,40.759923,-73.976485,3162,W 78 St & Broadway,40.7834,-73.980931,15614,Subscriber,1961.0,male,


In [5]:
trips_df["birth_year"].isna().value_counts() ## 55K trips have users with unknown birth year 

False    417581
True      55975
Name: birth_year, dtype: int64

In [6]:
trips_df["gender"].value_counts() ## 56K users with unknown gender 

male       315807
female     100871
unknown     56878
Name: gender, dtype: int64

In [7]:
trips_df["customer_plan"].isna().value_counts() ## idk what this attribute is

True    473556
Name: customer_plan, dtype: int64

## Cleaning the Trips Data
TODO: Transform all columns into numeric data that's useful for classification

In [39]:
# idea for starttime, stoptime, is to see maybe WHEN most rides are initiated

In [67]:
hour_start = []
for i in range(len(trips_df["starttime"])):
    start_hour = trips_df["starttime"].iloc[i][11:13]
    if start_hour[0] == "0":
        hour_start.append(start_hour[1])
    else:
        hour_start.append(start_hour)

In [43]:
trips_df["hour_start"] = hour_start # we've now made an ordinal variable so we can classify the time of day when a trip starts

In [46]:
# idea for start station, end station, is to calculate distance of trip
trip_dist = []

for i in range(len(trips_df)):
    start_long = trips_df["start_station_longitude"].iloc[i]
    start_lat = trips_df["start_station_latitude"].iloc[i]
    end_long = trips_df["end_station_longitude"].iloc[i]
    end_lat = trips_df["end_station_latitude"].iloc[i]
    
    trip_dist.append(geopy.distance.geodesic((start_lat, start_long), (end_lat, end_long)).km) # using kilometers

In [49]:
trips_df["distance_traveled"] = trip_dist

In [55]:
# idea for usertype, one-hot encode whether or not the user is a subscriber
# 1 for yes, 0 for no.
# subscriber means that the user has an annual pass to use Citibike
# customer means that the user purchaed a 24- or 3-hour pass

is_subscriber = []

for i in range(len(trips_df)):
    if trips_df["usertype"].iloc[i] == "Subscriber":
        is_subscriber.append(1)
    else:
        is_subscriber.append(0)

In [56]:
trips_df["is_subscriber"] = is_subscriber

In [60]:
# problem with avg birth year is that there are many null values
# for now, assume that the users w/ no birth year (likely because they aren't subscribers)
# have the avg birth year of the users who do have their birth years recorded

In [59]:
avg_birth_year = trips_df["birth_year"].mean()

In [62]:
trips_df["birth_year"] = trips_df["birth_year"].fillna(avg_birth_year)

In [64]:
# for gender, one-hot encode male/female binary
# 1 for female, 0 for male
# my implementation of this assumes that gender-unknown users are male, though they are likely
# some other gender or have some % of female

is_female = []
for i in range(len(trips_df)):
    if trips_df["gender"].iloc[i] == "female":
        is_female.append(1)
    else:
        is_female.append(0)

In [65]:
trips_df["is_female"] = is_female

In [75]:
trips_df_cleaned = trips_df[["tripduration", "birth_year", "hour_start", "distance_traveled", "is_subscriber", "is_female"]] # just the numerical variables

In [87]:
trips_df_cleaned

Unnamed: 0,tripduration,birth_year,hour_start,distance_traveled,is_subscriber,is_female
0,2319,1972.000000,13,6.677860,1,0
1,313,1968.000000,15,0.643981,1,1
2,798,1991.000000,18,2.424841,1,0
3,3100,1977.377694,15,0.549202,0,0
4,906,1961.000000,18,2.634014,1,0
...,...,...,...,...,...,...
473551,1368,1950.000000,16,1.743683,1,1
473552,1283,1977.377694,12,3.642224,0,0
473553,620,1977.377694,15,1.782790,0,0
473554,1510,1977.377694,15,3.260764,0,0


In [98]:
hour_start_int = pd.to_numeric(trips_df_cleaned["hour_start"])
trips_df_cleaned["hour_start"] = hour_start_int

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trips_df_cleaned["hour_start"] = hour_start_int


## trying some clustering techniques

In [115]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(trips_df_cleaned)

In [116]:
kmeans.cluster_centers_

array([[8.87560044e+02, 1.97737735e+03, 1.39651397e+01, 1.79584482e+00,
        8.79511711e-01, 2.13003443e-01],
       [2.95199254e+05, 1.97974686e+03, 1.41186441e+01, 3.12645310e+00,
        4.74576271e-01, 2.71186441e-01],
       [1.35405686e+06, 1.98093011e+03, 1.50000000e+01, 3.11877386e+00,
        2.85714286e-01, 0.00000000e+00]])