In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
path = 'uber.csv'
fares = pd.read_csv(path)

### Cleaning the Data

In [3]:
fares.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [4]:
fares.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


There are 200,000 entries, which is a large amount. We can begin cleaning by dropping all rows with null values. As you can see above this is only one row. I will also drop the "Unnamed" column because it contains a random number that isn't associated with the data

In [5]:
fares = fares.dropna()
fares = fares.drop('Unnamed: 0', axis = 1)

By looking at the distribution of the data we can see that some columns contain unattainable values, for instance a passenger count of 208 passengers. It is unclear where these values come from, but because we have so many entries we can simply remove abnormal values without worrying about the row count getting too low. I first cleaned the passenger count by removing values that were too high and values that were zero. 

In [6]:
fares.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0
mean,11.359892,-72.527631,39.935881,-72.525292,39.92389,1.684543
std,9.90176,11.437815,7.720558,13.117408,6.794829,1.385995
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [7]:
fares = fares.sort_values('passenger_count', ascending = False)
fares.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
113038,2010-12-28 08:20:00.0000001,11.7,2010-12-28 08:20:00 UTC,-73.937795,40.758498,-73.937835,40.758415,208
64289,2014-07-11 21:28:00.000000224,26.5,2014-07-11 21:28:00 UTC,-73.957727,40.773897,-73.917823,40.870972,6
32804,2012-12-13 11:13:00.000000192,19.5,2012-12-13 11:13:00 UTC,0.0,0.0,0.0,0.0,6
90775,2014-07-15 01:08:00.00000054,7.5,2014-07-15 01:08:00 UTC,-73.988027,40.743465,-73.973127,40.759802,6
108923,2014-07-29 12:03:00.000000106,11.5,2014-07-29 12:03:00 UTC,-73.96446,40.760325,-73.987433,40.738952,6


In [8]:
fares = fares.drop(113038, axis = 0)

In [9]:
fares.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199998.0,199998.0,199998.0,199998.0,199998.0,199998.0
mean,11.35989,-72.527624,39.935877,-72.525285,39.923886,1.683512
std,9.901785,11.437844,7.720578,13.11744,6.794846,1.306965
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734795,-73.991407,40.733823,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967154,40.767158,-73.963659,40.768001,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,6.0


In [10]:
fares = fares.sort_values('passenger_count', ascending = True)

In [11]:
fares.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
159475,2011-05-17 08:51:12.0000001,8.5,2011-05-17 08:51:12 UTC,-73.999757,40.74341,-74.004048,40.715975,0
196039,2012-01-01 04:07:16.0000002,3.3,2012-01-01 04:07:16 UTC,-73.9867,40.7369,-73.9821,40.7401,0
125348,2011-06-05 14:38:51.0000001,9.3,2011-06-05 14:38:51 UTC,-73.9961,40.7253,-73.9745,40.7367,0
8016,2011-07-31 00:04:10.0000002,4.5,2011-07-31 00:04:10 UTC,-73.9935,40.7571,-73.9835,40.7499,0
189220,2011-07-04 16:32:16.0000002,7.7,2011-07-04 16:32:16 UTC,-73.9986,40.7551,-73.972,40.7579,0


In [12]:
fares = fares.drop(fares.index[fares['passenger_count'] == 0], axis = 0)

In [13]:
fares.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199290.0,199290.0,199290.0,199290.0,199290.0,199290.0
mean,11.366713,-72.529899,39.937068,-72.527187,39.924829,1.689493
std,9.910588,11.434766,7.723776,13.121348,6.795558,1.30542
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,1.0
25%,6.0,-73.992065,40.734796,-73.99141,40.733826,1.0
50%,8.5,-73.981825,40.752585,-73.980095,40.753041,1.0
75%,12.5,-73.967158,40.767157,-73.963663,40.767995,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,6.0


Now we can inspect the fare amount column. In this column there were many values that seemed to be too low. I decided that any value less than \\$2.50 should probably be removed because it seems to low to represent an actual fare. There were also a number of negative values that I had to remove. Although these low and negative values might have been the result of a coupon being used or refund ebing issued, nothing else in the dataset denotes that. Therefore it is not beneficial to include these values in our model. There were also some very high values above the $300 mark, however I decided to include these because it could have been a very long drive in a more expensive uber vehicle. More can be learned about this scenario by looking at the pickup and dropoff locations. 

In [14]:
fares = fares.sort_values('fare_amount', ascending = True)

In [15]:
fares.head(50)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
98875,2015-03-20 21:31:14.0000007,-52.0,2015-03-20 21:31:14 UTC,-74.000359,40.728729,-74.005699,40.72868,5
111589,2015-02-23 19:26:44.0000004,-52.0,2015-02-23 19:26:44 UTC,-73.984665,40.759026,0.0,0.0,5
164056,2010-03-16 15:27:10.0000003,-50.5,2010-03-16 15:27:10 UTC,-73.784868,40.648677,-73.976975,40.763522,3
89322,2010-03-09 08:26:10.0000006,-49.57,2010-03-09 08:26:10 UTC,-73.972772,40.785657,-73.972867,40.7855,5
92063,2010-03-12 08:49:10.0000002,-23.7,2010-03-12 08:49:10 UTC,-73.95274,40.768233,-74.007028,40.707338,3
151681,2010-02-17 05:46:10.0000001,-10.9,2010-02-17 05:46:10 UTC,-73.964257,40.76063,-73.994222,40.761533,1
104080,2010-03-09 20:25:10.0000002,-7.3,2010-03-09 20:25:10 UTC,-73.952623,40.766942,-73.953787,40.784882,1
139272,2010-02-23 18:52:10.0000003,-6.9,2010-02-23 18:52:10 UTC,-73.988727,40.736445,-73.977622,40.750487,1
148803,2010-02-12 10:50:10.0000005,-5.7,2010-02-12 10:50:10 UTC,-73.9773,40.742783,-73.988837,40.747465,3
190925,2015-05-01 23:25:34.0000002,-5.5,2015-05-01 23:25:34 UTC,-73.988701,40.742649,-73.985924,40.735001,1


In [16]:
fares = fares.drop(fares.index[fares['fare_amount'] < 2.50], axis = 0)

In [17]:
fares.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
139122,2012-05-18 12:18:00.00000055,2.5,2012-05-18 12:18:00 UTC,-73.97304,40.762425,-73.972348,40.762713,3
183402,2010-09-17 00:48:55.0000001,2.5,2010-09-17 00:48:55 UTC,-73.991858,40.735258,-73.97877,40.73675,1
56914,2013-07-26 23:39:32.0000004,2.5,2013-07-26 23:39:32 UTC,-74.000152,40.730641,-73.983829,40.72782,1
68405,2011-10-14 15:49:00.000000138,2.5,2011-10-14 15:49:00 UTC,0.0,0.0,0.0,0.0,1
43621,2009-07-17 08:02:32.0000004,2.5,2009-07-17 08:02:32 UTC,-73.953634,40.784856,-74.011458,40.702466,1


In [18]:
fares.tail()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
197493,2014-09-07 08:39:00.00000012,230.0,2014-09-07 08:39:00 UTC,-73.937765,40.758267,-74.3822,40.70089,2
71715,2013-05-11 00:31:00.00000084,250.0,2013-05-11 00:31:00 UTC,0.0,0.0,0.0,0.0,1
185325,2013-04-19 00:00:00.000000235,275.0,2013-04-19 00:00:00 UTC,0.0,0.0,0.0,0.0,1
4292,2014-10-24 20:20:00.000000181,350.0,2014-10-24 20:20:00 UTC,0.0,0.0,0.0,0.0,3
170081,2011-04-10 04:10:00.00000064,499.0,2011-04-10 04:10:00 UTC,-73.968377,40.764602,-73.968368,40.7646,1


Now we can look at the columns regarding pickup and dropoff location. The longitudes should be inbetween -180 and 180, and the latitudes should be inbetween -90 and 90. I made sure to drop all rows that do not fit into these boundaries. I also drop all rows where the pickup and dropoff longitude is the same if the pickup and dropoff latitude is also the same. 

In [19]:
fares = fares.sort_values('pickup_longitude', ascending = True)

In [20]:
fares = fares.drop(fares.index[fares['pickup_longitude'] < -180], axis = 0)
fares = fares.drop(fares.index[fares['pickup_longitude'] > 180], axis = 0)
fares = fares.drop(fares.index[fares['dropoff_longitude'] < -180], axis = 0)
fares = fares.drop(fares.index[fares['dropoff_longitude'] > 180], axis = 0)

fares = fares.drop(fares.index[fares['pickup_latitude'] < -90], axis = 0)
fares = fares.drop(fares.index[fares['pickup_latitude'] > 90], axis = 0)
fares = fares.drop(fares.index[fares['dropoff_latitude'] < -90], axis = 0)
fares = fares.drop(fares.index[fares['dropoff_latitude'] > 90], axis = 0)

In [21]:
fares = fares.drop(fares[((fares['pickup_longitude'] == fares['dropoff_longitude'])
                          & (fares['pickup_latitude'] == fares['dropoff_latitude']))].index)

In [22]:
fares.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,193641.0,193641.0,193641.0,193641.0,193641.0,193641.0
mean,11.363358,-73.827243,40.647211,-73.837775,40.651648,1.689792
std,9.757198,3.654969,2.931863,3.534136,2.901711,1.3056
min,2.5,-93.824668,-74.015515,-75.458979,-74.01575,1.0
25%,6.0,-73.99227,40.736345,-73.991592,40.735215,1.0
50%,8.5,-73.982115,40.753241,-73.980538,40.753704,1.0
75%,12.5,-73.968395,40.767505,-73.96541,40.768307,2.0
max,499.0,40.808425,48.01876,40.831932,45.031598,6.0


In [23]:
fares = fares.sort_values('fare_amount', ascending = False)

We can also drop the key column because it contains the same information that is included in the datetime column. Its duplicate data.

In [24]:
fares = fares.drop('key', axis = 1)

In order to allow this model to be a classifier, I'm going to fit its target values into buckets: cheap, affordable, expensive, and luxurious. Then I'll use an ordinal encoder to encode this as an integer. 

In [25]:
fares['Price_Range'] = fares['fare_amount']
fares.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Price_Range
170081,499.0,2011-04-10 04:10:00 UTC,-73.968377,40.764602,-73.968368,40.7646,1,499.0
197493,230.0,2014-09-07 08:39:00 UTC,-73.937765,40.758267,-74.3822,40.70089,2,230.0
29261,220.0,2013-09-27 12:36:11 UTC,-73.801147,40.671653,-73.790402,40.646742,1,220.0
23682,206.38,2014-11-02 22:40:20 UTC,-73.782161,40.644601,-74.449079,40.836457,1,206.38
184901,196.0,2009-09-17 09:38:03 UTC,-74.39731,40.557112,-74.397303,40.557102,1,196.0


In [26]:
def custom_transformer():
    custom_labels = ['cheap', 'affordable', 'expensive', 'luxurious']
    custom_bins = [0, 10, 30, 75, 500]
    fares['Price_Range'] = pd.cut(fares['Price_Range'], bins=custom_bins, labels=custom_labels)
    
custom_transformer()

In [27]:
fares.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Price_Range
170081,499.0,2011-04-10 04:10:00 UTC,-73.968377,40.764602,-73.968368,40.7646,1,luxurious
197493,230.0,2014-09-07 08:39:00 UTC,-73.937765,40.758267,-74.3822,40.70089,2,luxurious
29261,220.0,2013-09-27 12:36:11 UTC,-73.801147,40.671653,-73.790402,40.646742,1,luxurious
23682,206.38,2014-11-02 22:40:20 UTC,-73.782161,40.644601,-74.449079,40.836457,1,luxurious
184901,196.0,2009-09-17 09:38:03 UTC,-74.39731,40.557112,-74.397303,40.557102,1,luxurious


In [28]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories = [['cheap', 'affordable', 'expensive', 'luxurious']])
price_encoded = ordinal_encoder.fit_transform(fares[['Price_Range']])
price_encoded[:10]

array([[3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.]])

In [29]:
ordinal_encoder.categories_

[array(['cheap', 'affordable', 'expensive', 'luxurious'], dtype=object)]

In [30]:
fares['price_encoded'] = price_encoded

In [31]:
fares.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Price_Range,price_encoded
170081,499.0,2011-04-10 04:10:00 UTC,-73.968377,40.764602,-73.968368,40.7646,1,luxurious,3.0
197493,230.0,2014-09-07 08:39:00 UTC,-73.937765,40.758267,-74.3822,40.70089,2,luxurious,3.0
29261,220.0,2013-09-27 12:36:11 UTC,-73.801147,40.671653,-73.790402,40.646742,1,luxurious,3.0
23682,206.38,2014-11-02 22:40:20 UTC,-73.782161,40.644601,-74.449079,40.836457,1,luxurious,3.0
184901,196.0,2009-09-17 09:38:03 UTC,-74.39731,40.557112,-74.397303,40.557102,1,luxurious,3.0


My model requires all my data to be numerical. This means I have to somehow change the pickup_datetime column to be numerical. I will parse out the year, month, date, and time into separate columns. This fixes all the columns except time, which would still be an object datatype. To fix this I parse out the colon and then fit the data into buckets: Late Night, Early Morning, Morning, Afternoon, Evening, Night. Then I ordinal encode these categories to a numerical value and drop the original datetime column. I also drop the other categorical columns that have been encoded.

In [32]:
fares['year'] = fares['pickup_datetime'].str[:4].astype(int)
fares['month'] = fares['pickup_datetime'].str[5:7].astype(int)
fares['day'] = fares['pickup_datetime'].str[8:10].astype(int)
fares['time'] = fares['pickup_datetime'].str[11:16]
fares['time'] = [c.replace(':', '') for c in fares['time']]
fares['time'] = fares['time'].astype(int)
fares['new_time'] = pd.cut(x=fares['time'], bins=[0, 400, 800, 1200, 1600, 2000, 2359],
                    labels=['Late Night', 'Early Morning', 'Morning',
                            'Afternoon', 'Evening', 'Night']) 
fares = fares.dropna()

In [33]:
fares.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Price_Range,price_encoded,year,month,day,time,new_time
170081,499.0,2011-04-10 04:10:00 UTC,-73.968377,40.764602,-73.968368,40.7646,1,luxurious,3.0,2011,4,10,410,Early Morning
197493,230.0,2014-09-07 08:39:00 UTC,-73.937765,40.758267,-74.3822,40.70089,2,luxurious,3.0,2014,9,7,839,Morning
29261,220.0,2013-09-27 12:36:11 UTC,-73.801147,40.671653,-73.790402,40.646742,1,luxurious,3.0,2013,9,27,1236,Afternoon
23682,206.38,2014-11-02 22:40:20 UTC,-73.782161,40.644601,-74.449079,40.836457,1,luxurious,3.0,2014,11,2,2240,Night
184901,196.0,2009-09-17 09:38:03 UTC,-74.39731,40.557112,-74.397303,40.557102,1,luxurious,3.0,2009,9,17,938,Morning


In [34]:
ordinal_encoder = OrdinalEncoder(categories = [['Late Night', 'Early Morning', 'Morning',
                            'Afternoon', 'Evening', 'Night']])
time_encoded = ordinal_encoder.fit_transform(fares[['new_time']])
time_encoded[:10]

array([[1.],
       [2.],
       [3.],
       [5.],
       [2.],
       [5.],
       [5.],
       [2.],
       [5.],
       [4.]])

In [35]:
fares['time'] = fares['new_time']
fares['new_time'] = time_encoded
fares.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,Price_Range,price_encoded,year,month,day,time,new_time
170081,499.0,2011-04-10 04:10:00 UTC,-73.968377,40.764602,-73.968368,40.7646,1,luxurious,3.0,2011,4,10,Early Morning,1.0
197493,230.0,2014-09-07 08:39:00 UTC,-73.937765,40.758267,-74.3822,40.70089,2,luxurious,3.0,2014,9,7,Morning,2.0
29261,220.0,2013-09-27 12:36:11 UTC,-73.801147,40.671653,-73.790402,40.646742,1,luxurious,3.0,2013,9,27,Afternoon,3.0
23682,206.38,2014-11-02 22:40:20 UTC,-73.782161,40.644601,-74.449079,40.836457,1,luxurious,3.0,2014,11,2,Night,5.0
184901,196.0,2009-09-17 09:38:03 UTC,-74.39731,40.557112,-74.397303,40.557102,1,luxurious,3.0,2009,9,17,Morning,2.0


In [36]:
fares = fares.drop('pickup_datetime', axis = 1)

In [37]:
fares.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193488 entries, 170081 to 185171
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   fare_amount        193488 non-null  float64 
 1   pickup_longitude   193488 non-null  float64 
 2   pickup_latitude    193488 non-null  float64 
 3   dropoff_longitude  193488 non-null  float64 
 4   dropoff_latitude   193488 non-null  float64 
 5   passenger_count    193488 non-null  int64   
 6   Price_Range        193488 non-null  category
 7   price_encoded      193488 non-null  float64 
 8   year               193488 non-null  int64   
 9   month              193488 non-null  int64   
 10  day                193488 non-null  int64   
 11  time               193488 non-null  category
 12  new_time           193488 non-null  float64 
dtypes: category(2), float64(7), int64(4)
memory usage: 18.1 MB


In [38]:
fare_data = fares.copy()
fare_data = fare_data.drop('Price_Range', axis = 1)
fare_data = fare_data.drop('fare_amount', axis = 1)
fare_data = fare_data.drop('time', axis = 1)

In [39]:
fare_data.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,price_encoded,year,month,day,new_time
170081,-73.968377,40.764602,-73.968368,40.7646,1,3.0,2011,4,10,1.0
197493,-73.937765,40.758267,-74.3822,40.70089,2,3.0,2014,9,7,2.0
29261,-73.801147,40.671653,-73.790402,40.646742,1,3.0,2013,9,27,3.0
23682,-73.782161,40.644601,-74.449079,40.836457,1,3.0,2014,11,2,5.0
184901,-74.39731,40.557112,-74.397303,40.557102,1,3.0,2009,9,17,2.0


Now the data is ready to be used in the model. I separate the attributes used for prediction and the target values. Then I standard scale the attributes used for prediction. Then I fit a stochastic gradient descent model to the data. Finally I use stratified k-fold cross validation with 10 folds to test my models predictions. 

In [40]:
fare_y = fare_data['price_encoded']
fare_X = fare_data.drop('price_encoded', axis = 1)

In [41]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(fare_X),columns = fare_X.columns)

In [42]:
df_scaled.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,new_time
0,-0.038631,0.040051,-0.036968,0.038936,-0.52833,-0.401843,-0.663546,-0.656611,-1.245958
1,-0.030259,0.037891,-0.154017,0.016989,0.237508,1.208692,0.789867,-1.001944,-0.619917
2,0.007105,0.008361,0.013369,-0.001665,-0.52833,0.671847,0.789867,1.300275,0.006125
3,0.012297,-0.000863,-0.172934,0.06369,-0.52833,1.208692,1.371233,-1.577498,1.258208
4,-0.155941,-0.030692,-0.158289,-0.032545,-0.52833,-1.475533,0.789867,0.149166,-0.619917


In [43]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(df_scaled, fare_y)

SGDClassifier()

In [44]:
fare_data.shape, fare_y.shape

((193488, 10), (193488,))

In [45]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
skf.get_n_splits(fare_data,fare_y)

accuracy = []

  
for train_index, test_index in skf.split(df_scaled, fare_y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = df_scaled.iloc[train_index], df_scaled.iloc[test_index]
    y_train, y_test = fare_y.iloc[train_index], fare_y.iloc[test_index]
    
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_test)
    score = accuracy_score(prediction, y_test)
    accuracy.append(score)
    
print(accuracy)

TRAIN: [     0      1      2 ... 193485 193486 193487] TEST: [    10     28     33 ... 193416 193417 193460]
TRAIN: [     1      2      3 ... 193485 193486 193487] TEST: [     0      6     16 ... 193458 193478 193483]
TRAIN: [     0      1      2 ... 193485 193486 193487] TEST: [     3     18     37 ... 193473 193475 193477]
TRAIN: [     0      1      2 ... 193484 193486 193487] TEST: [     9     34     35 ... 193466 193480 193485]
TRAIN: [     0      1      3 ... 193484 193485 193487] TEST: [     2      8     11 ... 193481 193482 193486]
TRAIN: [     0      1      2 ... 193485 193486 193487] TEST: [     7     48     52 ... 193443 193464 193469]
TRAIN: [     0      1      2 ... 193485 193486 193487] TEST: [    14     20     25 ... 193467 193472 193479]
TRAIN: [     0      2      3 ... 193485 193486 193487] TEST: [     1      5     15 ... 193454 193455 193471]
TRAIN: [     0      1      2 ... 193485 193486 193487] TEST: [     4     12     42 ... 193450 193456 193474]
TRAIN: [     0     

In [46]:
np.array(accuracy).mean()

0.6233409832405774

We get an accuracy of 62% which is pretty good. A model that classifies each entry randomly would have an approximate accuracy of 25% because there are four possible labels. This model performs 37% better than a random model. 