In [1]:
import pandas as pd
import pickle

In [2]:
%%time
july = pd.read_csv('../data/july.csv')
july.head()

Wall time: 1min 9s


Unnamed: 0,pubdatetime,latitude,longitude,sumdid,sumdtype,chargelevel,sumdgroup,costpermin,companyname
0,2019-07-01 00:00:33.550000,36.156678,-86.809004,Powered635135,Powered,22.0,scooter,0.15,Lyft
1,2019-07-01 00:00:34.973000,36.145674,-86.794138,Powered790946,Powered,33.0,scooter,0.15,Lyft
2,2019-07-01 00:00:41.183000,36.179319,-86.751538,Powered570380,Powered,76.0,scooter,0.15,Lyft
3,2019-07-01 00:00:41.620000,36.152111,-86.803821,Powered240631,Powered,43.0,scooter,0.15,Lyft
4,2019-07-01 00:00:45.087000,36.149355,-86.79755,Powered970404,Powered,52.0,scooter,0.15,Lyft


In [3]:
# Print the column dtypes and memory footprint of the frame loaded from the CSV.
july.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25075445 entries, 0 to 25075444
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   pubdatetime  object 
 1   latitude     float64
 2   longitude    float64
 3   sumdid       object 
 4   sumdtype     object 
 5   chargelevel  float64
 6   sumdgroup    object 
 7   costpermin   float64
 8   companyname  object 
dtypes: float64(4), object(5)
memory usage: 1.7+ GB


#### convert the company name to an integer 
- find the unique company names
- assign each company an integer (you can use a dictionary for this step)
- update the `companyname` column to store the integer id for each company

In [4]:
# List the distinct company names that need an integer id.
july['companyname'].unique()

array(['Lyft', 'Bird', 'Spin', 'Bolt', 'Jump', 'Lime', 'Gotcha'],
      dtype=object)

In [5]:
# Lookup table: company name -> integer id stored in the `companyname` column.
company_dict = {
    'Bird': 0,
    'Lyft': 1,
    'Gotcha': 2,
    'Lime': 3,
    'Spin': 4,
    'Jump': 5,
    'Bolt': 6,
}

In [6]:
# Swap each company name for its integer id from `company_dict`.
#
# `Series.map` is used instead of `Series.replace`: with `replace`, the integer
# dtype only appears because pandas silently downcasts the object column
# (behaviour deprecated in pandas 2.2), and any name missing from the dict is
# silently left in place. Here an unmapped name raises, and the cast to int64
# is explicit.
#
# The dtype guard keeps the cell safe to re-run: once the column already holds
# integers there is nothing left to map.
if not pd.api.types.is_integer_dtype(july['companyname']):
    company_ids = july['companyname'].map(company_dict)
    unmapped = july.loc[company_ids.isna(), 'companyname'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"companyname values missing from company_dict: {list(unmapped)}"
        )
    july['companyname'] = company_ids.astype('int64')

#### next convert `pubdatetime` to a datetime 

In [7]:
# Parse the timestamp strings into datetime64 values, then show two rows
# to check the result.
july['pubdatetime'] = pd.to_datetime(july['pubdatetime'])
july.head(n=2)

Unnamed: 0,pubdatetime,latitude,longitude,sumdid,sumdtype,chargelevel,sumdgroup,costpermin,companyname
0,2019-07-01 00:00:33.550,36.156678,-86.809004,Powered635135,Powered,22.0,scooter,0.15,1
1,2019-07-01 00:00:34.973,36.145674,-86.794138,Powered790946,Powered,33.0,scooter,0.15,1


#### Next remove unneeded data
#### keep just the scooters

In [8]:
# List the distinct device groups present in the data.
july['sumdgroup'].unique()

array(['scooter', 'Scooter', 'bicycle'], dtype=object)

In [9]:
# Keep only the scooter rows; the group label appears in the data with both
# a lowercase and a capitalised spelling.
july_scooters = july[july['sumdgroup'].isin(['scooter', 'Scooter'])]

#### keep just the columns we want to work with

In [10]:
# Narrow the scooter frame to the columns used from here on.
july_scooters = july_scooters.loc[
    :,
    [
        'pubdatetime',
        'latitude',
        'longitude',
        'sumdid',
        'chargelevel',
        'companyname',
    ],
]

#### check `.info()` again

In [11]:
# Re-check dtypes and memory use after filtering to scooters and selecting columns.
july_scooters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25066524 entries, 0 to 25075444
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   pubdatetime  datetime64[ns]
 1   latitude     float64       
 2   longitude    float64       
 3   sumdid       object        
 4   chargelevel  float64       
 5   companyname  int64         
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 1.3+ GB


#### The only object datatype remaining is `sumdid` (an alphanumeric unique identifier)
- time to pickle: save the cleaned DataFrame so it can be reloaded without re-parsing the CSV

In [12]:
# Persist the cleaned scooter frame as a pickle in the data directory.
july_scooters.to_pickle(path="../data/july.pkl")

In [13]:
%%time
july = pd.read_pickle("../data/july.pkl")

Wall time: 3.84 s


In [14]:
# Preview the first 30 rows of the reloaded frame.
july.head(n=30)

Unnamed: 0,pubdatetime,latitude,longitude,sumdid,chargelevel,companyname
0,2019-07-01 00:00:33.550,36.156678,-86.809004,Powered635135,22.0,1
1,2019-07-01 00:00:34.973,36.145674,-86.794138,Powered790946,33.0,1
2,2019-07-01 00:00:41.183,36.179319,-86.751538,Powered570380,76.0,1
3,2019-07-01 00:00:41.620,36.152111,-86.803821,Powered240631,43.0,1
4,2019-07-01 00:00:45.087,36.149355,-86.79755,Powered970404,52.0,1
5,2019-07-01 00:00:46.847,36.168153,-86.777374,Powered912425,80.0,1
6,2019-07-01 00:00:50.500,36.143266,-86.809886,Powered667919,60.0,1
7,2019-07-01 00:00:50.520,36.177355,-86.750667,Powered523737,85.0,1
8,2019-07-01 00:00:53.273,36.158546,-86.774062,Powered689152,72.0,1
9,2019-07-01 00:00:56.293,36.177708,-86.750043,Powered211807,99.0,1


In [15]:
# Print dtypes and memory use of the frame loaded from the pickle, for comparison
# with the `.info()` output taken before pickling.
july.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25066524 entries, 0 to 25075444
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   pubdatetime  datetime64[ns]
 1   latitude     float64       
 2   longitude    float64       
 3   sumdid       object        
 4   chargelevel  float64       
 5   companyname  int64         
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 1.3+ GB
