# David Walker - COS 710 Assignment 1, Initial Setup

**Imports**

In [56]:
import pandas as pd
import numpy as np
import math

**Parameters**

In [57]:
# Proportion of dataset to be used for training - utility value, used only for calculation purposes.
training_proportion = 0.7

**CSV Import**

In [58]:
df = pd.read_csv(r'For_modeling.csv', index_col=0)
print(df.columns)

Index(['Duration', 'Distance', 'PLong', 'PLatd', 'DLong', 'DLatd', 'Haversine',
       'Pmonth', 'Pday', 'Phour', 'Pmin', 'PDweek', 'Dmonth', 'Dday', 'Dhour',
       'Dmin', 'DDweek', 'Temp', 'Precip', 'Wind', 'Humid', 'Solar', 'Snow',
       'GroundTemp', 'Dust'],
      dtype='object')


In [59]:
df.head(1)

Unnamed: 0,Duration,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,Phour,...,Dmin,DDweek,Temp,Precip,Wind,Humid,Solar,Snow,GroundTemp,Dust
0,3,50,37.544666,126.888359,37.544666,126.888359,0.0,1,1,0,...,4,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0


In [60]:
df.loc[:4, 'Pmin']

0    0
1    0
2    0
3    1
4    1
Name: Pmin, dtype: int64

**Data exploration**

In [61]:
# Check the length of the dataframe.
count_row = df.shape[0]
print('Row count: ' + str(count_row))

# For reference, check how many rows will be used for testing and training
training_row_count = math.ceil(training_proportion * count_row)
print('Training row count: ' + str(training_row_count))

testing_row_count = math.floor((1 - training_proportion) * count_row)
print('Testing row count: ' + str(testing_row_count))

# Accounting to make sure no lines are duplicated
accounting_rows = training_row_count + testing_row_count
accounting_difference = count_row - accounting_rows
print('Balance of rows (should be 0): ' + str(accounting_difference))

Row count: 9601139
Training row count: 6720798
Testing row count: 2880341
Balance of rows (should be 0): 0


In [62]:
print(df.columns)

Index(['Duration', 'Distance', 'PLong', 'PLatd', 'DLong', 'DLatd', 'Haversine',
       'Pmonth', 'Pday', 'Phour', 'Pmin', 'PDweek', 'Dmonth', 'Dday', 'Dhour',
       'Dmin', 'DDweek', 'Temp', 'Precip', 'Wind', 'Humid', 'Solar', 'Snow',
       'GroundTemp', 'Dust'],
      dtype='object')


In [63]:
# Rename latitude and longitude features
df.rename({'PLong': 'Pickup_longitude', 'PLatd': 'Pickup_latitude', 'DLong': 'Dropoff_latitude', 'DLatd': 'Dropoff_latitude'}, axis=1, inplace=True)

In [64]:
# Rename pickup time features
df.rename({'PMonth': 'Pickup_month', 'Pday': 'Pickup_day', 'Phour': 'Pickup_hour', 'Pmin': 'Pickup_minute'},axis=1,inplace=True)

In [65]:
# Rename dropoff time features
df.rename({'Dmonth': 'Dropoff_month', 'Dday': 'Dropoff_day', 'Dhour': 'Dropoff_hour', 'Dmin': 'Dropoff_minute'},axis=1,inplace=True)

In [66]:
# Rename "day of week" features
df.rename({'PDweek': 'Pickup_weekday', 'DDweek': 'Dropoff_weekday'},axis=1,inplace=True)

In [67]:
print(df.columns)

Index(['Duration', 'Distance', 'Pickup_longitude', 'Pickup_latitude',
       'Dropoff_latitude', 'Dropoff_latitude', 'Haversine', 'Pmonth',
       'Pickup_day', 'Pickup_hour', 'Pickup_minute', 'Pickup_weekday',
       'Dropoff_month', 'Dropoff_day', 'Dropoff_hour', 'Dropoff_minute',
       'Dropoff_weekday', 'Temp', 'Precip', 'Wind', 'Humid', 'Solar', 'Snow',
       'GroundTemp', 'Dust'],
      dtype='object')


In [71]:
print(df.head(4))

   Duration  Distance  Pickup_longitude  Pickup_latitude  Dropoff_latitude  \
0         3        50         37.544666       126.888359         37.544666   
1        24      7670         37.506199       127.003944         37.551250   
2         8      1390         37.544590       127.057083         37.537014   
3         8      1820         37.571102       127.023560         37.561447   

   Dropoff_latitude  Haversine  Pmonth  Pickup_day  Pickup_hour  ...  \
0        126.888359   0.000000       1           1            0  ...   
1        127.035103   5.713529       1           1            0  ...   
2        127.061096   0.913702       1           1            0  ...   
3        127.034920   1.468027       1           1            0  ...   

   Dropoff_minute  Dropoff_weekday  Temp  Precip  Wind  Humid  Solar  Snow  \
0               4                0  -3.2     0.0   0.5   40.0    0.0   0.0   
1              25                0  -3.2     0.0   0.5   40.0    0.0   0.0   
2             

In [68]:
# Create random state to be used in splitting - seed value as parameter
rng = np.random.RandomState(1)

# Split training set and testing set into two seperate dataframes
train_df = df.sample(frac=training_proportion, random_state=rng)
test_df = df.loc[~df.index.isin(train_df.index)]

In [69]:
# Check length of each dataframe
train_df_row_count = train_df.shape[0]
print('Training dataframe row count: ' + str(train_df_row_count))

test_df_row_count = test_df.shape[0]
print('Testing dataframe row count: ' + str(test_df_row_count))

Training dataframe row count: 6720797
Testing dataframe row count: 2880342


In [70]:
# Export newly-made training and testing datasets to CSV for easy future work (and ease RAM usage!)
train_df.to_csv('training.csv', index = False)
test_df.to_csv('testing.csv', index = False)

**A new notebook will be created from here, now that the datasets are set up.**