In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from multiprocessing.pool import ThreadPool
from sklearn.metrics import mean_squared_error
import time
from collections import Counter


pool = ThreadPool(processes=1)

sns.set()

SEED = 42    # fixed seed so the random splits below are reproducible

data = pd.read_csv('train.csv', nrows=10000000)    # first 10 million rows of train.csv

# 1 million random rows for training.
training_set = data.sample(n=1000000, replace=False, random_state=SEED)
# 10000 random rows for development, drawn only from rows that are NOT in the
# training sample so that no row is shared between the two sets.
dev_set = data.drop(training_set.index).sample(n=10000, replace=False, random_state=SEED)
test_set = pd.read_csv('test.csv')

# Shape, data type, data sample and descriptive statistics of Training Set

### Shape

In [None]:
training_set.shape

### Data Type

In [None]:
training_set.dtypes

### Data Sample

In [None]:
training_set.head()

### Descriptive statistics 

In [None]:
training_set.describe()

# Shape, data type, data sample and descriptive statistics of Development Set

### Shape

In [None]:
dev_set.shape

### Data Type

In [None]:
dev_set.dtypes

### Data Sample

In [None]:
dev_set.head()

### Descriptive statistics 

In [None]:
dev_set.describe()

# Shape, data type, data sample and descriptive statistics of Test Set



### Shape

In [None]:
test_set.shape

### Data Type

In [None]:
test_set.dtypes

### Data Sample

In [None]:
test_set.head()

### Descriptive statistics 

In [None]:
test_set.describe()

# Data Cleansing

## Removing Null values from the datasets

In [None]:
training_set.isnull().sum().sort_values(ascending = False)

In [None]:
dev_set.isnull().sum().sort_values(ascending = False)

In [None]:
test_set.isnull().sum().sort_values(ascending = False)

In [None]:
# Keep only the rows that have no missing value in any column.
# (dropna replaces the former `.isnull().any(1)` lookup, whose positional
# axis argument is not accepted by recent pandas releases.)
training_set = training_set.dropna(axis=0, how='any')
training_set.shape

In [None]:
# Keep only the rows that have no missing value in any column.
# (dropna replaces the former `.isnull().any(1)` lookup, whose positional
# axis argument is not accepted by recent pandas releases.)
dev_set = dev_set.dropna(axis=0, how='any')
dev_set.shape

In [None]:
# Keep only the rows that have no missing value in any column.
# (dropna replaces the former `.isnull().any(1)` lookup, whose positional
# axis argument is not accepted by recent pandas releases.)
test_set = test_set.dropna(axis=0, how='any')
test_set.shape

## Removing rows from the dataset having invalid data 

In [None]:
training_set.describe()

In [None]:
training_set['fare_amount'].describe()

### The training set contains zero or negative fares, so we remove those rows

In [None]:
Counter(training_set['fare_amount']<=0)

In [None]:
# Rows whose fare is zero or negative are discarded by index label.
non_positive_fare = training_set['fare_amount'] <= 0
training_set = training_set.drop(index=training_set.index[non_positive_fare])
training_set.shape

In [None]:
training_set['fare_amount'].describe()

### Passenger count is greater than  7

In [None]:
training_set['passenger_count'].describe()

In [None]:
#len(training_set[training_set['passenger_count']>7])

In [None]:
#### We restrict ourselves to yellow cabs (medallion taxis), for which we assume
#### that at most 7 passengers can ride in a taxi at a time.
# NOTE(review): the text above speaks of a 7-passenger maximum, but the
# comparison below uses 8 as the threshold - confirm which limit is intended.
# Tally how many rows have passenger_count > 8 (True) versus not (False).
Counter(training_set['passenger_count']>8)

In [None]:
# Discard rows reporting more than 8 passengers, then re-check the summary.
too_many_passengers = training_set['passenger_count'] > 8
training_set = training_set.drop(index=training_set.index[too_many_passengers])
training_set.describe()

## Removing fare amounts less than $2.5 and greater than $500

In [None]:
# Number of fares below 2.5, printed with the ANSI bold prefix.
fare_values = training_set['fare_amount']
low_fare_count = fare_values[fare_values < 2.5].count()
print('\033[1m' + "Fare amount < 2.5: \n", low_fare_count)

In [None]:
# Number of fares above 500, printed with the ANSI bold prefix.
fare_values = training_set['fare_amount']
high_fare_count = fare_values[fare_values > 500].count()
print('\033[1m' + "Fare amount > 500: \n", high_fare_count)

In [None]:
# Remove every row whose fare lies outside the accepted 2.5 .. 500 window.
fare = training_set['fare_amount']
fare_out_of_range = (fare < 2.5) | (fare > 500)
training_set = training_set.drop(index=training_set.index[fare_out_of_range])
training_set.shape

In [None]:
# Histogram of passenger counts, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(training_set['passenger_count'], bins=15)
ax.set_xlabel('No. of Passengers')
ax.set_ylabel('Frequency')

In [None]:
# Fare against passenger count, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(training_set['passenger_count'], training_set['fare_amount'], c='r', marker='x')
ax.set_xlabel('No. of Passengers')
ax.set_ylabel('Fare')

### Removing invalid Longitude and Latitude 

In [None]:
training_set[['pickup_longitude','pickup_latitude']].describe()

In [None]:
training_set[['dropoff_longitude','dropoff_latitude']].describe()

#### Valid longitude values: -180 to 180
#### Valid latitude values: -90 to 90
#### Remove pickup and drop-off coordinates that do not fall within these ranges

##### Remove invalid pickup coordinates

In [None]:
# Build one boolean mask from the four range checks and count its True values.
# (OR-ing the filtered DataFrames themselves, as before, does not count rows.)
invalid_pickup = (
    (training_set['pickup_latitude'] < -90)
    | (training_set['pickup_latitude'] > 90)
    | (training_set['pickup_longitude'] < -180)
    | (training_set['pickup_longitude'] > 180)
)
print("Number of invalid pickup cordinates = ", int(invalid_pickup.sum()))

In [None]:
# Drop every row whose pickup point lies outside the valid latitude/longitude range.
pickup_lat = training_set['pickup_latitude']
pickup_lon = training_set['pickup_longitude']
bad_pickup = (pickup_lat < -90) | (pickup_lat > 90) | (pickup_lon < -180) | (pickup_lon > 180)
training_set = training_set.drop(index=training_set.index[bad_pickup])
training_set.shape

#### Remove invalid drop-off coordinates

In [None]:
# Build one boolean mask from the four range checks and count its True values.
# (OR-ing the filtered DataFrames themselves, as before, does not count rows.)
invalid_dropoff = (
    (training_set['dropoff_latitude'] < -90)
    | (training_set['dropoff_latitude'] > 90)
    | (training_set['dropoff_longitude'] < -180)
    | (training_set['dropoff_longitude'] > 180)
)
print("Number of invalid drop off cordinates = ", int(invalid_dropoff.sum()))

In [None]:
# Drop every row whose drop-off point lies outside the valid latitude/longitude range.
dropoff_lat = training_set['dropoff_latitude']
dropoff_lon = training_set['dropoff_longitude']
bad_dropoff = (dropoff_lat < -90) | (dropoff_lat > 90) | (dropoff_lon < -180) | (dropoff_lon > 180)
training_set = training_set.drop(index=training_set.index[bad_dropoff])
training_set.shape

In [None]:
training_set.describe()

## Distance between pick up location and drop off location
#### The distance in a sphere can be calculated when latitudes and longitudes are given by Haversine formula
#### haversine(θ) = sin²(θ/2)

#### φ is latitude, λ is longitude and R is the Earth's radius (mean radius = 6,371 km); the subscripts A and B denote the two points (here the pickup and drop-off locations).

#### a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)

#### c = 2 * atan2( √a, √(1−a) )

#### d = R ⋅ c

#### d = Haversine distance

In [None]:
def haversine_distance(data, radius_km=6371):
    """Haversine (great-circle) distance between pickup and drop-off points.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude' and 'dropoff_latitude', expressed in degrees.
    radius_km : float, optional
        Radius of the sphere. Defaults to 6371, the mean radius of the
        earth in kilometres, so the result is in kilometres.

    Returns
    -------
    pandas.Series
        One distance per row of ``data``, sharing its index. The Series is
        left unnamed, and is expressed in the unit of ``radius_km``.
    """
    phi1 = np.radians(data['pickup_latitude'])
    phi2 = np.radians(data['dropoff_latitude'])

    delta_phi = np.radians(data['dropoff_latitude'] - data['pickup_latitude'])
    delta_lambda = np.radians(data['dropoff_longitude'] - data['pickup_longitude'])

    # a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)
    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for near-antipodal
    # points), which would turn sqrt(1 - a) into NaN; clamp it first.
    a = a.clip(lower=0.0, upper=1.0)

    # c = 2 * atan2( √a, √(1−a) )
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius_km * c
    

### Finding Training set distance

In [None]:
# Compute the pickup/drop-off distance and attach it as the 'Distance' column.
# A 'Distance' column left over from an earlier run of this cell is dropped
# first, so re-running the cell cannot create duplicate columns.
training_set_distance = haversine_distance(training_set).to_frame('Distance')
training_set = pd.concat(
    [training_set.drop(columns='Distance', errors='ignore'), training_set_distance],
    axis=1,
)

In [None]:
training_set.head()

In [None]:
# Histogram of the computed distances, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(14, 6))
ax.hist(training_set['Distance'], bins=40)
ax.set_xlabel('Distance')
ax.set_ylabel('Frequency')

In [None]:
training_set.describe()

In [None]:
# Fare against distance for the training set.
plt.scatter(training_set['Distance'], training_set['fare_amount'], c='r', marker='x')
plt.title('Training_set Distance Vs Fare Amount', size=18)
# The distance is computed with the earth radius given in kilometres, so the
# axis is labelled in km (it was previously mislabelled as meters).
plt.xlabel('Distance (km)', size=14)
plt.ylabel('Fare Amount ($)', size=14)
plt.show()

### Finding Dev set distance 

In [None]:
# Compute the pickup/drop-off distance and attach it as the 'Distance' column.
# A 'Distance' column left over from an earlier run of this cell is dropped
# first, so re-running the cell cannot create duplicate columns.
dev_set_distance = haversine_distance(dev_set).to_frame('Distance')
dev_set = pd.concat(
    [dev_set.drop(columns='Distance', errors='ignore'), dev_set_distance],
    axis=1,
)

In [None]:
dev_set.head()

In [None]:
dev_set.describe()

In [None]:
# Fare against distance for the development set.
plt.scatter(dev_set['Distance'], dev_set['fare_amount'], c='r', marker='x')
plt.title('Dev_set Distance Vs Fare Amount', size=18)
# The distance is computed with the earth radius given in kilometres, so the
# axis is labelled in km (it was previously mislabelled as meters).
plt.xlabel('Distance (km)', size=14)
plt.ylabel('Fare Amount ($)', size=14)
plt.show()

## Support Vector Regression on Feature Distance 

In [None]:
def svr_linear(X,y):
    """Fit a linear-kernel SVR (C=7, gamma='auto') on X and y.

    Returns a tuple ``(fitted_model, started_at)`` where ``started_at`` is
    the ``time.time()`` reading taken immediately before fitting began.
    """
    started_at = time.time()
    model = SVR(kernel='linear', C=7, gamma='auto')
    model.fit(X, y)
    return model, started_at

def svr_RBF(X,y):
    """Fit an RBF-kernel SVR (C=15, gamma=0.1, epsilon=0.1) on X and y.

    Returns a tuple ``(fitted_model, started_at)`` where ``started_at`` is
    the ``time.time()`` reading taken immediately before fitting began.
    """
    started_at = time.time()
    model = SVR(kernel='rbf', C=15, gamma=0.1, epsilon=.1)
    model.fit(X, y)
    return model, started_at

In [None]:
# Random 10000-row training sample: Distance as a single-column matrix, fare as target.
train_data = training_set.sample(n=10000, replace=False)
X = train_data['Distance'].to_numpy().reshape(-1, 1)
y = train_data['fare_amount'].to_numpy()

# Random 4256-row development sample, shaped the same way.
dev_data = dev_set.sample(n=4256, replace=False)
X_predict = dev_data['Distance'].to_numpy().reshape(-1, 1)
y_predict = dev_data['fare_amount'].to_numpy()

In [None]:
# Queue both fits on the thread pool. The pool is created with processes=1,
# so the linear fit does not start until the RBF fit has finished.
t1 = pool.apply_async(svr_RBF, (X,y)) 
t2 = pool.apply_async(svr_linear, (X,y)) 

# .get() blocks until the corresponding task is done. Each helper returns the
# fitted model together with the timestamp taken just before its fit started.
# NOTE(review): no end-of-fit timestamp is recorded, so any elapsed time derived
# later from these start times also covers whatever ran after the fit.
svr_rbf, rbf_start_time = t1.get()
svr_lin, lin_start_time = t2.get()

In [None]:
# Score the linear model on the development sample.
SVR_Lin_predict = svr_lin.predict(X_predict)
SVR_Lin_mse = mean_squared_error(y_predict, SVR_Lin_predict)
SVR_Lin_rmse = np.sqrt(SVR_Lin_mse)

# Score the RBF model on the development sample.
SVR_RBF_predict = svr_rbf.predict(X_predict)
SVR_RBF_mse = mean_squared_error(y_predict, SVR_RBF_predict)
SVR_RBF_rmse = np.sqrt(SVR_RBF_mse)

# NOTE(review): the "run time" figures are measured from the moment each fit
# started up to the print call, so they cover more than the fit itself.
print("Size of training data Set = ", len(train_data) ,"and size of validation data set = ", len(dev_data))
print("RMSE of Linear support vector regressor = " , SVR_Lin_rmse)
print("Run time of Linear support vector regressor = " , time.time() - lin_start_time  , "seconds")
print("\nRMSE of RBF support vector regressor = " , SVR_RBF_rmse)
print("Run time of RBF support vector regressor = " , time.time() - rbf_start_time , "seconds")

## From the above experiment we observed that the taxi fare doesn't depend only on distance. So, we need to add other features. 

### Checking data types of each column

In [None]:
training_set.dtypes

### Traffic also plays a role in the fare amount: the time and day at which the taxi was hired tell us something about it

### Key and pickup_datetime are datetime columns which are in object format. Convert them to datetime format

#### Converting Training set key and pickup_datetime to datetime

In [None]:
# Only pickup_datetime is parsed; the 'key' column is left in its original form.
training_set = training_set.assign(
    pickup_datetime=pd.to_datetime(training_set['pickup_datetime'])
)

In [None]:
training_set.dtypes

In [None]:
training_set.head()

#### Converting Dev set key and pickup_datetime to datetime

In [None]:
# Only pickup_datetime is parsed; the 'key' column is left in its original form.
dev_set = dev_set.assign(
    pickup_datetime=pd.to_datetime(dev_set['pickup_datetime'])
)

In [None]:
dev_set.dtypes

In [None]:
dev_set.head()

## Splitting the datetime field 'pickup_datetime' into the following -


*   year
*   month
*   date
*   hour
*   day of week

## Using these we shall calculate the day of the week

In [None]:
def date_time_conversion(data):
    """Break the 'pickup_datetime' column of ``data`` into calendar parts.

    ``data['pickup_datetime']` must be a datetime column (its ``.dt``
    accessor is used). The returned DataFrame shares the index of ``data``
    and has the columns 'year', 'month', 'date' (day of the month), 'hour'
    and 'day_of_week', in that order.
    """
    stamp = data['pickup_datetime'].dt

    return pd.DataFrame(data={
        'year': stamp.year,
        'month': stamp.month,
        'date': stamp.day,
        'hour': stamp.hour,
        'day_of_week': stamp.dayofweek,
    })

### Finding datetime for training set

In [None]:
# Derive the calendar columns and attach them to the training set.
# Columns with the same names left over from an earlier run of this cell are
# dropped first, so re-running the cell cannot create duplicate columns.
training_set_time_value = date_time_conversion(training_set)
training_set = pd.concat(
    [training_set.drop(columns=training_set_time_value.columns, errors='ignore'),
     training_set_time_value],
    axis=1,
)

In [None]:
training_set.head()

In [None]:
training_set.describe()

### Finding datetime of dev set

In [None]:
# Derive the calendar columns and attach them to the development set.
# Columns with the same names left over from an earlier run of this cell are
# dropped first, so re-running the cell cannot create duplicate columns.
dev_set_time_value = date_time_conversion(dev_set)
dev_set = pd.concat(
    [dev_set.drop(columns=dev_set_time_value.columns, errors='ignore'),
     dev_set_time_value],
    axis=1,
)

In [None]:
dev_set.head()

In [None]:
dev_set.describe()

### Support Vector Regression
#### Taking all features of training_set into consideration

In [None]:
training_set.columns

In [None]:
dev_set.columns

### Exclude the key and pickup_datetime columns, as identifier and datetime columns cannot be used directly while modeling.


In [None]:
# The identifier and the raw timestamp are not used as model inputs.
non_feature_columns = ['key', 'pickup_datetime']
Data_training_set = training_set.drop(columns=non_feature_columns)
Data_dev_set = dev_set.drop(columns=non_feature_columns)

In [None]:
Data_training_set.columns

In [None]:
Data_dev_set.columns

In [None]:
# This experiment is meant to use all features, so every remaining column
# except the target is a model input (previously only 'Distance' was selected,
# which merely repeated the first experiment).
feature_columns = [col for col in Data_training_set.columns if col != 'fare_amount']

training_data = Data_training_set.sample(n=10000 , replace = False)
dev_data = Data_dev_set.sample(n=1000 , replace = False)

# The features live on very different scales (coordinates, years, hours, ...),
# so standardise them using statistics from the training sample only; the same
# transformation is then applied to the development sample.
feature_mean = training_data[feature_columns].mean()
feature_std = training_data[feature_columns].std().replace(0, 1)    # guard constant columns

x_train = ((training_data[feature_columns] - feature_mean) / feature_std).to_numpy()
y_train = training_data['fare_amount'].to_numpy()

x_val = ((dev_data[feature_columns] - feature_mean) / feature_std).to_numpy()
y_val = dev_data['fare_amount'].to_numpy()

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
x_val.shape

In [None]:
y_val.shape

In [None]:
# Reuse the thread pool created at the top of the notebook instead of building
# a second one: rebinding `pool` here left the first pool open and unreachable.
t3 = pool.apply_async(svr_RBF, (x_train,y_train))
t4 = pool.apply_async(svr_linear, (x_train,y_train))

# .get() blocks until the task is done; each helper returns the fitted model
# together with the timestamp taken just before its fit started.
svr_rbf, rbf_start_time = t3.get()
svr_lin, lin_start_time = t4.get()

In [None]:
# Score the linear model on the validation sample.
SVR_Lin_predict = svr_lin.predict(x_val)
SVR_Lin_mse = mean_squared_error(y_val, SVR_Lin_predict)
SVR_Lin_rmse = np.sqrt(SVR_Lin_mse)

# Score the RBF model on the validation sample.
SVR_RBF_predict = svr_rbf.predict(x_val)
SVR_RBF_mse = mean_squared_error(y_val, SVR_RBF_predict)
SVR_RBF_rmse = np.sqrt(SVR_RBF_mse)

# NOTE(review): the "run time" figures are measured from the moment each fit
# started up to the print call, so they cover more than the fit itself.
print("RMSE of Linear support vector regressor = " , SVR_Lin_rmse)
print("Run time of Linear support vector regressor = " , time.time() - lin_start_time , "seconds")
print("\nRMSE of RBF support vector regressor = " , SVR_RBF_rmse)
print("Run time of RBF support vector regressor = " , time.time() - rbf_start_time , "seconds")