<a href="https://colab.research.google.com/github/Ishmeet7/Ishme_DS_242EX/blob/main/Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and Importing packages

In [None]:
import pandas as pd #importing libraries
import numpy as np
from numpy import radians, cos, sin, arcsin, sqrt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading data

In [None]:
# Load the BMTC data (parquet) into a dataframe.
# In case of error, install pyarrow using: pip install pyarrow
df = pd.read_parquet('/content/drive/MyDrive/Data/BMTC.parquet.gzip', engine='pyarrow')
# Companion CSV files: the input frame and the ground-truth frame.
dfInput = pd.read_csv('/content/drive/MyDrive/Data/Input.csv')
dfGroundTruth = pd.read_csv('/content/drive/MyDrive/Data/GroundTruth.csv')

In [None]:
# Display the loaded BMTC frame (pandas truncates the rendering to head/tail rows).
df


Unnamed: 0,BusID,Latitude,Longitude,Speed,Timestamp
0,150212121,13.074558,77.445549,0.0,2019-08-01 07:00:02
1,150212121,13.074558,77.445549,0.0,2019-08-01 07:00:12
2,150212121,13.074558,77.445549,0.0,2019-08-01 07:00:22
3,150212121,13.074558,77.445549,0.0,2019-08-01 07:00:32
4,150212121,13.074558,77.445549,0.0,2019-08-01 07:00:42
...,...,...,...,...,...
6014941,150813395,13.097612,77.565689,30.0,2019-08-01 18:59:20
6014942,150813395,13.097538,77.564873,34.0,2019-08-01 18:59:30
6014943,150813395,13.097462,77.564049,38.0,2019-08-01 18:59:40
6014944,150813395,13.097604,77.563217,21.0,2019-08-01 18:59:50


# Exploratory Data Analysis

In [None]:
# Group the rows per bus and collect the distinct bus identifiers.
g1 = df.groupby(by='BusID')
unique = df['BusID'].unique()

# Preprocessing

In [None]:
def pre_processing(df):
  """
  Thin out rows that repeat the same (Latitude, Longitude, Speed) triple.

  For every distinct triple the first and the last occurrence are kept.
  Note that ``drop_duplicates`` compares across the whole frame, not only
  between adjacent rows.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with at least the columns 'Latitude', 'Longitude' and 'Speed'.
      It is not modified.

  Returns
  -------
  pandas.DataFrame
      The first occurrences (in their original order) followed by the last
      occurrences that are not already among them (in index order).
  """
  key_cols = ['Latitude', 'Longitude', 'Speed']
  d1 = df.drop_duplicates(subset=key_cols, keep="first")
  d2 = df.drop_duplicates(subset=key_cols, keep="last")
  # Index.difference yields a deterministic, sorted label list; indexing .loc
  # with a raw Python set is rejected by recent pandas versions.
  extra_last = d2.index.difference(d1.index)
  d3 = pd.concat([d1, d2.loc[extra_last]])
  # Hand the result back to the caller; without the return the frame built
  # here would be lost when the function exits.
  return d3

# Feature Extraction

In [None]:
#calculating Haversine distance between two points on earth
def haversine(lon1, lat1, lon2, lat2, radius_km=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : scalar, sequence, numpy array or pandas Series
        Longitude / latitude of the first point(s) in decimal degrees.
    lon2, lat2 : scalar, sequence, numpy array or pandas Series
        Longitude / latitude of the second point(s) in decimal degrees.
    radius_km : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371.

    Returns
    -------
    numpy scalar or numpy.ndarray
        Element-wise distance. pandas inputs are combined by position
        (their index labels are ignored).
    """
    # np.asarray accepts scalars, lists, arrays and Series alike, so the
    # function no longer requires inputs that expose ``.values``.
    lon1, lat1, lon2, lat2 = (
        np.radians(np.asarray(coord)) for coord in (lon1, lat1, lon2, lat2)
    )

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Haversine formula.
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * (np.cos(lat2) * np.sin(dlon / 2) ** 2)
    # Rounding can push ``a`` marginally outside [0, 1], which would turn
    # arcsin(sqrt(a)) into NaN; NaN inputs still propagate through clip.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return c * radius_km

In [None]:
# Build one training sample per pair of successive rows of `d3`:
# the previous row is the source, the current row is the destination.
# NOTE(review): no visible cell binds `d3` at notebook scope (it only appears as
# a local inside pre_processing) - confirm it is assigned before this cell runs.
# NOTE(review): the lag is taken over the whole frame, so the "previous" row can
# belong to a different BusID - consider shifting per bus.
d7 = d3.shift(1)             # lag of p=1 on every column (used for Timestamp)

d4 = d3.loc[:, ['Latitude', 'Longitude']].rename(
    columns={'Latitude': 'Dest_Lat', 'Longitude': 'Dest_Long'})
d5 = d3.loc[:, ['Latitude', 'Longitude']].shift(periods=1).rename(
    columns={'Latitude': 'Source_Lat', 'Longitude': 'Source_Long'})
d6 = pd.concat([d5, d4], axis=1)

distance = haversine(d4['Dest_Long'], d4['Dest_Lat'], d5['Source_Long'], d5['Source_Lat'])

# Elapsed time between the two rows, in minutes. total_seconds() covers the full
# signed span; Timedelta.seconds only holds the 0-86399 seconds component, so it
# drops whole days and misreports negative gaps.
time = (d3['Timestamp'] - d7['Timestamp']).dt.total_seconds() / 60

d6['Distance'] = distance   # adding new columns to dataframe
d6['Duration'] = time

In [None]:
# List the columns assembled in the feature frame d6.
d6.columns

Index(['Source_Lat', 'Source_Long', 'Dest_Lat', 'Dest_Long', 'Distance',
       'Duration'],
      dtype='object')

In [None]:
d6.head()  # preview the first rows of the feature frame built above

Unnamed: 0,Source_Lat,Source_Long,Dest_Lat,Dest_Long,Distance,Duration
0,,,13.074558,77.445549,,
152,13.074558,77.445549,13.074558,77.445549,0.0,25.233333
153,13.074558,77.445549,13.074113,77.445282,0.057313,0.166667
154,13.074113,77.445282,13.07406,77.445267,0.006113,0.166667
173,13.07406,77.445267,13.07406,77.445267,0.0,3.15


In [None]:
# Turn +/-infinity into NaN so that the dropna below removes those rows as well.
d6.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop the last two and then the first two rows of the frame.
d6.drop(d6.tail(2).index, inplace=True)
d6.drop(d6.head(2).index, inplace=True)

# Remove every row that still has a missing value. The infinities were already
# converted above, so the deprecated global option 'mode.use_inf_as_na' is not
# needed (and is no longer switched on for the rest of the notebook).
d6.dropna(how='any', inplace=True)

In [None]:
# Display the feature frame after cleaning.
d6

In [None]:
# Training features: every column of d6 except the target 'Duration'.
X_train = d6.drop(columns=['Duration'])

In [None]:
# Display the training features.
X_train

In [None]:
y_train=d6['Duration']  # target variable: the 'Duration' column of d6

In [None]:
# Display the training target.
y_train

In [None]:
# Work on a copy: assigning dfInput directly would only alias it, so the column
# changes below would alter the loaded input frame and a second run of this cell
# would fail when dropping the already-removed "Unnamed: 0" column.
test_df = dfInput.copy()

distance1 = haversine(test_df['Dest_Long'], test_df['Dest_Lat'],
                      test_df['Source_Long'], test_df['Source_Lat'])

test_df['Distance'] = distance1         # creating distance attribute in testing csv
print(test_df.columns)

test_df.drop(columns="Unnamed: 0", inplace=True)  # dropping extra column from testing csv

X_test = test_df

In [None]:
# Convert +/-infinity entries of the test features into NaN (in place).
X_test.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)


In [None]:
# Checking for NaN values of 'Distance' in the testing data.
X = X_test['Distance'].isna()
# Print every row whose distance is missing. Rows are selected by position with
# .iloc; X_test[<int>] would be treated as a column lookup and raise KeyError.
for row_pos in np.flatnonzero(X.to_numpy()):
  print(X_test.iloc[row_pos])
c = len(X)   # total number of rows inspected
print(c)


In [None]:
# Bind the ground-truth frame to y_test (same object, no copy) and list its columns.
y_test=dfGroundTruth
print(y_test.columns)
# y_test.drop("Unnamed: 0",axis=1,inplace=True)

In [None]:
# Put features and ground truth side by side, then discard every row that has
# a missing value in any column.
X_test = pd.concat([X_test, y_test], axis='columns').dropna(how='any')

In [None]:
# Target for evaluation: the 'TT' column of the combined frame.
y_test=X_test['TT']
# NOTE: without inplace=True or an assignment this drop leaves X_test unchanged;
# as the last expression of the cell it only displays a copy without 'TT'.
X_test.drop('TT',axis=1)

# Linear Regression Model

In [None]:
# Linear regression model with scikit-learn's default settings.
reg_model=LinearRegression()

In [None]:
# Fit the linear model on the training features and the 'Duration' target.
reg_model.fit(X_train, y_train)

In [None]:
# Inspect which columns X_test currently holds.
X_test.columns

In [None]:
# Remove the "Unnamed: 0" helper column from the test frame (in place).
X_test.drop(columns="Unnamed: 0", inplace=True)

In [None]:
# Evaluation target: the 'TT' column, taken before it is removed from X_test.
y_test=X_test['TT']

In [None]:
# Remove the target column so that X_test holds features only (in place).
X_test.drop(columns="TT", inplace=True)

In [None]:
# For scikit-learn regressors, score() is the coefficient of determination (R^2).
print(reg_model.score(X_test, y_test))

# Random Forest Model

In [None]:
# train_size=0.006 keeps 0.6 % of the rows in Xtrain/ytrain; random_state fixes the split.
# Xtest and ytest are not referenced by the other visible cells.
Xtrain,Xtest,ytrain,ytest = train_test_split(X_train,y_train, train_size = 0.006, random_state=42) 

In [None]:
# Random forest with a fixed seed and out-of-bag scoring enabled, fitted on the
# sub-sampled training split.
forest_model = RandomForestRegressor(oob_score=True, random_state=1)
forest_model.fit(Xtrain, ytrain)

RandomForestRegressor(oob_score=True, random_state=1)

In [None]:
# Predict on the test features and report the mean absolute error against y_test.
pred = forest_model.predict(X_test)
forest_mae = mean_absolute_error(y_test, pred)
print("Mean absolute error obtained is:", forest_mae)

Mean absolute error obtained is: 485.8630272475795


In [None]:
#dfInput['ETT']=pred

In [None]:
#dfInput

Unnamed: 0,Source_Lat,Source_Long,Dest_Lat,Dest_Long,Distance,ETT
0,12.941644,77.557335,12.942002,77.551605,0.622237,3.092333
1,12.845487,77.662079,12.845881,77.667892,0.631720,31.854167
2,12.973492,77.622871,12.957303,77.621246,1.808726,470.183333
3,12.819298,77.688995,12.814241,77.692986,0.709538,25.009833
4,12.973240,77.615402,13.016170,77.627800,4.958999,693.851667
...,...,...,...,...,...,...
1200,13.097792,77.591736,13.154475,77.568077,6.803679,598.720500
1201,12.794702,77.624046,12.863790,77.616608,7.724445,631.248000
1202,12.922270,77.743591,12.983501,77.752258,6.873050,669.910167
1203,12.918303,77.589500,12.923568,77.655655,7.193703,847.488667


In [None]:
# Display the ground-truth frame.
dfGroundTruth

Unnamed: 0.1,Unnamed: 0,TT
0,0,2.833333
1,1,1.500000
2,2,21.250000
3,3,2.000000
4,4,35.733333
...,...,...
1200,1200,27.233333
1201,1201,26.750000
1202,1202,53.600000
1203,1203,59.616667


# For Evaluation
1. Function for evaluation: `EstimatedTravelTime`

2. Function arguments:
    
    a. df: It is a pandas dataframe that contains the data from BMTC.parquet.gzip
   
    b. dfInput: It is a pandas dataframe that contains the input from Input.csv

3. Returns:

    a. dfOutput: It is a pandas dataframe that contains the output


In [None]:
def EstimatedTravelTime(df, dfInput): # The output of this function will be evaluated
    """
    Evaluation entry point.

    Parameters
    ----------
    df : pandas.DataFrame
        Data loaded from BMTC.parquet.gzip.
    dfInput : pandas.DataFrame
        Input loaded from Input.csv.

    Returns
    -------
    pandas.DataFrame
        The output frame. The body is still the unfilled template: neither
        argument is read and an empty frame is returned.
    """
    # Function body - Begins
    # Make changes here.
    placeholder_output = pd.DataFrame()
    # Function body - Ends
    return placeholder_output

In [None]:
# Run the evaluation entry point on the loaded BMTC frame and the input frame.
dfOutput = EstimatedTravelTime(df, dfInput)

# AutoML

In [None]:
#!apt install -y build-essential swig curl

In [None]:
# !pip install auto-sklearn

In [None]:
# Install packages
# !curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
# !pip install auto-sklearn

In [None]:
# from autosklearn.regression import AutoSklearnRegressor
# # Create the AutoSklearnRegessor
# sklearn = AutoSklearnRegressor(time_left_for_this_task=360,per_run_time_limit=30,memory_limit=5000,n_jobs=-1)
# # Fit the training data
# sklearn.fit(Xtrain, ytrain)
# # Sprint Statistics
# print(sklearn.sprint_statistics())
# # Predict the validation data
# pred_sklearn = sklearn.predict(X_test)
# # Compute the RMSE
# rmse_sklearn=MSE(y_test, pred_sklearn)**0.5
# print('RMSE: ' + str(rmse_sklearn))

In [None]:
# sklearn.get_models_with_weights()

In [None]:
# sklearn.leaderboard()

In [None]:
# from autosklearn.regression import AutoSklearnRegressor
# # Create the AutoSklearnRegessor
# sklearn = AutoSklearnRegressor(time_left_for_this_task=360,per_run_time_limit=45,memory_limit=5500,n_jobs=-1)
# # Fit the training data
# sklearn.fit(Xtrain, ytrain)
# # Sprint Statistics
# print(sklearn.sprint_statistics())
# # Predict the validation data
# pred_sklearn = sklearn.predict(X_test)
# # Compute the RMSE
# rmse_sklearn=MSE(y_test, pred_sklearn)**0.5
# print('RMSE: ' + str(rmse_sklearn))

In [None]:
# sklearn.leaderboard()

In [None]:
# from autosklearn.regression import AutoSklearnRegressor
# # Create the AutoSklearnRegessor
# sklearn = AutoSklearnRegressor(time_left_for_this_task=36000,per_run_time_limit=3600,memory_limit=5000,n_jobs=-1)
# # Fit the training data
# sklearn.fit(Xtrain, ytrain)
# # Sprint Statistics
# print(sklearn.sprint_statistics())
# # Predict the validation data
# pred_sklearn = sklearn.predict(X_test)
# # Compute the RMSE
# rmse_sklearn=MSE(y_test, pred_sklearn)**0.5
# print('RMSE: ' + str(rmse_sklearn))

In [None]:
# import matplotlib.pyplot as plt 
# # Scatter plot true and predicted values
# plt.scatter(pred_sklearn, y_val, alpha=0.2)
# plt.xlabel('predicted')
# plt.ylabel('true value')