<a href="https://colab.research.google.com/github/Ishmeet7/Ishme_DS_242EX/blob/main/Predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and Importing packages

In [1]:
import pandas as pd #importing libraries
import numpy as np
import matplotlib.pyplot as plt
from numpy import radians, cos, sin, arcsin, sqrt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive; the loading cell below reads its files from that path.
from google.colab import drive   #mount drive
drive.mount('/content/drive')

# Loading data

In [None]:
# Load the BMTC data into a dataframe.
# In case of an engine error, install pyarrow using: pip install pyarrow
df = pd.read_parquet(
    '/content/drive/MyDrive/Data/BMTC.parquet.gzip',
    engine='pyarrow',
)
# Input rows and the matching ground-truth file.
dfGroundTruth = pd.read_csv('/content/drive/MyDrive/Data/GroundTruth.csv')
dfInput = pd.read_csv('/content/drive/MyDrive/Data/Input.csv')

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
#g=df.groupby('BusID', as_index=False)["Timestamp"].sort_values()

#g["Timestamp"], format="%Y-%m-%d %H:%M:%S").sort_values()

In [None]:
dfGroundTruth

In [None]:
#df = df[(df['Latitude']>12.95)&(df['Latitude']<=13.00)&(df['Longitude']>77.55)&(df['Longitude']<=77.60)]


In [None]:
df

# Exploratory Data Analysis

In [None]:
# Distinct bus identifiers, and the trace grouped per bus.
unique = df['BusID'].unique()
g1 = df.groupby(by='BusID')

In [None]:
# Scatter of every recorded position. Latitude is on the x axis here, so the
# axes are labelled explicitly to keep the figure readable on its own.
plt.scatter(df["Latitude"], df["Longitude"])
plt.xlabel("Latitude")
plt.ylabel("Longitude")

# Preprocessing

In [None]:
def pre_processing(df):
  """Keep only the first and the last row of every (Latitude, Longitude, Speed) combination.

  The input frame is left untouched. Rows kept as "first" come before the
  rows kept only as "last" in the returned frame; a row that is both first
  and last of its combination appears once.
  """
  key_columns = ['Latitude', 'Longitude', 'Speed']
  kept = [
      df.drop_duplicates(subset=key_columns, keep=side, inplace=False)
      for side in ("first", "last")
  ]
  # Rows that are both first and last show up twice after the concat; remove the repeats.
  return pd.concat(kept).drop_duplicates()

In [None]:
# NOTE(review): the filtered frame is only displayed here, not assigned;
# the following cells keep working on the unfiltered `df`.
pre_processing(df)

## Creating a Subset of Data Based on similar route

In [None]:
g = df.groupby('BusID')

# Reference traces. The two bus IDs are hard-coded; presumably they run on the
# route of interest - verify against the data.
bus1 = g.get_group(150218010).reset_index(drop=True)
bus2 = g.get_group(150813389).reset_index(drop=True)

buses = pd.concat([bus1, bus2], axis=0)

# Model Longitude as a degree-4 polynomial of Latitude on the reference traces.
lat = np.array(buses['Latitude']).reshape(-1, 1)
poly_feature = PolynomialFeatures(4)
x_poly = poly_feature.fit_transform(lat)
reg = LinearRegression()
reg.fit(x_poly, buses['Longitude'])
y_predict = reg.predict(x_poly)
rmse = (mse(buses['Longitude'], y_predict)) ** 0.5  # fit error on the reference traces

# Keep every bus whose trace stays within an RMSE of 0.03 (in Longitude units)
# of the fitted curve. Matching traces are collected in a list and concatenated
# once, instead of growing a dataframe inside the loop.
selected = []
for i, bus in g:
    bus = bus.reset_index(drop=True)
    lat = np.array(bus['Latitude']).reshape(-1, 1)
    x_poly = poly_feature.transform(lat)  # the transformer was fitted above
    y_predict = reg.predict(x_poly)
    rmse = (mse(bus['Longitude'], y_predict)) ** 0.5
    if rmse <= 0.03:
        selected.append(bus)

count = len(selected)
cluster = pd.concat(selected, axis=0) if selected else pd.DataFrame()

print(count)


# Selected traces with the fitted curve drawn on top.
plt.figure(figsize=(10,10))
plt.scatter(cluster['Latitude'], cluster['Longitude'])
xp = np.linspace(cluster['Latitude'].min(), cluster['Latitude'].max(), 1000).reshape(-1, 1)
x_poly = poly_feature.transform(xp)
y_predict = reg.predict(x_poly)
plt.plot(xp, y_predict, color='k')

In [None]:
dfInput.columns

In [None]:
# Draw the input rows labelled 40-49 as straight source -> destination segments
# (x: latitude, y: longitude).
for row_label in range(40, 50):
    trip = dfInput.loc[row_label]
    latitudes = [trip['Source_Lat'], trip['Dest_Lat']]
    longitudes = [trip['Source_Long'], trip['Dest_Long']]
    plt.plot(latitudes, longitudes)


# Feature Extraction

In [None]:
#calculating Haversine distance between two points on earth
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point(s).
    lon2, lat2 : longitude and latitude of the second point(s).
        Each argument may be a scalar, a sequence, a NumPy array or a pandas
        Series. Values are combined element-wise by position (a Series index
        is ignored), with the usual NumPy broadcasting.

    Returns
    -------
    Distance in kilometres, as a NumPy array (a NumPy scalar for scalar input).
    """

    #Convert decimal degrees to Radians:
    lon1, lat1, lon2, lat2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lon1, lat1, lon2, lat2)
    )

    #Implementing Haversine Formula:
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1], where arcsin/sqrt return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Earth radius in kilometres

    return c * r

In [None]:
# `cluster` is a concatenation of per-bus frames whose row labels each restart
# at 0; give every row a unique label before building the lagged features.
d3 = cluster.reset_index(drop=True)
d4 = d3.loc[:, ["Latitude", "Longitude"]]

# Lag of p=1 taken inside each bus, so a row is never paired with the last
# position of a different bus. The first row of every bus gets NaN/NaT.
# Assumes each bus's rows are already in chronological order - TODO confirm.
d7 = d3.groupby('BusID')[['Latitude', 'Longitude', 'Timestamp']].shift(1)
d5 = d7.loc[:, ['Latitude', 'Longitude']]

# Previous position becomes the source, current position the destination.
d5 = d5.rename(columns={'Latitude': 'Source_Lat', 'Longitude': 'Source_Long'})
d4 = d4.rename(columns={'Latitude': 'Dest_Lat', 'Longitude': 'Dest_Long'})
d6 = pd.concat([d5, d4], axis=1)

distance = haversine(d4['Dest_Long'], d4['Dest_Lat'], d5['Source_Long'], d5['Source_Lat'])

# Minutes between two consecutive rows. total_seconds() keeps whole days and the
# sign, whereas Timedelta.seconds only holds the within-day remainder.
time = (d3['Timestamp'] - d7['Timestamp']).dt.total_seconds() / 60

d6['Distance'] = distance   #adding new columns to dataframe
d6['Duration'] = time

In [None]:
d6.columns

In [None]:
d6.info()

In [None]:
d6.describe()

In [None]:
d6.head() #created dataframe

In [None]:
# Turn +/-inf into NaN so that the dropna below removes those rows as well.
d6.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop the last and the first row. drop() works on labels, so with a non-unique
# index every row sharing the last/first label is removed, not just one row.
d6.drop(d6.tail(1).index,
        inplace = True)
d6.drop(d6.head(1).index,
        inplace = True)

# The global 'mode.use_inf_as_na' option is intentionally not set: it is
# deprecated since pandas 2.1, changes isna() for the whole session, and is
# redundant because the infinities were already replaced above.
d6.dropna(how='any', inplace=True)       #dropping rows with Nan values from new dataframe
# check = d6[d6.isna().any(axis=1)]

# d6.drop(['Duration'],axis=1)

In [None]:
# drop=True discards the old labels. Without it they are inserted as an 'index'
# column, which would then be carried into X_train as a spurious feature.
d6.reset_index(drop=True, inplace=True)
# Plot at most the first 500 segments; the bound keeps shorter frames from raising KeyError.
for i in range(min(500, len(d6))):
    plt.plot([d6['Source_Lat'][i],d6['Dest_Lat'][i]],[d6['Source_Long'][i],d6['Dest_Long'][i]])

In [None]:
# Training features: every column of d6 except the Duration target.
X_train = d6.drop(columns=['Duration'])

In [None]:
X_train

In [None]:
# Target variable: the Duration column of d6.
y_train = d6.loc[:, 'Duration']

In [None]:
y_train

In [None]:
d6.to_parquet('data.parquet.gzip',engine='pyarrow',compression='gzip')

In [None]:
# Annotated heatmap of the pairwise Pearson correlations in d6.
pearson_corr = d6.corr(method='pearson')
sns.heatmap(pearson_corr, annot=True, fmt=".4f", cmap="crest")

In [None]:
# d6[d6['Duration']>=720]
# len(d6[d6['Distance']>=30])
# Bucket Distance and Duration into hand-picked intervals to inspect their spread.
distance_edges = [-0.1, 0, 0.05, 0.1, 0.2, 0.4, 1, 2, 5, 10, 20, 40, 50]
duration_edges = [-0.1, 0, 0.1, 0.5, 1, 4, 10, 20, 50, 90, 180, 300, 500, 800, 1200, 1500]
out = pd.cut(d6['Distance'], bins=distance_edges, include_lowest=True)
out1 = pd.cut(d6['Duration'], bins=duration_edges, include_lowest=True)
out1.value_counts()

In [None]:
d6["Distance"].max(),d6["Distance"].min()

In [None]:
!pip install scipy

In [None]:

# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` is loaded on every SciPy version.
import scipy.stats
# Biased sample skewness of every column of d6.
scipy.stats.skew(d6, axis = 0, bias = True)

In [None]:
out.value_counts()

In [None]:
# use this method to create the test data along with the labels
def create_test(input,truth):      # pass the X labels and y labels as parameters
    """Build the test frame: input columns, a Distance column and the truth columns.

    Parameters
    ----------
    input : dataframe with Source_Lat, Source_Long, Dest_Lat and Dest_Long columns.
    truth : dataframe holding the labels for the same rows.

    Returns
    -------
    A new dataframe made of `input` plus a haversine 'Distance' column (infinite
    values replaced by NaN), followed by the columns of `truth`. An
    'Unnamed: 0' column is removed from either side when present.

    Neither argument is modified; the work is done on copies.
    """
    test_df = input.copy()
    #creating distance attribute
    test_df['Distance'] = haversine(test_df['Dest_Long'], test_df['Dest_Lat'],
                                    test_df['Source_Long'], test_df['Source_Lat'])
    #replacing very large values with Nan
    test_df = test_df.replace([np.inf, -np.inf], np.nan)
    if "Unnamed: 0" in truth.columns:
        truth = truth.drop("Unnamed: 0", axis=1)
    if "Unnamed: 0" in test_df.columns:
        test_df = test_df.drop("Unnamed: 0", axis=1)
    test = pd.concat([test_df,truth],axis=1)
    return test


In [None]:
# Work on a copy so that the Distance column is not written back into dfInput,
# which is later passed unchanged to EstimatedTravelTime.
test_df = dfInput.copy()

distance1=haversine(test_df['Dest_Long'],test_df['Dest_Lat'],test_df['Source_Long'],test_df['Source_Lat'])

test_df['Distance']=distance1         #creating distance attribute in testing csv

# test_df.drop("Unnamed: 0",axis=1,inplace=True) #dropping extra column from testing csv
print(test_df.columns)

X_test=test_df

In [None]:
# Only the last expression of a cell is displayed, so the mean and median are
# printed explicitly instead of being computed and discarded.
print(test_df["Distance"].mean())
print(test_df["Distance"].median())
print(test_df["Distance"].max())
print(test_df["Distance"].min())

In [None]:
X_test.replace([np.inf, -np.inf], np.nan, inplace=True) #replacing very large values with Nan


In [None]:
#checking for Nan value of distance in testing csv
X = X_test['Distance'].isna()
# Print every row whose Distance is missing. iloc selects the row by position;
# X_test[c] would look up a *column* named c and raise KeyError.
for c in np.flatnonzero(X.to_numpy()):
    print(X_test.iloc[c])
c = len(X)  # number of rows checked, as printed by the original loop counter
print(c)


In [None]:
y_test=dfGroundTruth
print(y_test.columns)
# y_test.drop("Unnamed: 0",axis=1,inplace=True)

In [None]:
y_test=y_test['TT']
# X_test.drop('TT',axis=1)

In [None]:
test=pd.concat([X_test,y_test], axis=1)  
# X_test.dropna(how='any', inplace=True) #dropping  rows containing Nan values from dataset

In [None]:
test
speed=test["Distance"]/test["TT"]
speed.min()

# Linear Regression Model

In [None]:
reg_model=LinearRegression() 
reg_model.fit(X_train, y_train)

In [None]:
#test = test[(test['Source_Lat']>12.95)&(test['Source_Lat']<=13.00)&(test['Source_Long']>77.55)&(test['Source_Long']<=77.60)]

In [None]:
test.columns

In [None]:
y_test=test['TT']

In [None]:
test.corr(method ='pearson')

In [None]:
test.drop("TT",axis=1,inplace=True)

In [None]:
# The helper column is not always present; errors="ignore" makes its removal a
# no-op in that case (matching the presence check done in create_test) and
# keeps the cell re-runnable.
test.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

In [None]:
# LinearRegression.score returns the coefficient of determination (R^2), not an
# RMSE, so it is reported under its real name and the RMSE is computed separately.
print("r2", reg_model.score(test, y_test))
print("rmse", mse(y_test, reg_model.predict(test)) ** 0.5)

In [None]:

pred = reg_model.predict(test)
print("mae = ",mean_absolute_error(y_test,pred))

In [None]:
# Give the predictions the same row labels as y_test: concat(axis=1) aligns on
# the index, so a default 0..n-1 index would mis-pair rows (and add NaN rows)
# whenever `test` has been filtered.
pred = pd.DataFrame(pred, index=y_test.index)
check = pd.concat([y_test,pred],axis=1)
check = check.sort_values(by='TT')
# Predicted travel time (column 0) against the true travel time, ordered by TT.
plt.figure(figsize=(10,10))
plt.plot(check['TT'],check[0])


# Random Forest Model

In [None]:
#Xtrain,Xtest,ytrain,ytest = train_test_split(X_train,y_train, train_size = 0.6, random_state=42,shuffle=False) 

In [None]:
# forest_model = RandomForestRegressor(random_state=1,oob_score=True,)
# forest_model.fit(X_train, y_train)
# #print("yes")

In [None]:
# Random forest with a fixed seed and out-of-bag scoring, trained on the first
# 18000 training rows only.
forest_model = RandomForestRegressor(oob_score=True, random_state=1)
forest_model.fit(X_train.iloc[:18000], y_train.iloc[:18000])

In [None]:
pred = forest_model.predict(test)
print("Mean absolute error obtained is:",mean_absolute_error(y_test, pred))

In [None]:
#dfInput['ETT']=pred

In [None]:
#dfInput

In [None]:
dfGroundTruth

# For Evaluation
Function for evaluation
  2. Function arguments:
    
    a. df: It is a pandas dataframe that contains the data from BMTC.parquet.gzip
   
    b. dfInput: It is a pandas dataframe that contains the input from Input.csv

3. Returns:

    a. dfOutput: It is a pandas dataframe that contains the output


In [None]:
def EstimatedTravelTime(df, dfInput): # The output of this function will be evaluated
    """Entry point used for evaluation.

    Args:
        df: pandas dataframe that contains the data from BMTC.parquet.gzip.
        dfInput: pandas dataframe that contains the input from Input.csv.

    Returns:
        dfOutput: pandas dataframe that contains the output. The body is still
        the unfilled template, so an empty dataframe is returned and neither
        argument is read.
    """
    # Function body - Begins
    # Make changes here.  
                              
    dfOutput = pd.DataFrame()


    # Function body - Ends
    return dfOutput

In [None]:
dfOutput = EstimatedTravelTime(df, dfInput)

In [None]:
bus_df= pd.read_parquet('/content/drive/MyDrive/Data/BMTC.parquet.gzip', engine='pyarrow')
# bus_df=bus_df.drop(["BusID","Speed"],axis=1)
# source=bus_df.sample(n=400,replace=True,random_state=1)

# source.rename(columns = {'Latitude':'Source_Lat',"Longitude":"Source_Long","Timestamp":"t1"}, inplace = True)
# source.reset_index(inplace = True)
# destination=bus_df.sample(n=400,random_state=2)
# destination.reset_index(inplace = True)
# destination.rename(columns = {'Latitude':'Dest_Lat',"Longitude":"Dest_Long","Timestamp":"t2"}, inplace = True)
# merge_df=pd.concat([source,destination],axis=1)
#merge_df=pd.merge(source,destination,axis=1,)

In [None]:
#bus_df=bus_df.groupby('BusID')

In [None]:
# merge_df.columns
# merge_df["Distance"]=haversine(merge_df['Dest_Long'],merge_df['Dest_Lat'],merge_df['Source_Long'],merge_df['Source_Lat'])
# time=merge_df
# merge_df["Duration"]=

In [None]:
bus_df.drop("Speed",axis=1,inplace=True)

In [None]:

# Draw two rows per bus. The draw is seeded so a re-run gives the same pairs,
# and buses with fewer than two rows contribute nothing instead of making
# sample(2) raise ValueError.
bus_df = (
    bus_df.groupby('BusID')
    .apply(lambda x: x.sample(2, random_state=1) if len(x) >= 2 else x.iloc[:0])
    .reset_index(drop=True)
)

bus_df.rename(columns = {'Latitude':'Source_Lat', 'Longitude':'Source_Long',"Timestamp":"t1"}, inplace = True) 

# Place each row next to the previous one; after this, every second row holds
# the two samples of one bus side by side (source columns and dest columns).
shifted_df=bus_df.shift(1)
shifted_df.rename(columns = {'Source_Lat':'Dest_Lat', 'Source_Long':'Dest_Long',"t1":"t2"}, inplace = True)
final_df=pd.concat([bus_df,shifted_df], axis=1)


In [None]:
final_df=final_df.iloc[1::2].reset_index()

In [None]:

# Absolute time gap between the two sampled rows of each bus.
time=abs(final_df["t2"]-final_df["t1"])

final_df["Distance"]=haversine(final_df['Dest_Long'],final_df['Dest_Lat'],final_df['Source_Long'],final_df['Source_Lat'])
# Minutes. total_seconds() includes whole days; Timedelta.seconds would only
# give the within-day remainder and understate gaps longer than 24 hours.
final_df["Duration"]=time.dt.total_seconds()/60

In [None]:
# Keep only the coordinate, Distance and Duration columns; dropping the
# "BusID" label removes every column carrying that name.
helper_columns = ["index", "BusID", "t1", "t2"]
test_df = final_df.drop(columns=helper_columns)

In [None]:
final_df

In [None]:
test_df

In [None]:
test_df["speed"]=test_df["Distance"]/test_df["Duration"] #speed in km/min

In [None]:
test_df

In [None]:
#test_df=test_df.sort_values(by=["speed"],axis=0,ascending=False,na_position="last",ignore_index=True)

In [None]:
test_df=test_df[test_df["speed"]>0.02] #0.02km/min =1.2km/hr

In [None]:
test_df.reset_index()

In [None]:
bus_df= pd.read_parquet('/content/drive/MyDrive/Data/BMTC.parquet.gzip', engine='pyarrow')

In [None]:
g=bus_df.groupby('BusID')

In [None]:
import random 
# random.sample(range(1, 50), 7)

In [None]:
random_df=g.nth([100,600])

# AutoML

In [None]:
#!apt install -y build-essential swig curl

In [None]:
# !pip install auto-sklearn

In [None]:
# Install packages
# !curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
# !pip install auto-sklearn

In [None]:
# from autosklearn.regression import AutoSklearnRegressor
# # Create the AutoSklearnRegessor
# sklearn = AutoSklearnRegressor(time_left_for_this_task=360,per_run_time_limit=30,memory_limit=5000,n_jobs=-1)
# # Fit the training data
# sklearn.fit(Xtrain, ytrain)
# # Sprint Statistics
# print(sklearn.sprint_statistics())
# # Predict the validation data
# pred_sklearn = sklearn.predict(X_test)
# # Compute the RMSE
# rmse_sklearn=MSE(y_test, pred_sklearn)**0.5
# print('RMSE: ' + str(rmse_sklearn))

In [None]:
# sklearn.get_models_with_weights()

In [None]:
# sklearn.leaderboard()

In [None]:
# from autosklearn.regression import AutoSklearnRegressor
# # Create the AutoSklearnRegessor
# sklearn = AutoSklearnRegressor(time_left_for_this_task=360,per_run_time_limit=45,memory_limit=5500,n_jobs=-1)
# # Fit the training data
# sklearn.fit(Xtrain, ytrain)
# # Sprint Statistics
# print(sklearn.sprint_statistics())
# # Predict the validation data
# pred_sklearn = sklearn.predict(X_test)
# # Compute the RMSE
# rmse_sklearn=MSE(y_test, pred_sklearn)**0.5
# print('RMSE: ' + str(rmse_sklearn))

In [None]:
# sklearn.leaderboard()

In [None]:
# from autosklearn.regression import AutoSklearnRegressor
# # Create the AutoSklearnRegessor
# sklearn = AutoSklearnRegressor(time_left_for_this_task=36000,per_run_time_limit=3600,memory_limit=5000,n_jobs=-1)
# # Fit the training data
# sklearn.fit(Xtrain, ytrain)
# # Sprint Statistics
# print(sklearn.sprint_statistics())
# # Predict the validation data
# pred_sklearn = sklearn.predict(X_test)
# # Compute the RMSE
# rmse_sklearn=MSE(y_test, pred_sklearn)**0.5
# print('RMSE: ' + str(rmse_sklearn))

In [None]:
# import matplotlib.pyplot as plt 
# # Scatter plot true and predicted values
# plt.scatter(pred_sklearn, y_val, alpha=0.2)
# plt.xlabel('predicted')
# plt.ylabel('true value')