In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading data

In [3]:
df = pd.read_parquet('/content/drive/MyDrive/Data/BMTC.parquet.gzip', engine='pyarrow') # This command loads BMTC data into a dataframe. 
                                                                      # In case of error, install pyarrow using: 
                                                                      # pip install pyarrow
dfInput = pd.read_csv('/content/drive/MyDrive/Data/Input.csv')
dfGroundTruth = pd.read_csv('/content/drive/MyDrive/Data/GroundTruth.csv')

# EDA

In [4]:
g1=df.groupby('BusID')
unique=df.BusID.unique()

In [5]:
d1 = df.drop_duplicates(subset=['Latitude','Longitude','Speed'],keep=("first"),inplace=False)
d2 = df.drop_duplicates(subset=['Latitude','Longitude','Speed'],keep=("last"),inplace=False)
d3 = pd.concat([d1,d2.loc[set(d2.index) - set(d1.index)]])

In [6]:
import numpy as np
#haversine
from numpy import radians, cos, sin, arcsin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """

    #Convert decimal degrees to Radians:
    lon1 = np.radians(lon1.values)
    lat1 = np.radians(lat1.values)
    lon2 = np.radians(lon2.values)
    lat2 = np.radians(lat2.values)

    #Implementing Haversine Formula: 
    dlon = np.subtract(lon2, lon1)
    dlat = np.subtract(lat2, lat1)

    a = np.add(np.power(np.sin(np.divide(dlat, 2)), 2),  
                          np.multiply(np.cos(lat1), 
                                      np.multiply(np.cos(lat2), 
                                                  np.power(np.sin(np.divide(dlon, 2)), 2))))
    c = np.multiply(2, np.arcsin(np.sqrt(a)))
    r = 6371

    return c*r

In [7]:
from datetime import datetime
def time_delta(initial,final):
                       # Now
  final1=datetime(final)
  initial1=datetime(initial)
  duration = final1-initial1                        # For build-in functions
  duration_in_s = duration.total_seconds()
        # Total number of seconds between dates
  return duration_in_s

In [8]:
d4=d3.loc[:,["Latitude",'Longitude']]
d7=d3.shift(1)
d5=d4.shift(periods=1, freq=None, axis=0)
d5.rename(columns = {'Latitude':'Source_Lat', 'Longitude':'Source_Long'}, inplace = True)
d4.rename(columns = {'Latitude':'Dest_Lat', 'Longitude':'Dest_Long'}, inplace = True)
d6=pd.concat([d5,d4], axis=1)
distance=haversine(d4['Dest_Long'],d4['Dest_Lat'],d5['Source_Long'],d5['Source_Lat'])
time=d3['Timestamp']-d7['Timestamp']
time=time.apply(lambda x: x.seconds/60)
d6['Distance']=distance
d6['Duration']=time


# speed=d6['Distance']/d6['Duration']
# d6['Speed']=speed




In [9]:

d6.columns

d6.columns

Index(['Source_Lat', 'Source_Long', 'Dest_Lat', 'Dest_Long', 'Distance',
       'Duration'],
      dtype='object')

In [10]:
d6.head()

Unnamed: 0,Source_Lat,Source_Long,Dest_Lat,Dest_Long,Distance,Duration
0,,,13.074558,77.445549,,
152,13.074558,77.445549,13.074558,77.445549,0.0,25.233333
153,13.074558,77.445549,13.074113,77.445282,0.057313,0.166667
154,13.074113,77.445282,13.07406,77.445267,0.006113,0.166667
173,13.07406,77.445267,13.07406,77.445267,0.0,3.15


In [11]:
d6.replace([np.inf, -np.inf], np.nan, inplace=True)
d6.drop(d6.tail(2).index,
        inplace = True)
d6.drop(d6.head(2).index,
        inplace = True)

pd.set_option('mode.use_inf_as_na', True)
d6.dropna(how='any', inplace=True)
# check = d6[d6.isna().any(axis=1)]
# check
# yy=d6['Duration']
# d6.drop(['Duration'],axis=1)

In [12]:
d6

Unnamed: 0,Source_Lat,Source_Long,Dest_Lat,Dest_Long,Distance,Duration
153,13.074558,77.445549,13.074113,77.445282,0.057313,0.166667
154,13.074113,77.445282,13.074060,77.445267,0.006113,0.166667
173,13.074060,77.445267,13.074060,77.445267,0.000000,3.150000
174,13.074060,77.445267,13.074060,77.445267,0.000000,0.166667
175,13.074060,77.445267,13.074060,77.445267,0.000000,0.166667
...,...,...,...,...,...,...
3145719,12.916972,77.586678,13.019111,77.501274,14.650288,300.516667
1572856,13.019111,77.501274,12.916972,77.586678,14.650288,1139.800000
1572857,12.916972,77.586678,12.916972,77.586678,0.000000,0.166667
1572858,12.916972,77.586678,12.916972,77.586678,0.000000,0.166667


In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
X_train=d6.drop(['Duration'],axis=1)

In [15]:
X_train

Unnamed: 0,Source_Lat,Source_Long,Dest_Lat,Dest_Long,Distance
153,13.074558,77.445549,13.074113,77.445282,0.057313
154,13.074113,77.445282,13.074060,77.445267,0.006113
173,13.074060,77.445267,13.074060,77.445267,0.000000
174,13.074060,77.445267,13.074060,77.445267,0.000000
175,13.074060,77.445267,13.074060,77.445267,0.000000
...,...,...,...,...,...
3145719,12.916972,77.586678,13.019111,77.501274,14.650288
1572856,13.019111,77.501274,12.916972,77.586678,14.650288
1572857,12.916972,77.586678,12.916972,77.586678,0.000000
1572858,12.916972,77.586678,12.916972,77.586678,0.000000


In [16]:
y_train=d6['Duration']

In [17]:
y_train

153           0.166667
154           0.166667
173           3.150000
174           0.166667
175           0.166667
              ...     
3145719     300.516667
1572856    1139.800000
1572857       0.166667
1572858       0.166667
3670005     299.700000
Name: Duration, Length: 4434304, dtype: float64

In [18]:
test_df=pd.read_csv('/content/drive/MyDrive/Data/Input.csv')
distance1=haversine(test_df['Dest_Long'],test_df['Dest_Lat'],test_df['Source_Long'],test_df['Source_Lat'])
test_df['Distance']=distance1
print(test_df.columns)
test_df.drop("Unnamed: 0",axis=1,inplace=True)

X_test=test_df

Index(['Unnamed: 0', 'Source_Lat', 'Source_Long', 'Dest_Lat', 'Dest_Long',
       'Distance'],
      dtype='object')


In [19]:
X_test
X = X_test['Distance'].isna()
c=0
for i in X:
  if i :
    print(X_test[c])
  c+=1
print(c)


1205


In [20]:
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
# X_test.drop(X_test.tail(2).index,
#         inplace = True)
# X_test.drop(X_test.head(2).index,
#         inplace = True)

#pd.set_option('mode.use_inf_as_na', True)


In [21]:
y_test=pd.read_csv('/content/drive/MyDrive/Data/GroundTruth.csv')
print(y_test.columns)
y_test.drop("Unnamed: 0",axis=1,inplace=True)

Index(['Unnamed: 0', 'TT'], dtype='object')


In [22]:
X_test=pd.concat([X_test,y_test], axis=1)
X_test.dropna(how='any', inplace=True)

In [23]:
y_test=X_test['TT']
X_test.drop('TT',axis=1,inplace=True)

# Linear Regression Model

In [24]:
reg_model=LinearRegression()

In [25]:
reg_model.fit(X_train, y_train)

LinearRegression()

In [26]:
print(reg_model.score(X_test, y_test))

-185.51197284254383


# Random Forest Model

In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
import time


In [71]:
from sklearn.model_selection import train_test_split
Xtrain,Xtest,ytrain,ytest = train_test_split(X_train,y_train, train_size = 0.006, random_state=42) 

In [72]:
forest_model = RandomForestRegressor(random_state=1,oob_score=True,)
forest_model.fit(Xtrain,ytrain)
print("yes")

yes


In [73]:
melb_preds = forest_model.predict(X_test)
print(mean_absolute_error(y_test, melb_preds))

485.8630272475795
