# **Real Time Data Processing**

In [1]:
#import reqired libraries
import math
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
real_time_data = pd.read_csv("../Raw GPS data Kandy Buses/digana_2022_10.csv")

In [3]:
real_time_data.shape

(978917, 6)

In [4]:
real_time_data.isnull().sum()

id            0
deviceid      0
devicetime    0
latitude      0
longitude     0
speed         0
dtype: int64

In [5]:
real_time_data.head()

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed
0,1453607164,116,2022-09-30 23:56:32,8.222097,80.522697,0.0
1,1453607165,116,2022-09-30 23:56:47,8.222097,80.522697,0.0
2,1453607166,116,2022-09-30 23:57:02,8.222097,80.522697,0.0
3,1453607167,116,2022-09-30 23:57:13,8.221493,80.522667,16.1987
4,1453607168,116,2022-09-30 23:57:28,8.219942,80.522738,24.2981


# Data Preprocessing

In [6]:
real_time_data.sort_values(by='devicetime', inplace=True)
real_time_data = real_time_data[real_time_data['latitude'] != 0]
real_time_data = real_time_data[real_time_data['longitude'] != 0]
real_time_data = real_time_data[real_time_data['devicetime'] >=  '2022-10-01']
real_time_data = real_time_data[real_time_data['devicetime'] <= '2022-10-31']
real_time_data.drop(columns=['id'], inplace=True)
real_time_data.reset_index(drop = True, inplace = True)

In [7]:
real_time_data.head()

Unnamed: 0,deviceid,devicetime,latitude,longitude,speed
0,116,2022-10-01 00:00:13,8.195475,80.525735,26.9979
1,116,2022-10-01 00:00:28,8.193405,80.525917,30.2376
2,116,2022-10-01 00:00:43,8.191373,80.52689,31.8575
3,116,2022-10-01 00:00:58,8.189095,80.5274,34.5572
4,116,2022-10-01 00:01:13,8.186705,80.527848,35.0972


In [8]:
#Function finds whether the bus is inside the bus stations or not
def isOutsideBusStation(lat, lon):
    
    lat = math.radians(lat)
    lon = math.radians(lon)

    lat_Kandy = math.radians(7.292462226)
    lon_Kandy = math.radians(80.6349778)

    lat_Digana = math.radians(7.29896)
    lon_Digana = math.radians(80.73472)

    delta_lat1 = lat_Kandy - lat
    delta_lon1 = lon_Kandy - lon

    delta_lat2 = lat_Digana - lat
    delta_lon2 = lon_Digana - lon

    # Haversine formula
    a1 = math.sin(delta_lat1/2)**2 + math.cos(lat) * math.cos(lat_Kandy) * math.sin(delta_lon1/2)**2
    c1 = 2 * math.atan2(math.sqrt(a1), math.sqrt(1-a1))

    a2 = math.sin(delta_lat2/2)**2 + math.cos(lat) * math.cos(lat_Digana) * math.sin(delta_lon2/2)**2
    c2 = 2 * math.atan2(math.sqrt(a2), math.sqrt(1-a2))

    # Radius of the Earth in kilometers (mean value)
    R = 6371.0

    # Calculate the distance
    distance1 = R * c1 * 1000
    distance2 = R * c2 * 1000

    if distance1 < 150 or distance2 < 150:
        return False
    return True

In [9]:
processed_data = real_time_data.copy()
#processed_data['startTime'] = None

In [10]:
busStartTime = dict()

In [11]:
#Function to find start time of each trip
def findBusStartTime(row):
    global busStartTime
 
    if isOutsideBusStation(row['latitude'],row['longitude']):
        if busStartTime.get(row['deviceid']) is not None:
            return busStartTime.get(row['deviceid'])
        else:
            busStartTime[row['deviceid']]=row['devicetime']
            return row['devicetime']

    if row['deviceid'] in busStartTime:
        busStartTime.pop(row['deviceid'])
    return None 

In [12]:
processed_data['startTime'] = processed_data.apply(findBusStartTime,axis=1)

In [13]:
processed_data.head()

Unnamed: 0,deviceid,devicetime,latitude,longitude,speed,startTime
0,116,2022-10-01 00:00:13,8.195475,80.525735,26.9979,2022-10-01 00:00:13
1,116,2022-10-01 00:00:28,8.193405,80.525917,30.2376,2022-10-01 00:00:13
2,116,2022-10-01 00:00:43,8.191373,80.52689,31.8575,2022-10-01 00:00:13
3,116,2022-10-01 00:00:58,8.189095,80.5274,34.5572,2022-10-01 00:00:13
4,116,2022-10-01 00:01:13,8.186705,80.527848,35.0972,2022-10-01 00:00:13


In [14]:
def timeDifference(t2,t1):

    format_str = "%Y-%m-%d %H:%M:%S"
    t1_datetime = datetime.strptime(t1, format_str)
    t2_datetime = datetime.strptime(t2, format_str)

    timediff = ((t2_datetime - t1_datetime).total_seconds())
    return round((timediff/60),2)

In [15]:
busDwellTime = dict()
previousTimeStamp = dict()

In [16]:
def findDwellTime(row):
    global busDwellTime
    global previousTimeStamp
    
    if isOutsideBusStation(row['latitude'],row['longitude']):
        if row['speed'] == 0:

            if busDwellTime.get(row['deviceid']) is not None:
                busDwellTime[row['deviceid']]=busDwellTime[row['deviceid']]+timeDifference(row['devicetime'],previousTimeStamp[row['deviceid']])

            elif previousTimeStamp.get(row['deviceid']) is not None:
                busDwellTime[row['deviceid']]=timeDifference(row['devicetime'],previousTimeStamp[row['deviceid']])
                
            else:
                busDwellTime[row['deviceid']]=0

        elif busDwellTime.get(row['deviceid']) is None:
            busDwellTime[row['deviceid']]=0
    
        previousTimeStamp[row['deviceid']] = row['devicetime']
        return busDwellTime[row['deviceid']]

    else:
        if row['deviceid'] in busDwellTime:
            busDwellTime.pop(row['deviceid'])
        if row['deviceid'] in previousTimeStamp:
            previousTimeStamp.pop(row['deviceid'])
        return None

In [17]:
processed_data['dwellTime'] = processed_data.apply(findDwellTime,axis=1)

In [18]:
processed_data.head()

Unnamed: 0,deviceid,devicetime,latitude,longitude,speed,startTime,dwellTime
0,116,2022-10-01 00:00:13,8.195475,80.525735,26.9979,2022-10-01 00:00:13,0.0
1,116,2022-10-01 00:00:28,8.193405,80.525917,30.2376,2022-10-01 00:00:13,0.0
2,116,2022-10-01 00:00:43,8.191373,80.52689,31.8575,2022-10-01 00:00:13,0.0
3,116,2022-10-01 00:00:58,8.189095,80.5274,34.5572,2022-10-01 00:00:13,0.0
4,116,2022-10-01 00:01:13,8.186705,80.527848,35.0972,2022-10-01 00:00:13,0.0


In [19]:
def findTravelTime(row):
    if row['startTime'] is not None:
        return timeDifference(row['devicetime'],row['startTime'])
    return None

In [20]:
processed_data['travelTime'] = processed_data.apply(findTravelTime,axis=1)

In [21]:
processed_data.head()

Unnamed: 0,deviceid,devicetime,latitude,longitude,speed,startTime,dwellTime,travelTime
0,116,2022-10-01 00:00:13,8.195475,80.525735,26.9979,2022-10-01 00:00:13,0.0,0.0
1,116,2022-10-01 00:00:28,8.193405,80.525917,30.2376,2022-10-01 00:00:13,0.0,0.25
2,116,2022-10-01 00:00:43,8.191373,80.52689,31.8575,2022-10-01 00:00:13,0.0,0.5
3,116,2022-10-01 00:00:58,8.189095,80.5274,34.5572,2022-10-01 00:00:13,0.0,0.75
4,116,2022-10-01 00:01:13,8.186705,80.527848,35.0972,2022-10-01 00:00:13,0.0,1.0


In [22]:
def findSITR(row):
    if row['travelTime'] is not None and row['travelTime']!=0:
        return round((row['dwellTime']/row['travelTime']),3)
    return 0

In [23]:
processed_data["SITR"] = processed_data.apply(findSITR,axis=1)

In [24]:
processed_data = processed_data[(processed_data['SITR']<=1) | (pd.isnull(processed_data['SITR']))]
processed_data = processed_data[(processed_data['travelTime']<=90) | (pd.isnull(processed_data['SITR']))]
processed_data = processed_data[(processed_data['dwellTime']<=30) | (pd.isnull(processed_data['SITR']))]

In [26]:
processed_data.isnull().sum()

deviceid          0
devicetime        0
latitude          0
longitude         0
speed             0
startTime     89520
dwellTime     89520
travelTime    89520
SITR          89520
dtype: int64

In [27]:
processed_data.to_csv('processed_data.csv',index=False)