# **Real Time Data Processing**

In [70]:
# Import required libraries
import math
import pandas as pd
import numpy as np
from datetime import datetime

In [71]:
# Load the raw GPS log (the file name suggests Digana, 2022-10 — confirm);
# the path is relative to the notebook's working directory.
real_time_data = pd.read_csv("../Raw GPS data Kandy Buses/digana_2022_10.csv")

In [72]:
# (rows, columns) of the frame as loaded.
real_time_data.shape

(978917, 6)

In [73]:
# Number of missing values in each column.
real_time_data.isnull().sum()

id            0
deviceid      0
devicetime    0
latitude      0
longitude     0
speed         0
dtype: int64

In [74]:
# Preview the first rows of the frame as loaded.
real_time_data.head()

Unnamed: 0,id,deviceid,devicetime,latitude,longitude,speed
0,1453607164,116,2022-09-30 23:56:32,8.222097,80.522697,0.0
1,1453607165,116,2022-09-30 23:56:47,8.222097,80.522697,0.0
2,1453607166,116,2022-09-30 23:57:02,8.222097,80.522697,0.0
3,1453607167,116,2022-09-30 23:57:13,8.221493,80.522667,16.1987
4,1453607168,116,2022-09-30 23:57:28,8.219942,80.522738,24.2981


# Data Preprocessing

In [75]:
# Clean the GPS log: order it by 'devicetime', discard fixes whose latitude or
# longitude is 0, keep the 2022-10-01 .. 2022-10-07 window, drop the 'id'
# column and renumber the rows.
# NOTE(review): 'devicetime' is still text at this point, so the bounds are
# compared as strings. '2022-10-07 hh:mm:ss' sorts after '2022-10-07', which
# leaves every fix from 7 October out of the window — confirm that is intended.
real_time_data = (
    real_time_data
    .sort_values(by='devicetime')
    .loc[lambda gps: (gps['latitude'] != 0) & (gps['longitude'] != 0)]
    .loc[lambda gps: gps['devicetime'].between('2022-10-01', '2022-10-07')]
    .drop(columns=['id'])
    .reset_index(drop=True)
)

In [76]:
# Preview the cleaned frame.
real_time_data.head()

Unnamed: 0,deviceid,devicetime,latitude,longitude,speed
0,116,2022-10-01 00:00:13,8.195475,80.525735,26.9979
1,116,2022-10-01 00:00:28,8.193405,80.525917,30.2376
2,116,2022-10-01 00:00:43,8.191373,80.52689,31.8575
3,116,2022-10-01 00:00:58,8.189095,80.5274,34.5572
4,116,2022-10-01 00:01:13,8.186705,80.527848,35.0972


In [77]:
# #Function finds whether the bus is inside the bus stations or not
# def isOutsideBusStation(lat, lon):
    
#     lat = math.radians(lat)
#     lon = math.radians(lon)

#     lat_Kandy = math.radians(7.292462226)
#     lon_Kandy = math.radians(80.6349778)

#     lat_Digana = math.radians(7.29896)
#     lon_Digana = math.radians(80.73472)

#     delta_lat1 = lat_Kandy - lat
#     delta_lon1 = lon_Kandy - lon

#     delta_lat2 = lat_Digana - lat
#     delta_lon2 = lon_Digana - lon

#     # Haversine formula
#     a1 = math.sin(delta_lat1/2)**2 + math.cos(lat) * math.cos(lat_Kandy) * math.sin(delta_lon1/2)**2
#     c1 = 2 * math.atan2(math.sqrt(a1), math.sqrt(1-a1))

#     a2 = math.sin(delta_lat2/2)**2 + math.cos(lat) * math.cos(lat_Digana) * math.sin(delta_lon2/2)**2
#     c2 = 2 * math.atan2(math.sqrt(a2), math.sqrt(1-a2))

#     # Radius of the Earth in kilometers (mean value)
#     R = 6371.0

#     # Calculate the distance
#     distance1 = R * c1 * 1000
#     distance2 = R * c2 * 1000

#     if distance1 < 150 or distance2 < 150:
#         return False
#     return True

In [78]:
# Work on an independent copy so later column changes do not touch real_time_data.
processed_data = real_time_data.copy()

In [79]:
# Parse 'devicetime', split it into separate 'date' and 'time' columns, then
# drop the combined column.
processed_data = (
    processed_data
    .assign(devicetime=lambda gps: pd.to_datetime(gps['devicetime']))
    .assign(
        date=lambda gps: gps['devicetime'].dt.date,
        time=lambda gps: gps['devicetime'].dt.time,
    )
    .drop(columns='devicetime')
)
processed_data.head()

Unnamed: 0,deviceid,latitude,longitude,speed,date,time
0,116,8.195475,80.525735,26.9979,2022-10-01,00:00:13
1,116,8.193405,80.525917,30.2376,2022-10-01,00:00:28
2,116,8.191373,80.52689,31.8575,2022-10-01,00:00:43
3,116,8.189095,80.5274,34.5572,2022-10-01,00:00:58
4,116,8.186705,80.527848,35.0972,2022-10-01,00:01:13


In [80]:
# Load the trip table; the path is relative to the notebook's working directory.
trip_data = pd.read_csv("./ALL_BUS/trip_all.csv")

In [81]:
# Keep trips whose 'date' lies between 2022-10-01 and 2022-10-07, both bounds inclusive.
trip_data = trip_data[trip_data['date'].between('2022-10-01', '2022-10-07')]

In [82]:
# Keep only the columns used to label GPS rows with their trip.
# .copy() detaches the result from the filtered frame it was sliced from, so
# the column reassignments done later in the notebook modify trip_data itself
# and do not raise SettingWithCopyWarning.
trip_data = trip_data[["device_id", "date", "start_time", "start_terminal", "end_time"]].copy()

In [83]:
# Preview the filtered trip table.
trip_data.head()

Unnamed: 0,device_id,date,start_time,start_terminal,end_time
1666,116,2022-10-01,08:41:39,BT02,09:35:36
1667,116,2022-10-01,09:51:51,BT01,10:42:47
1668,116,2022-10-01,13:42:11,BT02,14:30:32
1669,116,2022-10-01,15:09:08,BT01,16:00:26
1670,116,2022-10-02,07:42:11,BT02,08:22:32


In [84]:
# Bring the date and time columns of both frames to a common representation:
# 'date' becomes a pandas datetime and every time column becomes a
# datetime.time; values that do not parse are coerced instead of raising.
for _frame, _time_cols in (
    (trip_data, ['start_time', 'end_time']),
    (processed_data, ['time']),
):
    _frame['date'] = pd.to_datetime(_frame['date'], errors='coerce', format='%Y-%m-%d')
    for _col in _time_cols:
        _frame[_col] = pd.to_datetime(_frame[_col], errors='coerce', format='%H:%M:%S').dt.time


def find_start_info(row):
    """Look up the trip that a GPS row belongs to.

    A trip in the module-level ``trip_data`` frame matches when its
    ``device_id`` equals the row's ``deviceid``, its ``date`` equals the row's
    ``date`` and the row's ``time`` lies within ``[start_time, end_time]``
    (both ends inclusive).

    Returns:
        ``(start_time, start_terminal)`` of the first matching trip, or
        ``(None, None)`` when no trip matches.
    """
    same_bus = trip_data['device_id'] == row['deviceid']
    same_day = trip_data['date'] == row['date']
    under_way = (trip_data['start_time'] <= row['time']) & (trip_data['end_time'] >= row['time'])

    candidates = trip_data[same_bus & same_day & under_way]
    if candidates.empty:
        return None, None

    first_trip = candidates.iloc[0]
    return first_trip['start_time'], first_trip['start_terminal']

# Label every GPS row with the start time and start terminal of its trip;
# both values are None for rows that match no trip.
start_info = processed_data.apply(find_start_info, axis=1, result_type='expand')
processed_data[['start_time', 'start_terminal']] = start_info

# Show the labelled frame
print(processed_data)

        deviceid  latitude  longitude    speed       date      time  \
0            116  8.195475  80.525735  26.9979 2022-10-01  00:00:13   
1            116  8.193405  80.525917  30.2376 2022-10-01  00:00:28   
2            116  8.191373  80.526890  31.8575 2022-10-01  00:00:43   
3            116  8.189095  80.527400  34.5572 2022-10-01  00:00:58   
4            116  8.186705  80.527848  35.0972 2022-10-01  00:01:13   
...          ...       ...        ...      ...        ...       ...   
127994       123  7.296305  80.736062   0.0000 2022-10-06  23:53:59   
127995       250  7.283670  80.695938   0.0000 2022-10-06  23:55:06   
127996       121  7.296612  80.715932   0.0000 2022-10-06  23:56:37   
127997       275  7.292808  80.721027   0.0000 2022-10-06  23:58:41   
127998      1377  7.263868  80.700605   0.0000 2022-10-06  23:59:37   

       start_time start_terminal  
0            None           None  
1            None           None  
2            None           None  
3      

In [85]:
# Per-device trip start times. In the cells shown here, only the
# commented-out findBusStartTime helper refers to this dict.
busStartTime = {}

In [86]:
# #Function to find start time of each trip
# def findBusStartTime(row):
#     global busStartTime
 
#     if isOutsideBusStation(row['latitude'],row['longitude']):
#         if busStartTime.get(row['deviceid']) is not None:
#             return busStartTime.get(row['deviceid'])
#         else:
#             busStartTime[row['deviceid']]=row['devicetime']
#             return row['devicetime']

#     if row['deviceid'] in busStartTime:
#         busStartTime.pop(row['deviceid'])
#     return None 

In [87]:
# Preview the frame with the trip start columns added.
processed_data.head()

Unnamed: 0,deviceid,latitude,longitude,speed,date,time,start_time,start_terminal
0,116,8.195475,80.525735,26.9979,2022-10-01,00:00:13,,
1,116,8.193405,80.525917,30.2376,2022-10-01,00:00:28,,
2,116,8.191373,80.52689,31.8575,2022-10-01,00:00:43,,
3,116,8.189095,80.5274,34.5572,2022-10-01,00:00:58,,
4,116,8.186705,80.527848,35.0972,2022-10-01,00:01:13,,


In [88]:
def timeDifference(t2,t1):
    """Return the minutes elapsed from ``t1`` to ``t2``, rounded to 2 decimals.

    Both arguments are ``datetime.time`` values. They are pinned to the same
    arbitrary calendar day so that they can be subtracted. Only the
    ``seconds`` component of the resulting timedelta is used, so microseconds
    are ignored and a ``t2`` earlier than ``t1`` yields the wrapped-around
    positive gap (as if ``t2`` fell on the following day), never a negative
    number.
    """
    anchor = datetime(2022, 1, 1)
    elapsed = datetime.combine(anchor, t2) - datetime.combine(anchor, t1)
    return round(elapsed.seconds / 60, 2)

In [89]:
def findTravelTime(row):
    """Minutes elapsed between the row's ``start_time`` and its ``time``.

    Returns None when ``start_time`` is None, i.e. the row was not matched
    to a trip.
    """
    trip_start = row['start_time']
    if trip_start is None:
        return None
    return timeDifference(row['time'], trip_start)

In [90]:
# Minutes since the trip start for every GPS row (missing for rows outside a trip).
processed_data = processed_data.assign(
    travel_time=processed_data.apply(findTravelTime, axis=1)
)

In [91]:
# State shared by findDwellTime across rows, keyed by deviceid:
# accumulated dwell time (minutes) of the trip in progress ...
busDwellTime = {}
# ... and the time of the previous in-trip row seen for that device.
previousTimeStamp = {}

In [92]:
def findDwellTime(row):
    """Return the dwell time (minutes) accumulated so far for the row's device.

    The function is stateful: it reads and updates the module-level
    ``busDwellTime`` and ``previousTimeStamp`` dicts (keyed by ``deviceid``),
    so the result depends on the order in which rows are passed in.

    * Row outside a trip (``start_time`` is None): the device's state is
      discarded and None is returned.
    * Row inside a trip with ``speed == 0``: the gap between the device's
      previous in-trip row and this row is added to its running total
      (the total starts at 0 when there is no previous row).
    * Row inside a trip with any other speed: the running total is left as
      it is (initialised to 0 if absent).

    NOTE(review): the state is only cleared by a row outside a trip, so two
    trips of one device with no such row in between would share one total.
    """
    if row['start_time'] is None:
        device = row['deviceid']
        busDwellTime.pop(device, None)
        previousTimeStamp.pop(device, None)
        return None

    device = row['deviceid']
    if row['speed'] == 0:
        if busDwellTime.get(device) is not None:
            gap = timeDifference(row['time'], previousTimeStamp[device])
            busDwellTime[device] = busDwellTime[device] + gap
        elif previousTimeStamp.get(device) is not None:
            busDwellTime[device] = timeDifference(row['time'], previousTimeStamp[device])
        else:
            busDwellTime[device] = 0
    elif busDwellTime.get(device) is None:
        busDwellTime[device] = 0

    # Remember this row's time for the next row of the same device.
    previousTimeStamp[device] = row['time']
    return busDwellTime[device]

In [93]:
# findDwellTime accumulates into the module-level busDwellTime and
# previousTimeStamp dicts. Empty them first so that re-running this cell does
# not start from the state left behind by an earlier run.
busDwellTime.clear()
previousTimeStamp.clear()
processed_data['dwell_time'] = processed_data.apply(findDwellTime,axis=1)

In [96]:
def findSITR(row):
    """Return ``dwell_time / travel_time`` for a row, rounded to 3 decimals.

    Args:
        row: mapping or Series with ``travel_time`` and ``dwell_time``.

    Returns:
        * NaN when either value is missing (None or NaN). pandas stores
          missing numbers as NaN, which an ``is not None`` check lets
          through, so missing values are tested with ``pd.isna`` instead.
        * 0 when ``travel_time`` is 0 (avoids dividing by zero).
        * the rounded ratio otherwise.
    """
    travel_time = row['travel_time']
    dwell_time = row['dwell_time']

    if pd.isna(travel_time) or pd.isna(dwell_time):
        return float('nan')
    if travel_time == 0:
        return 0
    return round(dwell_time / travel_time, 3)

In [97]:
# Ratio of dwell_time to travel_time for every row.
processed_data = processed_data.assign(
    SITR=processed_data.apply(findSITR, axis=1)
)

In [101]:
# Hour component (0-23) of each row's time.
processed_data['hour_of_the_day'] = processed_data['time'].map(lambda moment: moment.hour)

In [98]:
# Keep every row whose SITR is null (rows outside a trip) and, among the
# remaining rows, only those with SITR <= 1, travel_time <= 90 and
# dwell_time <= 30 (both times are in minutes).
processed_data = processed_data.loc[
    lambda rows: rows['SITR'].isna()
    | (
        (rows['SITR'] <= 1)
        & (rows['travel_time'] <= 90)
        & (rows['dwell_time'] <= 30)
    )
]

In [99]:
# Number of missing values in each column after filtering.
processed_data.isnull().sum()

deviceid              0
latitude              0
longitude             0
speed                 0
date                  0
time                  0
start_time        53110
start_terminal    53110
travel_time       53110
dwell_time        53110
SITR              53110
dtype: int64

In [100]:
# Write the processed frame to the working directory, without the index column.
processed_data.to_csv('processed_data.csv',index=False)

In [102]:
# Independent copy holding only the rows that were matched to a trip
# (start_time present), followed by a preview.
p = processed_data[processed_data["start_time"].notna()].copy()
p.head()

Unnamed: 0,deviceid,latitude,longitude,speed,date,time,start_time,start_terminal,travel_time,dwell_time,SITR,hour_of_the_day
904,123,7.297508,80.732563,9.71923,2022-10-01,04:03:26,04:03:26,BT02,0.0,0.0,0.0,4
906,123,7.297348,80.731287,20.5184,2022-10-01,04:03:41,04:03:26,BT02,0.25,0.0,0.0,4
909,123,7.296578,80.730488,15.6588,2022-10-01,04:03:56,04:03:26,BT02,0.5,0.0,0.0,4
910,123,7.29558,80.729657,18.3585,2022-10-01,04:04:11,04:03:26,BT02,0.75,0.0,0.0,4
911,123,7.29477,80.728355,21.0583,2022-10-01,04:04:26,04:03:26,BT02,1.0,0.0,0.0,4
