In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from shapely.geometry import Point
from shapely.geometry import LineString
from shapely import wkt
from datetime import datetime, timedelta
# from Data_Preprocessing.Bus_Routes.generate_routes import transform_coordinates

pd.set_option('display.max_columns', 500)

UNASSIGNED = -1
MIN_NUM_OF_PINGS = 5 # before that we can't try to conclude on a drive or close the drive
MIN_VALID_PINGS = 5 # minimum number of valid pings to assign a drive
MAX_PINGS_ERROR = 50 # max allowed avg distance from route-linestring (meters)
MAX_SINGLE_PING_ERROR = 100 # max distance to line a ping can have for it to be considered as valid.
MAX_TIME_BETWEEN_PINGS = 300 # seconds
MAX_UNASSIGNED_PINGS = 50
MAX_DISTANCE_TO_END_STOP = 50 # meters

In [2]:
# def adjust_stream_data(stream):
#     stream["timestamp"] = pd.to_datetime(stream["timestamp"]).apply(lambda elem:elem.replace(second=0))
#     stream["point"] = stream.apply(
#         lambda row: transform_coordinates(row["longitude"],row["latitude"]),axis=1)
#     return stream

stream = pd.read_csv("..//Data//Samples//3days3lines_adjusted_small.csv",dtype={"lineId":str,"vehicleId":str})
stream["timestamp"] = pd.to_datetime(stream["timestamp"])
stream["round_timestamp"] = stream["timestamp"].apply(lambda elem:elem.replace(second=0))
stream["point"] = stream["point"].apply(lambda elem : wkt.loads(elem))
# stream.sort_values(by=["timestamp"],ascending=True)
stream

Unnamed: 0,longitude,latitude,lineId,timestamp,journeyPatternId,vehicleId,point,round_timestamp
0,-6.367692,53.353845,25,2017-07-03 10:56:37,025B0002,33401,POINT (708666.7984612143 734978.8023896469),2017-07-03 10:56:00
1,-6.260080,53.346720,25,2017-07-03 10:56:40,025A0002,33352,POINT (715849.5037407882 734355.274681642),2017-07-03 10:56:00
2,-6.295959,53.325905,150,2017-07-03 10:56:40,01501001,33493,POINT (713516.0332653577 731981.8164871124),2017-07-03 10:56:00
3,-6.316828,53.316637,150,2017-07-03 10:56:50,01500001,33518,POINT (712150.2444403638 730917.7067627693),2017-07-03 10:56:00
4,-6.256600,53.348150,25,2017-07-03 10:56:53,025B1002,33364,POINT (716077.3078691751 734520.0255593333),2017-07-03 10:56:00
...,...,...,...,...,...,...,...,...
121810,-6.265445,53.345667,25,2017-07-06 12:37:08,00250002,33607,POINT (715495.1622603569 734229.4286031137),2017-07-06 12:37:00
121811,-6.432953,53.346075,25,2017-07-06 12:37:08,025A0002,33363,POINT (704341.5031536606 734016.9641608861),2017-07-06 12:37:00
121812,-6.255930,53.347284,150,2017-07-06 12:37:09,01500001,33495,POINT (716124.2668647266 734424.7655766791),2017-07-06 12:37:00
121813,-6.360542,53.353103,25,2017-07-06 12:37:11,025B1002,33405,POINT (709144.6503379249 734907.1537602763),2017-07-06 12:37:00


In [5]:
class Route:
# Represents a general route of a bus line
    def __init__(self,route_df):
        self.direction_ = route_df.iloc[0]["direction_id"]
        self.routeId_ = route_df.iloc[0]["route_id"]
        self.line_ = route_df.iloc[0]["route_short_name"]
        self.route_ = route_df[["stop_sequence","stop_id","stop_point","dist_traveled"]]
        self.linestring_ = LineString(list(self.route_["stop_point"]))
        self.stops_dist_from_beg_ = [self.linestring_.project(Point(coords)) for coords in self.linestring_.coords]
#         print(self.line_,self.direction_)
    
    def getProjectedPoints(self,points):
        proj_points = defaultdict(lambda :list())
        for point in points:
            proj_points["dist_to_line"].append(self.linestring_.distance(point))
            proj_points["dist_traveled"].append(self.linestring_.project(point))
            proj_points["proj_point"].append(self.linestring_.interpolate(self.linestring_.project(point)))
        return proj_points
    
    # Score - the lower the better
    def getDriveRouteScore(self,points,last_valid_ping):
        proj_points = self.getProjectedPoints(points)
        valid_signal = self.getValidPings(points,last_valid_ping["point"])
        if sum(valid_signal) < MIN_VALID_PINGS:
            return np.inf, None
        score = np.array(proj_points["dist_to_line"]).mean()
        return score, valid_signal
    
    def getValidPings(self,points,last_valid_point):
        proj_points = self.getProjectedPoints(points)
        last_valid_ping_proj_dist_traveled = self.getProjectedPoints([last_valid_point])["dist_traveled"][0]
#         print("init point: ",last_valid_ping_proj_dist_traveled)
        valid_signal = []
        for i in range(len(points)):
            if proj_points["dist_traveled"][i] >= last_valid_ping_proj_dist_traveled and proj_points["dist_to_line"][i] < MAX_SINGLE_PING_ERROR:
                valid_signal.append(1)
                last_valid_ping_proj_dist_traveled = proj_points["dist_traveled"][i]
            else:
                valid_signal.append(0)
#             print(proj_points["dist_traveled"][i],proj_points["dist_to_line"][i],valid_signal[-1])
        return valid_signal
        
    def getNextStop(self,point):
        proj_point = self.getProjectedPoints([point])
        return np.searchsorted(self.stops_dist_from_beg_, proj_point["dist_traveled"][0], side='right') + 1
    
    def isFinalStop(self,point):
        next_stop_num = self.getNextStop(point)
        # redundant check
        if next_stop_num < 1 or next_stop_num > len(self.linestring_.coords) + 1:
            print(next_stop_num,len(self.linestring_.coords))
            raise Exception("Stop number doesn't exists: {}.".format(next_stop_num))
        return next_stop_num == len(self.linestring_.coords) or next_stop_num == len(self.linestring_.coords) + 1 or point.distance(Point(self.linestring_.coords[-1])) < MAX_DISTANCE_TO_END_STOP
        
    
    
class Routes:
    def __init__(self,routes_filename):
        df_trips = pd.read_csv(routes_filename)
        df_trips["stop_point"] = df_trips["stop_point"].apply(lambda elem : wkt.loads(elem))
        self.routes_ = defaultdict(lambda :list()) # key: line, value: list of routes, on per direction.
        # each route has a single line and 1 or 2 direction.
        for attrs, trip_df in df_trips.groupby(["route_id","route_short_name","direction_id"]):
            (route_id,line,direction) = attrs
            self.routes_[line].append(Route(trip_df))
        print(list(self.routes_.keys()))
        
    def __getitem__(self, line):
        return self.routes_[line]
        

class BusDrive:
    num_full_drive = 1
    df_full_drives = pd.DataFrame(columns=["sample_number","route_id","ping_number","timestamp","dist_traveled","dist_to_line"])
# Index(['index', 'journeyPatternId', 'latitude', 'lineId', 'longitude', 'point',
#        'round_timestamp', 'timestamp', 'valid_ping', 'vehicleId',
#        'sample_number']
    def __init__(self,routes,pings_df,save_full_drive):
        self.routes_ = routes
        self.line_ = pings_df.iloc[0]["lineId"]
        self.vehicle_ = pings_df.iloc[0]["vehicleId"]
        self.pings_df_ = pd.DataFrame(columns=pings_df.columns)
        self.route_ = UNASSIGNED
        self.route_score_ = UNASSIGNED
        self.last_ping_ = None
        self.last_valid_ping_ = None
        self.addPings(pings_df)
        self.save_full_drive_ = save_full_drive
        
    def __len__(self):
        return self.pings_df_.shape[0]
    
    def lastActivity(self):
        return self.last_ping_["timestamp"] 
    
#     def lastValidActivity(self):
#         return self.last_ping_["timestamp"] 
    
    def addPings(self,pings_df):
        if not pings_df[(pings_df["lineId"]!=self.line_)|(pings_df["vehicleId"]!=self.vehicle_)].empty:
            raise Exception("""Bus drive {}, {} got wrong pings.
                            {}""".format(self.line_,self.vehicle_,pings_df[(pings_df["lineId"]!=self.line_)|(pings_df["vehicleId"]!=self.vehicle_)]))
        
        pings_df.sort_values(by=["timestamp"],ascending=True,inplace=True)
        pings_df.reset_index(inplace=True)
        
        if self.last_valid_ping_ is None or self.last_ping_ is None: # just for the first time
            self.last_valid_ping_ = pings_df.iloc[0] # first ping
            self.last_ping_ = pings_df.iloc[pings_df.shape[0]-1] # last ping
            
        pings_df = pings_df[pings_df["timestamp"] >= self.last_ping_["timestamp"]] # filter all the pings who was before the last ping
        self.last_ping_ = pings_df.iloc[pings_df.shape[0]-1] # last ping
        
        if self.route_ == UNASSIGNED:
            self.pings_df_ = self.pings_df_.append(pings_df)
            self.identifyPath()
        else:
            pings_df["valid_ping"] = self.route_.getValidPings(pings_df["point"].values,self.last_valid_ping_["point"])
            self.last_valid_ping_ = self.getLastValidPing(pings_df)
            self.pings_df_ = self.pings_df_.append(pings_df)
            
    def getLastValidPing(self,pings_df):
        temp = pings_df[pings_df["valid_ping"]==1]
        if temp.empty:
            return self.last_valid_ping_
        return temp.iloc[-1]
    
        
    def identifyPath(self):
        if self.pings_df_.shape[0] < MIN_NUM_OF_PINGS:
            return
#         print("identifyPath: ",self.line_)
        best_route_score = np.inf
        best_route = None
        valid_pings = None
        for route in self.routes_[self.line_]:
            score, route_valid_pings = route.getDriveRouteScore(self.pings_df_["point"].values,self.last_valid_ping_)
            if score < best_route_score:
                best_route_score = score
                best_route = route
                valid_pings = route_valid_pings
                
        if best_route_score <= MAX_PINGS_ERROR:
            self.route_ = best_route
            self.route_score_ = best_route_score
            self.pings_df_["valid_ping"] = valid_pings
            self.last_valid_ping_ = self.getLastValidPing(self.pings_df_)
            
    def isDriveEnded(self,curr_time):
        if curr_time - self.lastActivity() > timedelta(seconds=MAX_TIME_BETWEEN_PINGS): # Didn't received a ping for a long time
            print("Drive Ended - No Activity: ",(self.line_,self.vehicle_))
            return True
        if self.route_ == UNASSIGNED and self.pings_df_.shape[0] > MAX_UNASSIGNED_PINGS:
            print("Drive Ended - UNASSIGNED: ",(self.line_,self.vehicle_))
            return True
        if self.route_ == UNASSIGNED:
            return False
        if self.route_.isFinalStop(self.last_ping_["point"]):
            valid_pings_df = self.pings_df_[self.pings_df_["valid_ping"]==1]
            print("Drive Ended - Final Stop: ",(self.line_,self.vehicle_)," ",len(valid_pings_df))
            if self.save_full_drive_:
                BusDrive.add_drive(valid_pings_df,self.route_.routeId_,self.route_.getProjectedPoints(valid_pings_df["point"]))

            return True
        return False
    
    @staticmethod
    def add_drive(drive_valid_pings,route_id,proj_points):
        if len(drive_valid_pings) < 100:
            return
        drive_valid_pings["sample_number"] = BusDrive.num_full_drive
        drive_valid_pings["route_id"] = route_id
        drive_valid_pings['ping_number'] = np.arange(len(drive_valid_pings))
        drive_valid_pings["dist_traveled"] = proj_points["dist_traveled"]
        drive_valid_pings["dist_to_line"] = proj_points["dist_to_line"]
        BusDrive.df_full_drives = BusDrive.df_full_drives.append(drive_valid_pings[["sample_number","route_id","ping_number","timestamp","dist_traveled","dist_to_line"]])
        BusDrive.num_full_drive += 1
    
    def getRouteScore(self):
        return self.route_score_
    
    def isAssignedRoute(self):
        return self.route_ != UNASSIGNED
    
    def numValid(self):
        if self.route_ == UNASSIGNED:
            return None
        return self.pings_df_[self.pings_df_["valid_ping"]==1].shape[0]

class RTS:
    def __init__(self,routes_filename,save_full_drives = False):
        # Loading routes to identify and monitor
        self.routes_ = Routes(routes_filename)
        self.active_drives_ = {}
        self.system_time_ = datetime(2017, 1, 1, 0, 0) # min system time
        self.save_full_drives_ = save_full_drives
    
    def recieveDataStream(self,pings_df):
        self.system_time_ = max(self.system_time_,max(pings_df["timestamp"]))
        for attrs, drive_pings_df in pings_df.groupby(["lineId","vehicleId"]):
            (line_id,vehicle_id) = attrs
            if (line_id,vehicle_id) in self.active_drives_.keys():
                self.active_drives_[(line_id,vehicle_id)].addPings(drive_pings_df)
            else:
                self.active_drives_[(line_id,vehicle_id)] = BusDrive(self.routes_,drive_pings_df,self.save_full_drives_)
        keys_to_remove = []
        for drive_key, drive in self.active_drives_.items():
            if drive.isDriveEnded(self.system_time_):
                keys_to_remove.append(drive_key)
        self.removeDrives(keys_to_remove)
        self.validateVehicleSingularity()
        self.printSystemState()
                
    def validateVehicleSingularity(self):
        vehicle_dic = defaultdict(lambda:[])
        for (line_id,vehicle_id) in self.active_drives_.keys():
            vehicle_dic[vehicle_id].append(line_id)
        for vehicle_id, line_list in vehicle_dic.items():
            if len(line_list) > 1:
                self.removeDuplicateVehiclesDrives(vehicle_id,line_list)
                print("New Drive Started: ",(vehicle_id,line_list))
    
    def removeDuplicateVehiclesDrives(self,vehicle_id,line_list):
        keys_to_remove = []
        last_activity_time = -np.inf
        last_activity_key_drive = None
        for line_id in line_list:
            drive = self.active_drives_[(vehicle_id,line_id)]
            if len(drive) < MIN_NUM_OF_PINGS: # allow for small drives to stay for now 
                continue
            # keep only the most updated drive
            if last_activity_time < drive.lastActivity():
                keys_to_remove.append(last_activity_key_drive) if last_activity_key_drive!=None else None
                last_activity_time = drive.lastActivity()
                last_activity_key_drive = (vehicle_id,line_id)
        self.removeDrives(keys_to_remove)
    
    def removeDrives(self,drive_keys):
        for key in drive_keys:
            del self.active_drives_[key]
            print("Deleted drive: ",key)
    
    def printSystemState(self):
        stats_dic = defaultdict(lambda:[])
        for attrs in sorted(self.active_drives_.keys()):
            drive = self.active_drives_[attrs]
            stats_dic["drive"].append(attrs)
            stats_dic["num_pings"].append(len(drive))
            stats_dic["num_valid_pings"].append(drive.numValid())
            stats_dic["route_score"].append(drive.getRouteScore())
            stats_dic["found_route"].append(drive.isAssignedRoute())
#         print(pd.DataFrame.from_dict(stats_dic))
                

In [6]:
rts = RTS("..//Data_Preprocessing//Bus_Routes//clean_routes.csv")
df_lists = list(stream.groupby(["round_timestamp"]))
df_lists.sort(key=lambda elem:elem[0])
for i, (time,group_df) in enumerate(df_lists):
    print("index ", i)
#     print("System time: ",rts.system_time_)
    rts.recieveDataStream(group_df)
#     print()
    if i == 60*1:
        break
BusDrive.df_full_drives.reset_index().to_csv("drives_sample_V4.csv",index=False)
print("FINISH")

['1', '11', '116', '118', '122', '123', '130', '14', '140', '142', '14c', '150', '155', '15a', '15b', '15d', '16', '16c', '16d', '25', '25a', '25b', '25d', '25x', '26', '27a', '27x', '29a', '31', '31a', '31b', '31d', '32', '32x', '33', '33b', '33d', '33e', '33x', '37', '38a', '38b', '38d', '39', '39x', '4', '40b', '40e', '41', '41b', '41c', '41d', '42', '42d', '43', '44', '44b', '46e', '49', '51d', '51x', '53', '53a', '54a', '56a', '61', '65b', '66', '66a', '66b', '66e', '67', '68a', '68x', '69', '69x', '7', '70', '70d', '747', '757', '77a', '77x', '79', '79a', '7a', '7b', '7d', '83', '83a', '84a', '9']
index  0
index  1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


index  2
index  3
index  4
index  5
index  6
index  7
index  8
Drive Ended - Final Stop:  ('25', '38056')   20
Deleted drive:  ('25', '38056')
index  9
index  10
Drive Ended - Final Stop:  ('150', '33518')   27
Deleted drive:  ('150', '33518')
index  11
Drive Ended - Final Stop:  ('25', '38056')   5
Deleted drive:  ('25', '38056')
index  12
index  13
index  14
index  15
Drive Ended - No Activity:  ('25', '33406')
Drive Ended - Final Stop:  ('25', '33533')   5
Deleted drive:  ('25', '33406')
Deleted drive:  ('25', '33533')
index  16
index  17
Drive Ended - Final Stop:  ('25', '33401')   16
Drive Ended - No Activity:  ('25', '38056')
Deleted drive:  ('25', '33401')
Deleted drive:  ('25', '38056')
index  18
Drive Ended - UNASSIGNED:  ('150', '33493')
Drive Ended - UNASSIGNED:  ('25', '33358')
Drive Ended - UNASSIGNED:  ('25', '33399')
Drive Ended - Final Stop:  ('25', '33533')   6
Deleted drive:  ('150', '33493')
Deleted drive:  ('25', '33358')
Deleted drive:  ('25', '33399')
Deleted driv

In [5]:
# # BusDrive.df_full_drives.reset_index().to_csv("full_drive_sample.csv",index=False)
# df = pd.read_csv("full_drive_sample.csv")
# for attrs, drive_df in df.groupby(["route_id", "sample_number"]):
#     print(attrs)
#     print(drive_df)
#     break
# # c = Counter()
# # for time,group_df in stream.groupby(["timestamp","lineId","vehicleId"]):
# #     c[str(group_df.shape[0])] += 1
# # print(len(c))
# # print(c.most_common(100))

In [None]:
# df = pd.read_csv("..//Data//Samples//3days3lines.csv")
# df = df.sort_values(["lineId","vehicleId","timestamp"])
# df["driveId"] = df.apply(lambda row : str(row["lineId"])+"_"+str(row["vehicleId"]),axis=1)
# df.to_csv("..//Drafts//3days3lines_bydrive.csv")

In [None]:
# linestring = LineString([[0,1],[0,0]])
# # len(linestring_.coords) or point.distance(
# #     self.linestring_.coords[-1]) < MAX_DISTANCE_TO_END_STOP
# type(linestring.coords[-1])