# Exploratory Data Analysis 

Date: Oct. 31, 2020

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import math
from scipy import spatial

# Natural Language Processing
import re
from nltk.util import ngrams
from nltk import FreqDist

In [87]:
# Load the observation file for this analysis into a DataFrame.
floatpop_csv = '../../data/floatpop/20190601_23.csv'
df = pd.read_csv(floatpop_csv)

In [114]:
class Preprocessor:
    """Label trips in per-observation location data and keep trip endpoints.

    The wrapped frame must provide the columns ``dailyid``, ``hour``,
    ``minute``, ``latitude`` and ``longitude``; any other columns are
    dropped by :meth:`shift`.
    """

    def __init__(self, df):
        self.df = df

    def shift(self) -> None:
        """
        Calculate the interval between observations

        Replaces ``self.df`` with a copy restricted to the five required
        columns, sorted by ``dailyid``/``hour``/``minute`` and re-indexed
        0..n-1, and adds:

        * ``hour_lag`` / ``minute_lag`` -- the previous observation's hour
          and minute within the same ``dailyid`` (NaN on the first row of
          each ``dailyid``);
        * ``shift`` -- ``60 * (hour - hour_lag) + minute - minute_lag``.
        """
        self.df = (
            self.df.loc[:, ["dailyid", "hour", "minute", "latitude", "longitude"]]
            .sort_values(by=["dailyid", "hour", "minute"])
            # reset_index returns a new frame; it must be assigned back,
            # otherwise the sorted frame keeps its pre-sort index labels.
            .reset_index(drop=True)
        )
        self.df["hour_lag"] = self.df.groupby(["dailyid"])["hour"].shift(1)
        self.df["minute_lag"] = self.df.groupby(["dailyid"])["minute"].shift(1)
        self.df["shift"] = (
            60 * (self.df["hour"] - self.df["hour_lag"])
            + self.df["minute"]
            - self.df["minute_lag"]
        )

    def label_trip(self, th) -> None:
        """
        Split each ``dailyid``'s observations into trips and keep endpoints

        A new trip starts on the first observation of a ``dailyid`` (its
        ``shift`` is NaN; the label restarts at 1) and whenever ``shift`` is
        greater than or equal to ``th`` (the label is incremented).

        Afterwards ``self.df`` holds only the first and last observation of
        every trip, in order (start, end, start, end, ...), with an extra
        ``trip_label`` column. A trip consisting of a single observation
        contributes that row twice. An empty input yields an empty frame
        with the same columns.

        Parameters
        ----------
        th : number
            Minimum value of ``shift`` that separates two trips.
        """
        # Calculate Shift
        self.shift()

        arr = self.df.to_numpy()
        n_rows = len(arr)

        trip = np.zeros(n_rows)
        trip_label = 1
        # Row 0 opens the first trip (only when there is a row 0 at all).
        start_end_idx = [0] if n_rows else []
        for i, row in enumerate(arr):
            shft = row[-1]  # "shift" is the last column added by shift()
            if np.isnan(shft):
                # First observation of a dailyid: restart the numbering.
                trip_label = 1
                starts_new_trip = i != 0
            elif shft >= th:
                trip_label += 1
                starts_new_trip = True
            else:
                starts_new_trip = False

            trip[i] = trip_label
            if starts_new_trip:
                # Close the previous trip on row i-1, open the next on row i.
                start_end_idx.append(i - 1)
                start_end_idx.append(i)

        if n_rows:
            # Close the final trip; without this the last start row has no
            # matching end row and the start/end pairing is broken.
            start_end_idx.append(n_rows - 1)

        arr = np.column_stack((arr, trip))

        self.df = pd.DataFrame(
            data=arr[start_end_idx, :],
            columns=[
                "dailyid", "hour", "minute", "latitude", "longitude",
                "hour_lag", "minute_lag", "shift", "trip_label",
            ],
        )

In [115]:
# Label trips on the loaded observations and replace `df` with the result.
# A `shift` of 30 or more (minutes, per the hour/minute arithmetic in
# Preprocessor.shift) starts a new trip.
trip_gap_threshold = 30
setup = Preprocessor(df)
setup.label_trip(trip_gap_threshold)
df = setup.df

In [119]:
# Read the station table and keep the rows whose `post` value starts with
# "1" or "20".
# NOTE(review): presumably a postal-code prefix filter for Tokyo -- confirm.
stations = pd.read_excel("../../station_data/station.xlsx")
post_prefix = re.compile("^(1|20)")
mask = [post_prefix.match(addr) is not None for addr in stations.post]
tokyo_stations = stations[mask]

In [139]:
# Build a KD-tree over the (lat, long) columns of the filtered stations so
# nearest-station lookups can be answered by tree queries.
station_coords = tokyo_stations.loc[:, ["lat", "long"]]
tree = spatial.KDTree(station_coords, leafsize=100)

In [140]:
# Find the nearest station for every row of `df`.
# KDTree.query accepts an (n, 2) array, so all points are resolved in one
# call instead of one Python-level query and one row lookup per observation.
# dtype=float guards against `df` holding object-dtype coordinate columns.
observation_coords = df[["latitude", "longitude"]].to_numpy(dtype=float)
_, nearest_idx = tree.query(observation_coords)
# `nearest_idx` holds positions into the data the tree was built from, so
# the station names are looked up positionally.
closest_station = tokyo_stations["station_name"].iloc[nearest_idx].tolist()

In [143]:
# Count station-to-station pairs: form all consecutive pairs of
# `closest_station`, then keep every other one (pairs 0-1, 2-3, ...).
# presumably each kept pair is one trip's (start, end) stations -- verify
# against how `closest_station` is ordered.
consecutive_pairs = list(ngrams(closest_station, 2))
every_other_pair = consecutive_pairs[::2]
bigram_fd = FreqDist(every_other_pair)
pair_counts = bigram_fd.most_common()
bigram_df = pd.DataFrame(pair_counts, columns=['bigram', 'count'])
# NOTE(review): the next line replaces the frame computed above with the
# contents of a CSV file, so the computed counts are discarded. Confirm
# whether this is a deliberate cache load or was meant to be `to_csv`.
bigram_df = pd.read_csv('../../data/bigram.csv')