In [2]:
import numpy as np
import pandas as pd
import glob
import scipy.stats

from joblib import load
from geopy import distance
from sklearn.ensemble import RandomForestClassifier

In [3]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas diagnostics (e.g. SettingWithCopyWarning)
# and numpy runtime warnings, so silent data problems will not be reported.
warnings.filterwarnings('ignore')

In [4]:
# Input folders; every *.txt file directly inside each folder is collected.
# Alternative trip folder: r'C:\Users\User\Desktop\Data\test\trip'
trippath = r'C:\Users\User\Desktop\Data\Testdata\tripsample'
Trip_file = glob.glob("/".join((trippath, "*.txt")))

# Alternative log folder: r'C:\Users\User\Desktop\Data\test\log'
logpath = r'C:\Users\User\Desktop\Data\Testdata\logsample'
Log_file = glob.glob("/".join((logpath, "*.txt")))

In [5]:
# Bounding box (inclusive) that a GPS fix must fall into to be kept.
lon_range = [103.6, 104.042]
lat_range = [1.238, 1.48]

def preprocess(df):
    """Derive per-step time, distance, speed and source-change columns from GPS rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Lat', 'Lon', 'Time' and 'Source'. 'Time' is
        parsed with the format "%Y-%m-%d %X". The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows inside the lat_range / lon_range box, ordered by time, without
        the 'Time' column and with these columns appended:

        * 'Timedif' - seconds elapsed since the previous row
        * 'Distdif' - geodesic distance in km from the previous row
        * 'Speed'   - Distdif / Timedif in km/s, capped at 0.0333
        * 'Flag'    - 1 when 'Source' differs from the previous row, else 0

        The earliest row has no predecessor and is dropped, so the index starts
        at 1. When fewer than two rows survive the filter the result is empty
        but still carries the columns listed above.
    """
    max_speed = 0.0333  # km/s, roughly 120 km/h

    ## Lat Lon ##
    # Both bounds of both axes are enforced; rows with a missing coordinate
    # fail the comparison and are removed as well.
    inside_box = (
        df['Lat'].between(lat_range[0], lat_range[1])
        & df['Lon'].between(lon_range[0], lon_range[1])
    )
    df = df[inside_box].copy()  # explicit copy: later assignments never touch the caller's frame

    if df.empty:
        df = df.drop(['Time'], axis=1).reset_index(drop=True)
        for column in ('Timedif', 'Distdif', 'Speed', 'Flag'):
            df[column] = pd.Series(dtype='float64')
        return df

    ## Time ##
    # NOTE(review): %X is the locale's time representation (HH:MM:SS in the
    # default C locale) - confirm this matches the input files.
    df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d %X")
    df = df.sort_values(by='Time', ascending=True)
    # total_seconds() maps the leading NaT to NaN instead of forcing it
    # through an integer cast.
    df['Timedif'] = df['Time'].diff().dt.total_seconds()
    df = df.drop(['Time'], axis=1).reset_index(drop=True)
    # Placeholder for the first row (it is dropped before returning).
    df.loc[0, 'Timedif'] = df['Timedif'].iloc[1:].mean()

    ## Distance ##
    points = list(zip(df['Lat'], df['Lon']))
    distdif = [
        distance.geodesic(start, end).km
        for start, end in zip(points, points[1:])
    ]
    distdif.insert(0, np.mean(distdif) if distdif else np.nan)
    df['Distdif'] = distdif

    ## Speed ##
    speed = df['Distdif'] / df['Timedif']  # km/s
    # Anything that is not <= max_speed becomes max_speed. This deliberately
    # includes inf and NaN (zero time difference), as the row-by-row version
    # of this code did.
    speed = speed.where(speed <= max_speed, max_speed)
    speed.iloc[0] = speed.iloc[1:].mean()
    df['Speed'] = speed

    ## Flag ## - 2 diff source will return 1, same source returns 0
    source = df['Source']
    df['Flag'] = (source != source.shift(1)).astype(int)
    df.loc[0, 'Flag'] = 0  # the first row has no predecessor

    ## Drop first row ##
    return df.drop(index=0)

In [6]:
## Handling inf/-inf/NaN/null values ##
def dropna(df, subset):
    """Return a new frame with infinities nulled out and incomplete rows removed.

    Every +inf / -inf in any column is first replaced by NaN; afterwards each
    row that has a NaN in at least one of the `subset` columns is dropped.
    Columns outside `subset` may therefore still contain NaN in the result.
    The input frame is left unchanged.
    """
    without_inf = df.replace([np.inf, -np.inf], np.nan)
    return without_inf.dropna(subset=subset, how="any")

In [7]:
# Names of the feature columns: eight summary statistics followed by the
# nine deciles Q1..Q9.
header = [
    "Mean", "Stdev", "Over60", "O60U120", "Over120",
    "Skewness", "Kurtosis", "Entropy",
] + ["Q%d" % decile for decile in range(1, 10)]
# Columns in which a row must hold a non-null value to be kept by dropna().
subset = ["Speed", "Distdif", "Timedif", "Lat", "Lon"]

In [8]:
def further_process(df, timeLimit): # timeLimit in minutes
    """Condense the 'Speed' values of the flagged rows into one feature row.

    Rows are kept only if they survive dropna(df, subset), have
    'Timedif' <= timeLimit * 60 and have 'Flag' == 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns listed in `subset` plus 'Flag'.
    timeLimit : number
        Largest accepted 'Timedif', given in minutes.

    Returns
    -------
    pandas.DataFrame
        An empty frame when no row is left; otherwise a single row (index 0)
        with the columns of `header`: mean, population standard deviation,
        share of speeds >= 0.01666, share in [0.01666, 0.0333), share
        >= 0.0333, skewness, kurtosis, entropy of the value counts and the
        deciles Q1..Q9.
        NOTE(review): the thresholds are presumably km/s (about 60 and
        120 km/h, matching the column names) - confirm against the producer
        of 'Speed'.
    """
    kept = dropna(df, subset)                        # no NaN/inf in the subset columns
    kept = kept[kept["Timedif"] <= (timeLimit * 60)]  # minutes -> seconds
    kept = kept[kept["Flag"] == 1]

    n_rows = len(kept)
    if n_rows == 0:
        return pd.DataFrame()

    speed = kept["Speed"]
    samples = speed.values

    ## Threshold counts ##
    n_over120 = int((speed >= 0.0333).sum())
    n_over60 = int((speed >= 0.01666).sum())

    ## Mean, std, % over, skewness, kurtosis, entropy - in header order ##
    features = [
        np.mean(samples),
        np.std(samples),
        n_over60 / n_rows,
        (n_over60 - n_over120) / n_rows,
        n_over120 / n_rows,
        speed.skew(axis=0),
        speed.kurtosis(),
        scipy.stats.entropy(speed.value_counts()),
    ]

    ## Deciles Q1..Q9 ##
    for step in range(1, 10):
        features.append(speed.quantile(step / 10))

    out = pd.DataFrame(columns=header)
    out.loc[0] = features
    return out

In [None]:
## Score every (trip, log) pair with the trained classifier ##
# For each trip file, the trip rows are combined with each log file in turn,
# reduced to one feature row, labelled 1 when the two files share a base name
# (0 otherwise) and passed to the model. Four text files are written per trip.

# joblib.load unpickles the file, which can run arbitrary code: only load
# model files from a trusted source.
model = load(r'C:\Users\User\Desktop\Stanley\Training\randomforest60.joblib')

# Base name (directory prefix, separator and ".txt" removed) of every log
# file, computed once instead of once per trip.
log_names = [(logfile, logfile.replace(logpath, '')[1: -4]) for logfile in Log_file]

for tripfile in Trip_file:
    string = tripfile.replace(trippath, '')[1: -4]
    df1 = pd.read_csv(tripfile, usecols=[1, 2, 3, 4], sep=' ')

    # Collect per-pair results in lists and build the frames once afterwards.
    feature_rows = []
    label_values = []
    for logfile, logname in log_names:
        df2 = pd.read_csv(logfile, usecols=[1, 2, 3, 4], sep=' ')
        out = preprocess(pd.concat([df1, df2]))
        out = further_process(out, 60) #filter for time <= 60 mins
        if (len(out) != 0):
            feature_rows.append(out)
            label_values.append(1 if logname == string else 0)

    if not feature_rows:
        print(string + ": no usable feature rows, skipped")
        continue

    records = pd.concat(feature_rows, ignore_index=True).astype(float)
    labels = pd.DataFrame(data=label_values)

    ## Removing all null values from kurtosis etc ##
    # Drop a row when ANY feature is null; both frames use the same mask so
    # they stay aligned.
    complete = records.notnull().all(axis=1)
    records = records[complete]
    labels = labels[complete]
    if records.empty:
        print(string + ": every feature row contained nulls, skipped")
        continue

    ## Predictions ##
    pred = model.predict(records)
    prob = model.predict_proba(records)

    ## Save ##
    np.savetxt(string + "records.txt", records, delimiter=',')
    np.savetxt(string + "labels.txt", labels, delimiter=',')
    np.savetxt(string + "predictions.txt", pred, delimiter=',')
    np.savetxt(string + "probabilities.txt", prob[:, 1], delimiter=',')