In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from scipy.stats import ttest_ind as ttest
import scipy.stats

#### Checking the shape of data frame

In [None]:
# Show the frame's dimensions as a (rows, columns) tuple.
# NOTE(review): `df` is not created in any visible cell — presumably a data-loading
# cell precedes this one; confirm, otherwise the notebook fails on a fresh kernel.
df.shape

In [None]:
# Summarise column names, dtypes, non-null counts and memory usage.
df.info()

#### Checking the null values

In [None]:
# Count missing values per column.
df.isnull().sum()

#### Removing Null values

In [None]:
# Discard every row that has at least one missing value, then renumber the
# remaining rows 0..n-1 so the index has no gaps.
df = df.dropna(how='any').reset_index(drop=True)

In [None]:
# Descriptive statistics of the frame after the null rows were removed.
df.describe()

### Merging rows on the basis of Trip ID, Source ID, Destination ID

In [None]:
# Collapse all rows that share (trip, source centre, destination centre) into one
# row. Each column is reduced with the function named in the spec below.
leg_aggregations = {
    'trip_creation_time': 'last',
    'source_name': 'first',
    'destination_name': 'first',
    'od_start_time': 'first',
    'od_end_time': 'first',
    'start_scan_to_end_scan': 'first',
    'actual_distance_to_destination': 'max',
    'actual_time': 'max',
    'osrm_time': 'max',
    'osrm_distance': 'max',
    'segment_actual_time': 'sum',
    'segment_osrm_time': 'sum',
    'segment_osrm_distance': 'sum',
}
leg_keys = ['trip_uuid', 'source_center', 'destination_center']
df_new_tsd = df.groupby(leg_keys).agg(leg_aggregations)

In [None]:
# Move the group keys out of the index and back into ordinary columns.
df_new_tsd = df_new_tsd.reset_index()

In [None]:
# Inspect dtypes and non-null counts of the aggregated frame.
df_new_tsd.info()

In [None]:
# Count missing values per column in the aggregated frame.
df_new_tsd.isnull().sum()

#### Calculating Trip duration from start time and end time

In [None]:
# Parse the start/end timestamps in one vectorized call per column instead of a
# Python-level strptime loop. The explicit format keeps parsing strict.
OD_TIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
df_new_tsd['Trip start'] = pd.to_datetime(df_new_tsd['od_start_time'], format=OD_TIME_FORMAT)
df_new_tsd['Trip end'] = pd.to_datetime(df_new_tsd['od_end_time'], format=OD_TIME_FORMAT)

In [None]:
# Elapsed time of each row: end timestamp minus start timestamp.
df_new_tsd['Trip duration'] = df_new_tsd['Trip end'].sub(df_new_tsd['Trip start'])

In [None]:
# Convert the timedelta column to minutes with the vectorized `.dt` accessor
# rather than iterating over every Timedelta in Python.
df_new_tsd['Trip duration'] = df_new_tsd['Trip duration'].dt.total_seconds() / 60

In [None]:
# The index was already reset to a default integer range earlier, so resetting
# without drop=True would only add a meaningless 'index' column to the frame.
df_new_tsd = df_new_tsd.reset_index(drop=True)

### Merging the rows based on Trip ID

In [None]:
# Collapse every trip into a single row.
#
# The legs are sorted chronologically first: the previous grouping left the rows
# ordered by centre ID, so 'first'/'last' picked arbitrary legs (and the source was
# taken from the last row while the destination came from the first). After the
# sort, the trip's origin is the source of its earliest leg and its final
# destination is the destination of its latest leg. The 'sum' columns do not
# depend on row order.
df_new = (
    df_new_tsd
    .sort_values(['trip_uuid', 'Trip start'])
    .groupby('trip_uuid')
    .aggregate(
        {
            'source_center': 'first',
            'destination_center': 'last',
            'trip_creation_time': 'first',
            'source_name': 'first',
            'destination_name': 'last',
            'Trip duration': 'sum',
            'start_scan_to_end_scan': 'sum',
            'actual_distance_to_destination': 'sum',
            'actual_time': 'sum',
            'osrm_time': 'sum',
            'osrm_distance': 'sum',
            'segment_actual_time': 'sum',
            'segment_osrm_time': 'sum',
            'segment_osrm_distance': 'sum',
        }
    )
)

In [None]:
# Turn the 'trip_uuid' group key back into a regular column.
df_new = df_new.reset_index()

### Extracting State and City from source and destination

In [None]:
def ext(x):
    """Return the parenthesised part of a location name.

    The string is split on '(' and the second piece is returned with its last
    character removed, so 'Name (State)' yields 'State'.

    Raises:
        IndexError: if `x` contains no '('.
    """
    pieces = x.split('(')
    after_paren = pieces[1]
    # Drop the final character, which is the closing ')' in 'Name (State)'.
    return after_paren[:-1]

In [None]:
# Derive the state columns by applying `ext` to each location name.
state_sources = {
    'source state': 'source_name',
    'destination state': 'destination_name',
}
for state_col, name_col in state_sources.items():
    df_new[state_col] = df_new[name_col].apply(ext)

In [None]:
def ext2(x):
    """Return the leading token of a location name.

    The token ends at the first underscore when the string contains one,
    otherwise at the first space.
    """
    separator = '_' if '_' in x else ' '
    return x.split(separator)[0]

In [None]:
# Derive the city columns by applying `ext2` to each location name.
city_sources = {
    'source city': 'source_name',
    'destination city': 'destination_name',
}
for city_col, name_col in city_sources.items():
    df_new[city_col] = df_new[name_col].apply(ext2)

In [None]:
# Expand the 'FBD' abbreviation to the full city name.
# The result is assigned back: calling replace(..., inplace=True) on a column
# selection acts on a temporary object and does not update the frame under pandas
# Copy-on-Write. The destination column is normalised too, so that both city
# columns use the same spelling.
df_new['source city'] = df_new['source city'].replace('FBD', 'Faridabad')
df_new['destination city'] = df_new['destination city'].replace('FBD', 'Faridabad')

### Converting Trip creation time to datetime

In [None]:
# Parse the creation timestamp into a datetime column under a new name.
parsed_creation_time = pd.to_datetime(df_new['trip_creation_time'])
df_new['trip creation time'] = parsed_creation_time

In [None]:
# The unparsed column is superseded by 'trip creation time'; remove it.
df_new = df_new.drop(columns=['trip_creation_time'])

In [None]:
# Confirm the dtypes after the datetime conversion.
df_new.info()

#### Extracting Year, Month and Day from Trip creation time

In [None]:
# Split the creation timestamp into its calendar components.
created_at = df_new['trip creation time'].dt
df_new['Trip creation year'] = created_at.year
df_new['Trip creation month'] = created_at.month
df_new['Trip creation day'] = created_at.day

In [None]:
# Confirm the new year/month/day columns are present.
df_new.info()

# Conducting hypothesis testing

## Trip duration and start scan to end scan

Null Hypothesis, H0: The mean trip duration is equal to the mean start-scan-to-end-scan time.

Alternative Hypothesis, Ha: The mean trip duration is not equal to the mean start-scan-to-end-scan time.

Significance level: 0.05

In [None]:
# Welch's two-sample t-test (unequal variances), two-sided, at the 5% level.
alpha = 0.05
sample_a = df_new['Trip duration']
sample_b = df_new['start_scan_to_end_scan']
test_stats, p_val = ttest(sample_a, sample_b, equal_var=False, alternative='two-sided')

# Two-sided critical value: alpha is split over both tails, and the degrees of
# freedom come from the Welch-Satterthwaite approximation rather than a
# hard-coded row count.
var_a = sample_a.var(ddof=1) / len(sample_a)
var_b = sample_b.var(ddof=1) / len(sample_b)
welch_df = (var_a + var_b) ** 2 / (
    var_a ** 2 / (len(sample_a) - 1) + var_b ** 2 / (len(sample_b) - 1)
)
critical_value = scipy.stats.t.ppf(1 - alpha / 2, df=welch_df)

print('Test statistic: ',test_stats)
print('P-value: ',p_val)
print('Critical value: ',critical_value)
print("\n")
# Compare the magnitude of the statistic: a large negative value is as much
# evidence against the null as a large positive one.
if abs(test_stats) < critical_value:
    print("Fail to reject Null hypotheses")
else:
    print('Reject Null hypotheses')

#### Hence trip duration is equal to start scan to end scan

## Actual time and OSRM time

Null Hypothesis, H0: The mean actual time is equal to the mean OSRM time.

Alternative Hypothesis, Ha: The mean actual time is not equal to the mean OSRM time.

Significance level: 0.05

In [None]:
# Welch's two-sample t-test (unequal variances), two-sided, at the 5% level.
alpha = 0.05
sample_a = df_new['actual_time']
sample_b = df_new['osrm_time']
test_stats, p_val = ttest(sample_a, sample_b, equal_var=False, alternative='two-sided')

# Two-sided critical value: alpha is split over both tails, and the degrees of
# freedom come from the Welch-Satterthwaite approximation rather than a
# hard-coded row count.
var_a = sample_a.var(ddof=1) / len(sample_a)
var_b = sample_b.var(ddof=1) / len(sample_b)
welch_df = (var_a + var_b) ** 2 / (
    var_a ** 2 / (len(sample_a) - 1) + var_b ** 2 / (len(sample_b) - 1)
)
critical_value = scipy.stats.t.ppf(1 - alpha / 2, df=welch_df)

print('Test statistic: ',test_stats)
print('P-value: ',p_val)
print('Critical value: ',critical_value)
print("\n")
# Compare the magnitude of the statistic: a large negative value is as much
# evidence against the null as a large positive one.
if abs(test_stats) < critical_value:
    print("Fail to reject Null hypotheses")
else:
    print('Reject Null hypotheses')

#### Hence actual time is not equal to osrm time

## Actual time and segment actual time

Null Hypothesis, H0: The mean actual time is equal to the mean segment actual time.

Alternative Hypothesis, Ha: The mean actual time is not equal to the mean segment actual time.

Significance level: 0.05

In [None]:
# Welch's two-sample t-test (unequal variances), two-sided, at the 5% level.
alpha = 0.05
sample_a = df_new['actual_time']
sample_b = df_new['segment_actual_time']
test_stats, p_val = ttest(sample_a, sample_b, equal_var=False, alternative='two-sided')

# Two-sided critical value: alpha is split over both tails, and the degrees of
# freedom come from the Welch-Satterthwaite approximation rather than a
# hard-coded row count.
var_a = sample_a.var(ddof=1) / len(sample_a)
var_b = sample_b.var(ddof=1) / len(sample_b)
welch_df = (var_a + var_b) ** 2 / (
    var_a ** 2 / (len(sample_a) - 1) + var_b ** 2 / (len(sample_b) - 1)
)
critical_value = scipy.stats.t.ppf(1 - alpha / 2, df=welch_df)

print('Test statistic: ',test_stats)
print('P-value: ',p_val)
print('Critical value: ',critical_value)
print("\n")
# Compare the magnitude of the statistic: a large negative value is as much
# evidence against the null as a large positive one.
if abs(test_stats) < critical_value:
    print("Fail to reject Null hypotheses")
else:
    print('Reject Null hypotheses')

#### Hence actual time is equal to segment actual time

## Osrm distance and segment osrm distance

Null Hypothesis, H0: The mean OSRM distance is equal to the mean segment OSRM distance.

Alternative Hypothesis, Ha: The mean OSRM distance is not equal to the mean segment OSRM distance.

Significance level: 0.05

In [None]:
# Welch's two-sample t-test (unequal variances), two-sided, at the 5% level.
alpha = 0.05
sample_a = df_new['osrm_distance']
sample_b = df_new['segment_osrm_distance']
test_stats, p_val = ttest(sample_a, sample_b, equal_var=False, alternative='two-sided')

# Two-sided critical value: alpha is split over both tails, and the degrees of
# freedom come from the Welch-Satterthwaite approximation rather than a
# hard-coded row count.
var_a = sample_a.var(ddof=1) / len(sample_a)
var_b = sample_b.var(ddof=1) / len(sample_b)
welch_df = (var_a + var_b) ** 2 / (
    var_a ** 2 / (len(sample_a) - 1) + var_b ** 2 / (len(sample_b) - 1)
)
critical_value = scipy.stats.t.ppf(1 - alpha / 2, df=welch_df)

print('Test statistic: ',test_stats)
print('P-value: ',p_val)
print('Critical value: ',critical_value)
print("\n")
# Compare the magnitude of the statistic: a large negative value is as much
# evidence against the null as a large positive one.
if abs(test_stats) < critical_value:
    print("Fail to reject Null hypotheses")
else:
    print('Reject Null hypotheses')

#### Hence osrm distance is equal to segment osrm distance

## Osrm time and segment osrm time

Null Hypothesis, H0: The mean OSRM time is equal to the mean segment OSRM time.

Alternative Hypothesis, Ha: The mean OSRM time is not equal to the mean segment OSRM time.

Significance level: 0.05

In [None]:
# Welch's two-sample t-test (unequal variances), two-sided, at the 5% level.
alpha = 0.05
sample_a = df_new['osrm_time']
sample_b = df_new['segment_osrm_time']
test_stats, p_val = ttest(sample_a, sample_b, equal_var=False, alternative='two-sided')

# Two-sided critical value: alpha is split over both tails, and the degrees of
# freedom come from the Welch-Satterthwaite approximation rather than a
# hard-coded row count.
var_a = sample_a.var(ddof=1) / len(sample_a)
var_b = sample_b.var(ddof=1) / len(sample_b)
welch_df = (var_a + var_b) ** 2 / (
    var_a ** 2 / (len(sample_a) - 1) + var_b ** 2 / (len(sample_b) - 1)
)
critical_value = scipy.stats.t.ppf(1 - alpha / 2, df=welch_df)

print('Test statistic: ',test_stats)
print('P-value: ',p_val)
print('Critical value: ',critical_value)
print("\n")
# Compare the magnitude of the statistic: a large negative value is as much
# evidence against the null as a large positive one.
if abs(test_stats) < critical_value:
    print("Fail to reject Null hypotheses")
else:
    print('Reject Null hypotheses')

#### Hence osrm time is equal to segment osrm time

# Outliers

In [None]:
# Numeric columns used for the outlier plots, the IQR filter and the scaling.
# The order is kept as-is because it determines the column order downstream.
num_cols = [
    'start_scan_to_end_scan',
    'actual_distance_to_destination',
    'actual_time',
    'osrm_time',
    'osrm_distance',
    'segment_actual_time',
    'segment_osrm_distance',
    'segment_osrm_time',
    'Trip duration',
]

In [None]:
# Box plot of every numeric column before any outlier handling.
numeric_features = df_new[num_cols]
numeric_features.boxplot(figsize=(25, 8), rot=25)
plt.show()

### Handling the outliers using IQR method.

In [None]:
# Per-column quartiles and interquartile range of the numeric features.
numeric_frame = df_new[num_cols]
Q1 = numeric_frame.quantile(0.25)
Q3 = numeric_frame.quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles; a row is dropped when any of its
# numeric values lies strictly outside its column's fences.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = numeric_frame.lt(lower_fence) | numeric_frame.gt(upper_fence)
df_new = df_new.loc[~is_outlier.any(axis=1)].reset_index(drop=True)

# Re-plot to show the distributions after filtering.
df_new[num_cols].boxplot(figsize=(25, 8), rot=0)
plt.show()

# One hot encoding

In [None]:
# One row per trip holding the first 'route_type' recorded for it in `df`.
first_route_type = df.groupby('trip_uuid')['route_type'].first()
r = first_route_type.reset_index()

In [None]:
# Attach the route type to each trip, keeping every row of df_new (left join).
df_new = pd.merge(df_new, r, on='trip_uuid', how='left')

In [None]:
# Preview: the original category next to its full set of indicator columns.
route_type_col = df_new['route_type']
pd.concat([route_type_col, pd.get_dummies(route_type_col)], axis=1)

In [None]:
# Indicator columns for 'route_type'. drop_first=True omits the first category,
# which then acts as the baseline.
y = pd.get_dummies(
    df_new['route_type'],
    prefix='route_type',
    drop_first=True,
)
y

In [None]:
# Append the indicator columns to the trip frame, aligned on the row index.
df_new = pd.concat(objs=[df_new, y], axis='columns')

In [None]:
# Confirm the encoded route-type columns were added.
df_new.info()

# Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the numeric columns and overwrite them with the scaled
# values. `scaler` stays fitted in case later cells need it.
scaler = MinMaxScaler()
df_new[num_cols] = scaler.fit(df_new[num_cols]).transform(df_new[num_cols])
df_new[num_cols]