# Volvo Trucks Analytics

### Imports and global variables go here

In [1]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import datetime

# Paths to the two raw truck data sets.
truck1csv = "../data/trucks/truck1.csv"
truck2csv = "../data/trucks/truck2.csv"

# Paths to the two column-header dictionaries (CSV files; the first two
# columns of each row are read as original name -> replacement name).
truck1dictcsv = "../data/dictionary/truck1dict.csv"
truck2dictcsv = "../data/dictionary/truck2dict.csv"

# Number of records desired from the data set.
numberOfRecords = 100

# Percentage of a row's columns that must hold non-NA values for the row to be
# kept; removeUnnecessaryRows drops rows that fall below this share.
rowNaNThresholdPercent = 75

# Build the column conversion dictionaries from the CSV files.
truck1dict = {}
truck2dict = {}

# newline='' is what the csv module expects so quoted fields with embedded
# newlines are parsed correctly; rows with fewer than two cells (e.g. blank
# lines) are skipped instead of raising IndexError.
with open(truck1dictcsv, 'r', newline='') as f:
    for row in csv.reader(f):
        if len(row) >= 2:
            truck1dict[row[0]] = row[1]

with open(truck2dictcsv, 'r', newline='') as f:
    for row in csv.reader(f):
        if len(row) >= 2:
            truck2dict[row[0]] = row[1]


### Define all cleaning functions here

In [2]:
def readCsv(truck):
    """Read the CSV file at ``truck`` into a DataFrame, taking row 0 as the header."""
    frame = pd.read_csv(truck, header=[0])
    return frame
    
def cullUtcCols(truckData):
    """Return a new DataFrame without the six "UTC ..." columns of ``truckData``."""
    utcColumns = ["UTC hour", "UTC minute", "UTC second", "UTC month", "UTC day", "UTC year"]
    return truckData.drop(columns=utcColumns)

def renameColumns(dataFrame, dictionary):
    """Rename the columns of a DataFrame using a conversion dictionary.

    Parameters:
        dataFrame: the DataFrame whose columns should be renamed.
        dictionary: mapping of current column name -> new column name;
            columns without an entry keep their name.

    Returns:
        A new DataFrame with the matching columns renamed.
    """
    return dataFrame.rename(columns=dictionary)

def removeUnnecessaryRows(dataFrame, thresholdPercent=None):
    """Drop rows that hold too few non-NA values.

    Parameters:
        dataFrame: the DataFrame to filter.
        thresholdPercent: percentage of the column count that a row must
            reach in non-NA values to be kept. When omitted, the notebook-level
            ``rowNaNThresholdPercent`` is used.

    Returns:
        A new DataFrame containing only the rows that meet the threshold.
    """
    if thresholdPercent is None:
        thresholdPercent = rowNaNThresholdPercent
    # dropna(thresh=...) needs a whole number of required non-NA cells, so the
    # fractional part is truncated.
    minNonNa = int((thresholdPercent / 100) * len(dataFrame.columns))
    print("Threshold value: " + str(minNonNa))
    return dataFrame.dropna(thresh=minNonNa)

def removeUnnecessaryColumns(dataFrame):
    """Drop every column whose values are all NA.

    Parameters:
        dataFrame: the DataFrame to filter.

    Returns:
        A new DataFrame without the all-NA columns.
    """
    return dataFrame.dropna(axis=1, how='all')

# NOTE: The following function is unused in this notebook, but it can build a column rename
# dictionary from a CSV file, replacing the manual dictionary construction done in the
# imports/global-variables cell.

def createPythonDictionary(dictionaryCSV):
    """Create a Python dictionary from a pre-defined CSV dictionary.

    Only the first two columns of each row are used: the first becomes the
    key and the second the value. Rows with fewer than two cells (such as
    blank lines) are skipped.

    Parameters:
        dictionaryCSV: path of the CSV file that defines the dictionary.

    Returns:
        dict mapping first-column values to second-column values.
    """
    # newline='' is what the csv module expects so quoted fields with embedded
    # newlines are parsed correctly.
    with open(dictionaryCSV, 'r', newline='') as f:
        return {row[0]: row[1] for row in csv.reader(f) if len(row) >= 2}

### Cleaning of Truck 1 Data starts here

In [3]:
# Run the truck 1 cleaning steps in order: load, drop UTC columns, drop sparse
# rows, rename columns, then drop columns that are entirely NA.
truck1data = (
    readCsv(truck1csv)
    .pipe(cullUtcCols)
    .pipe(removeUnnecessaryRows)
    .pipe(renameColumns, truck1dict)
    .pipe(removeUnnecessaryColumns)
)

# Shape of the cleaned frame, shown as the cell output.
truck1data.shape

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the cleaned truck 1 frame.
truck1data.head()

In [None]:
# Display the cleaned truck 1 frame itself as the cell output.
truck1data

In [None]:
# Print the summary statistics of the cleaned truck 1 frame as plain text.
print(truck1data.describe())

### Cleaning of Truck 2 Data starts here

In [None]:
# Run the truck 2 cleaning steps in order: load, drop UTC columns, drop sparse
# rows, rename columns, then drop columns that are entirely NA.
truck2data = (
    readCsv(truck2csv)
    .pipe(cullUtcCols)
    .pipe(removeUnnecessaryRows)
    .pipe(renameColumns, truck2dict)
    .pipe(removeUnnecessaryColumns)
)

# Shape of the cleaned frame, shown as the cell output.
truck2data.shape

In [None]:
# Preview the first rows of the cleaned truck 2 frame.
truck2data.head()

In [None]:
# Display the cleaned truck 2 frame itself as the cell output.
truck2data

In [None]:
# Print the summary statistics of the cleaned truck 2 frame as plain text.
print(truck2data.describe())

### Concatenation of Truck1 and Truck2 Data

In [None]:
# Stack the truck 1 rows on top of the truck 2 rows in a single frame.
# sort=False asks pandas not to sort the columns when the two frames' columns differ.
df = pd.concat([truck1data, truck2data], sort=False)

In [None]:
# Preview the first rows of the concatenated frame.
df.head()

In [None]:
# Summary statistics of the concatenated frame, shown as the cell output.
df.describe()

In [None]:
# Same summary statistics as the previous cell, printed as plain text.
print(df.describe())

### Group Truck1 by Weight

In [None]:
# Group the cleaned truck 1 rows by their 'Vehicle Weight (kg)' value.
byWeight = truck1data.groupby('Vehicle Weight (kg)')

In [None]:
# Show the first rows of each truck 1 weight group.
byWeight.head()

In [None]:
# Print per-weight-group summary statistics for truck 1 as plain text.
print(byWeight.describe())

### Group Truck2 by Weight

In [None]:
# Group the cleaned truck 2 rows by their 'Vehicle Weight (kg)' value.
# Rows are grouped by default, so the deprecated axis keyword is not passed;
# this matches the truck 1 grouping.
byWeight2 = truck2data.groupby('Vehicle Weight (kg)')

In [None]:
# Show the first rows of each truck 2 weight group.
byWeight2.head()

In [None]:
# Print per-weight-group summary statistics for truck 2 as plain text.
print(byWeight2.describe())

In [None]:
# Rebuild the truck 2 weight grouping, then print the rows of every group,
# each followed by blank lines.
byWeight2 = truck2data.groupby('Vehicle Weight (kg)')

for _, weightGroup in byWeight2:
    # Iterating the grouping already yields each group's rows.
    print(weightGroup, "\n\n")

In [None]:
# Rebind df to the truck 1 frame (the same object, not a copy); df no longer
# refers to the concatenated frame from here on.
df = truck1data

This sums every column within each 'Vehicle Weight (kg)' group.

In [None]:
# Sum per weight group. numeric_only=True restricts the sum to numeric columns,
# so string columns such as 'Time (DateTime)' are not concatenated.
df.groupby(['Vehicle Weight (kg)'], as_index=False).sum(numeric_only=True)

In [None]:
# df2 is another name for the truck 2 frame (the same object, not a copy).
df2 = truck2data

This groups by the 'Vehicle Weight' then uses the mean as the new values in the rest of the columns

In [None]:
# Mean per weight group for truck 2. numeric_only=True restricts the mean to
# numeric columns; string columns such as 'Time (DateTime)' cannot be averaged.
df2.groupby(['Vehicle Weight (kg)'], as_index=False).mean(numeric_only=True)

In [None]:
# Mean per weight group for the frame df refers to. numeric_only=True restricts
# the mean to numeric columns; string columns such as 'Time (DateTime)' cannot
# be averaged.
df.groupby(['Vehicle Weight (kg)'], as_index=False).mean(numeric_only=True)

In [None]:
# dftest is another name for the truck 1 frame (the same object, not a copy).
dftest = truck1data

In [None]:
# Mean per weight group for truck 1. numeric_only=True restricts the mean to
# numeric columns; string columns such as 'Time (DateTime)' cannot be averaged.
dftest.groupby(['Vehicle Weight (kg)'], as_index=False).mean(numeric_only=True)

In [None]:
# Parse truck 1's 'Time (DateTime)' column into pandas datetime values.
truckonetype =  pd.to_datetime(truck1data['Time (DateTime)'])

# Show the first 100 parsed values.
truckonetype.head(100)



In [None]:
# Parse truck 2's 'Time (DateTime)' column into pandas datetime values.
truckTwotype =  pd.to_datetime(truck2data['Time (DateTime)'])

# Show the first 100 parsed values.
truckTwotype.head(100)

Pass a truck DataFrame to `divideByDay()` to get its per-day averages.
To average one specific column, pass its exact name as a quoted string in the second argument.
Example:
This line gets the daily averages for all columns:
`print(divideByDay(truck1data))`
This line gets the daily average of speed:
`print(divideByDay(truck1data, "Speed (km/hr)"))`

In [None]:
def divideByDay(truck_df, byday_df=None):
    """Average a truck's readings per day.

    Parameters:
        truck_df: DataFrame with a 'Time (DateTime)' column of strings whose
            date part comes before the first space. The frame is not modified.
        byday_df: optional column name (or list of column names) to average.
            When omitted or empty, every numeric column is averaged.

    Returns:
        The per-day means, indexed by the date strings under the name
        'Time (DateTime)': a Series for a single column name, otherwise a
        DataFrame.
    """
    # Keep only the text before the first space of each timestamp (the date).
    days = truck_df['Time (DateTime)'].str.split(' ').str[0]

    # assign() builds a new frame, so the caller's timestamps stay intact.
    daily = truck_df.assign(**{'Time (DateTime)': days})
    grouped = daily.groupby('Time (DateTime)')

    if byday_df is not None and len(byday_df) > 0:
        return grouped[byday_df].mean()

    # Non-numeric columns cannot be averaged, so they are left out.
    return grouped.mean(numeric_only=True)

Truck 2 daily Average speed 

In [None]:
# Print the per-day mean of 'Speed (km/hr)' for truck 2.
print(divideByDay(truck2data, "Speed (km/hr)"))

Truck 1 daily Average speed 

In [None]:
# Print the per-day mean of 'Speed (km/hr)' for truck 1.
print(divideByDay(truck1data, "Speed (km/hr)"))