In [2]:
import pandas as pd
import numpy as np
from scipy import stats
pd.options.mode.chained_assignment = None 

Utilities class containing small helper functions used throughout the program

In [3]:
class Utilities:
    """Small stateless helpers shared by the preprocessing code."""

    def isfloat(self, num):
        """Tell whether ``num`` can be converted with ``float()``.

        Args:
            num: Any object (string, number, ``None``, ...).

        Returns:
            bool: ``True`` if ``float(num)`` succeeds, ``False`` otherwise.
        """
        try:
            float(num)
        except (TypeError, ValueError, OverflowError):
            # TypeError covers None / containers, ValueError covers strings
            # such as "-", OverflowError covers integers too large for a float.
            return False
        return True


Preprocessing class containing the data-preprocessing methods

In [4]:
class Preprocessing:
    """In-place preprocessing steps for a DataFrame loaded from a CSV file.

    Every public method takes the *name of a column* of ``self.file`` (the
    parameter is called ``path`` for backward compatibility), updates
    ``self.file`` and then displays the resulting frame.

    All updates assign a whole column at once. Element-wise writes of the form
    ``df[col][i] = value`` are chained assignment: they are not guaranteed to
    reach the frame and they treat the loop counter as an index *label*, which
    fails on any non-default index.
    """

    def __init__(self, path) -> None:
        """Read the CSV file at ``path`` into ``self.file`` and display it."""
        self.file = pd.read_csv(path)
        display(self.file)
        # Kept so code that reaches for ``obj.util`` continues to work.
        self.util = Utilities()

    def Imputation(self, path):
        """Replace missing or non-numeric entries of a column with its mean.

        The column is converted to numeric; anything that cannot be parsed as
        a number (NaN, ``"-"``, ...) is treated as missing and replaced by the
        mean of the remaining values, rounded to 2 decimals.

        Args:
            path: Name of the column to impute.

        Raises:
            ValueError: If the column holds no numeric value to average.
        """
        numeric = pd.to_numeric(self.file[path], errors="coerce")
        if not numeric.notna().any():
            raise ValueError(
                f"Column {path!r} has no numeric values to compute a mean from"
            )
        fill_value = round(float(numeric.mean()), 2)
        self.file[path] = numeric.fillna(fill_value)
        display(self.file)

    def AnomalyDetection(self, path, threshold=3):
        """Replace outliers of a column with the mean of the remaining values.

        A value is an outlier when the absolute value of its z-score
        (population standard deviation, as computed by ``scipy.stats.zscore``)
        exceeds ``threshold``. Outliers are marked as missing and the column
        is then handed to :meth:`Imputation`, which also fills any values that
        were already missing.

        Args:
            path: Name of the column to clean.
            threshold: Absolute z-score above which a value is an outlier.

        Raises:
            ValueError: If no numeric value remains to compute the mean from.
        """
        numeric = pd.to_numeric(self.file[path], errors="coerce").astype(float)
        # nan_policy="omit" keeps existing gaps from turning every z-score
        # into NaN, which would silently disable the detection.
        z = np.abs(stats.zscore(numeric.to_numpy(), nan_policy="omit"))
        outliers = z > threshold  # NaN compares False, so gaps are not flagged
        self.file[path] = numeric.mask(outliers)
        self.Imputation(path)

    def Normalization(self, path):
        """Min-max scale a column to the range [0, 1], rounded to 2 decimals.

        A constant column (max == min) is mapped to 0.0 instead of dividing
        by zero. Missing values stay missing.

        Args:
            path: Name of the column to normalize.

        Raises:
            ValueError: If the column contains values that are not numeric.
        """
        values = self.file[path].astype(float)
        low = values.min()
        span = values.max() - low
        if span == 0:
            # Multiplying (rather than assigning 0.0) keeps NaN entries as NaN.
            scaled = values * 0.0
        else:
            scaled = (values - low) / span
        self.file[path] = scaled.round(2)
        display(self.file)

    def Encoding(self, path):
        """One-hot encode a Yes/No column into 0/1 columns 'Yes' and 'No'.

        Rows whose value is neither ``'Yes'`` nor ``'No'`` (for example a
        missing value) get 0 in both indicator columns.

        Args:
            path: Name of the column holding the ``'Yes'`` / ``'No'`` labels.
        """
        labels = self.file[path]
        self.file['Yes'] = (labels == 'Yes').astype('int64')
        self.file['No'] = (labels == 'No').astype('int64')
        display(self.file)

    def Standardization(self, path):
        """Add a ``'<column>-zScore'`` column with standardized values.

        z = (x - mean) / std, rounded to 2 decimals, where ``std`` is the
        pandas default (sample standard deviation, ``ddof=1``). Note that this
        differs from :meth:`AnomalyDetection`, which uses ``ddof=0``.

        Args:
            path: Name of the column to standardize.

        Raises:
            ValueError: If the column contains values that are not numeric.
        """
        values = self.file[path].astype(float)
        z_scores = (values - values.mean()) / values.std()
        self.file[path + '-zScore'] = z_scores.round(2)
        display(self.file)
        
                


Driver code for the program

Create the object with the file path

In [5]:
# Load the travel-times CSV (path is relative to the notebook) and display the frame.
obj = Preprocessing('./travel-times.csv')

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All,Comments
0,1/6/2012,16:37,Friday,Home,51.29,127.4,78.3,84.8,,39.3,36.3,No,
1,1/6/2012,08:20,Friday,GSK,51.63,130.3,81.8,88.9,,37.9,34.9,No,
2,1/4/2012,16:17,Wednesday,Home,51.27,127.4,82.0,85.8,,37.5,35.9,No,
3,1/4/2012,07:53,Wednesday,GSK,49.17,132.3,74.2,82.9,,39.8,35.6,No,
4,1/3/2012,18:57,Tuesday,Home,51.15,136.2,83.4,88.1,,36.8,34.8,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,7/18/2011,08:09,Monday,GSK,54.52,125.6,49.9,82.4,7.89,65.5,39.7,No,
201,7/14/2011,08:03,Thursday,GSK,50.90,123.7,76.2,95.1,7.89,40.1,32.1,Yes,
202,7/13/2011,17:08,Wednesday,Home,51.96,132.6,57.5,76.7,,54.2,40.6,Yes,
203,7/12/2011,17:51,Tuesday,Home,53.28,125.8,61.6,87.6,,51.9,36.5,Yes,


In [6]:
# Fill missing / non-numeric FuelEconomy entries with the column mean (rounded to 2 decimals).
obj.Imputation('FuelEconomy')

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All,Comments
0,1/6/2012,16:37,Friday,Home,51.29,127.4,78.3,84.8,8.69,39.3,36.3,No,
1,1/6/2012,08:20,Friday,GSK,51.63,130.3,81.8,88.9,8.69,37.9,34.9,No,
2,1/4/2012,16:17,Wednesday,Home,51.27,127.4,82.0,85.8,8.69,37.5,35.9,No,
3,1/4/2012,07:53,Wednesday,GSK,49.17,132.3,74.2,82.9,8.69,39.8,35.6,No,
4,1/3/2012,18:57,Tuesday,Home,51.15,136.2,83.4,88.1,8.69,36.8,34.8,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,7/18/2011,08:09,Monday,GSK,54.52,125.6,49.9,82.4,7.89,65.5,39.7,No,
201,7/14/2011,08:03,Thursday,GSK,50.90,123.7,76.2,95.1,7.89,40.1,32.1,Yes,
202,7/13/2011,17:08,Wednesday,Home,51.96,132.6,57.5,76.7,8.69,54.2,40.6,Yes,
203,7/12/2011,17:51,Tuesday,Home,53.28,125.8,61.6,87.6,8.69,51.9,36.5,Yes,


In [7]:
# Replace Distance outliers (|z-score| > 3) with the mean of the remaining values.
obj.AnomalyDetection('Distance')

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All,Comments
0,1/6/2012,16:37,Friday,Home,51.29,127.4,78.3,84.8,8.69,39.3,36.3,No,
1,1/6/2012,08:20,Friday,GSK,51.63,130.3,81.8,88.9,8.69,37.9,34.9,No,
2,1/4/2012,16:17,Wednesday,Home,51.27,127.4,82.0,85.8,8.69,37.5,35.9,No,
3,1/4/2012,07:53,Wednesday,GSK,49.17,132.3,74.2,82.9,8.69,39.8,35.6,No,
4,1/3/2012,18:57,Tuesday,Home,51.15,136.2,83.4,88.1,8.69,36.8,34.8,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,7/18/2011,08:09,Monday,GSK,54.52,125.6,49.9,82.4,7.89,65.5,39.7,No,
201,7/14/2011,08:03,Thursday,GSK,50.9,123.7,76.2,95.1,7.89,40.1,32.1,Yes,
202,7/13/2011,17:08,Wednesday,Home,51.96,132.6,57.5,76.7,8.69,54.2,40.6,Yes,
203,7/12/2011,17:51,Tuesday,Home,53.28,125.8,61.6,87.6,8.69,51.9,36.5,Yes,


In [8]:
# Min-max scale MaxSpeed to the range [0, 1], rounded to 2 decimals.
obj.Normalization('MaxSpeed')

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All,Comments
0,1/6/2012,16:37,Friday,Home,51.29,0.53,78.3,84.8,8.69,39.3,36.3,No,
1,1/6/2012,08:20,Friday,GSK,51.63,0.63,81.8,88.9,8.69,37.9,34.9,No,
2,1/4/2012,16:17,Wednesday,Home,51.27,0.53,82.0,85.8,8.69,37.5,35.9,No,
3,1/4/2012,07:53,Wednesday,GSK,49.17,0.70,74.2,82.9,8.69,39.8,35.6,No,
4,1/3/2012,18:57,Tuesday,Home,51.15,0.84,83.4,88.1,8.69,36.8,34.8,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,7/18/2011,08:09,Monday,GSK,54.52,0.47,49.9,82.4,7.89,65.5,39.7,No,
201,7/14/2011,08:03,Thursday,GSK,50.9,0.40,76.2,95.1,7.89,40.1,32.1,Yes,
202,7/13/2011,17:08,Wednesday,Home,51.96,0.71,57.5,76.7,8.69,54.2,40.6,Yes,
203,7/12/2011,17:51,Tuesday,Home,53.28,0.47,61.6,87.6,8.69,51.9,36.5,Yes,


In [9]:
# Add 0/1 indicator columns 'Yes' and 'No' derived from Take407All.
obj.Encoding('Take407All')

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All,Comments,Yes,No
0,1/6/2012,16:37,Friday,Home,51.29,0.53,78.3,84.8,8.69,39.3,36.3,No,,0,1
1,1/6/2012,08:20,Friday,GSK,51.63,0.63,81.8,88.9,8.69,37.9,34.9,No,,0,1
2,1/4/2012,16:17,Wednesday,Home,51.27,0.53,82.0,85.8,8.69,37.5,35.9,No,,0,1
3,1/4/2012,07:53,Wednesday,GSK,49.17,0.70,74.2,82.9,8.69,39.8,35.6,No,,0,1
4,1/3/2012,18:57,Tuesday,Home,51.15,0.84,83.4,88.1,8.69,36.8,34.8,No,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,7/18/2011,08:09,Monday,GSK,54.52,0.47,49.9,82.4,7.89,65.5,39.7,No,,0,1
201,7/14/2011,08:03,Thursday,GSK,50.9,0.40,76.2,95.1,7.89,40.1,32.1,Yes,,1,0
202,7/13/2011,17:08,Wednesday,Home,51.96,0.71,57.5,76.7,8.69,54.2,40.6,Yes,,1,0
203,7/12/2011,17:51,Tuesday,Home,53.28,0.47,61.6,87.6,8.69,51.9,36.5,Yes,,1,0


In [10]:
# Add a 'TotalTime-zScore' column holding the standardized TotalTime values.
obj.Standardization('TotalTime')

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All,Comments,Yes,No,TotalTime-zScore
0,1/6/2012,16:37,Friday,Home,51.29,0.53,78.3,84.8,8.69,39.3,36.3,No,,0,1,-0.38
1,1/6/2012,08:20,Friday,GSK,51.63,0.63,81.8,88.9,8.69,37.9,34.9,No,,0,1,-0.58
2,1/4/2012,16:17,Wednesday,Home,51.27,0.53,82.0,85.8,8.69,37.5,35.9,No,,0,1,-0.64
3,1/4/2012,07:53,Wednesday,GSK,49.17,0.70,74.2,82.9,8.69,39.8,35.6,No,,0,1,-0.31
4,1/3/2012,18:57,Tuesday,Home,51.15,0.84,83.4,88.1,8.69,36.8,34.8,No,,0,1,-0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,7/18/2011,08:09,Monday,GSK,54.52,0.47,49.9,82.4,7.89,65.5,39.7,No,,0,1,3.44
201,7/14/2011,08:03,Thursday,GSK,50.9,0.40,76.2,95.1,7.89,40.1,32.1,Yes,,1,0,-0.26
202,7/13/2011,17:08,Wednesday,Home,51.96,0.71,57.5,76.7,8.69,54.2,40.6,Yes,,1,0,1.80
203,7/12/2011,17:51,Tuesday,Home,53.28,0.47,61.6,87.6,8.69,51.9,36.5,Yes,,1,0,1.46
