## Import Python Libraries

In [2]:
# for generating random variables
import numpy as np

# for plotting
import matplotlib.pyplot as plt 

# for managing data
import pandas as pd

# for plotting
import seaborn as sns 

# for t-tests and ANOVA
import scipy.stats as stats

# for learning algorithms
import sklearn

# for encoding categorical variables
import sklearn.preprocessing as pre

# for splitting into training and test sets
import sklearn.model_selection as mod

%matplotlib inline

## Load the Dataset into Pandas

In [3]:
# load the semicolon-delimited opel_corsa_01 csv file into a dataframe
df = pd.read_csv("opel_corsa_01.csv", delimiter=";")
# preview the first five rows
df.head(5)

Unnamed: 0.1,Unnamed: 0,AltitudeVariation,VehicleSpeedInstantaneous,VehicleSpeedAverage,VehicleSpeedVariance,VehicleSpeedVariation,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,EngineRPM,MassAirFlow,IntakeAirTemperature,VerticalAcceleration,FuelConsumptionAverage,roadSurface,traffic,drivingStyle
0,59,-2.299988,25.670519,13.223501,121.59269,-2.47698,0.3555,4.705883,68,106,1796,15.81,24,-0.1133,19.497335,SmoothCondition,LowCongestionCondition,EvenPaceStyle
1,60,-2.099976,24.094259,13.638919,120.422571,-1.57626,0.4492,10.588236,68,103,1689,14.65,22,-0.1289,19.515722,SmoothCondition,LowCongestionCondition,EvenPaceStyle
2,61,-1.5,22.743179,14.031043,118.456769,-1.35108,0.4258,27.450981,68,103,1599,11.85,21,-0.1328,19.441765,SmoothCondition,LowCongestionCondition,EvenPaceStyle
3,62,0.100037,22.29282,14.171073,117.571308,-0.450359,0.414,24.313726,69,104,1620,12.21,20,-0.0859,19.388769,SmoothCondition,LowCongestionCondition,EvenPaceStyle
4,63,0.099976,23.6439,14.328954,117.074149,1.35108,0.3945,20.0,69,104,1708,11.91,21,-0.0664,19.301638,SmoothCondition,LowCongestionCondition,EvenPaceStyle


In [4]:
# count the missing entries in each column - the analysis needs a clean dataset
df.isna().sum()

Unnamed: 0                   0
AltitudeVariation            0
VehicleSpeedInstantaneous    0
VehicleSpeedAverage          0
VehicleSpeedVariance         0
VehicleSpeedVariation        0
LongitudinalAcceleration     0
EngineLoad                   0
EngineCoolantTemperature     0
ManifoldAbsolutePressure     0
EngineRPM                    0
MassAirFlow                  0
IntakeAirTemperature         0
VerticalAcceleration         0
FuelConsumptionAverage       0
roadSurface                  0
traffic                      0
drivingStyle                 0
dtype: int64

There are no missing values in any of the columns.

In [5]:
# list the dtype pandas assigned to each column of the loaded dataframe
df.dtypes

Unnamed: 0                     int64
AltitudeVariation            float64
VehicleSpeedInstantaneous    float64
VehicleSpeedAverage          float64
VehicleSpeedVariance         float64
VehicleSpeedVariation        float64
LongitudinalAcceleration     float64
EngineLoad                   float64
EngineCoolantTemperature       int64
ManifoldAbsolutePressure       int64
EngineRPM                      int64
MassAirFlow                  float64
IntakeAirTemperature           int64
VerticalAcceleration         float64
FuelConsumptionAverage       float64
roadSurface                   object
traffic                       object
drivingStyle                  object
dtype: object

The columns hold a mixture of data types: integers, floats and objects.

In [26]:
# build the working dataframe: leave out the 'Unnamed: 0' column and the
# roadSurface / traffic output columns, keeping every other column
corsa1 = df.drop(columns=['Unnamed: 0', 'roadSurface', 'traffic'])
# preview the first five rows of corsa1
corsa1.head(5)

Unnamed: 0,AltitudeVariation,VehicleSpeedInstantaneous,VehicleSpeedAverage,VehicleSpeedVariance,VehicleSpeedVariation,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,EngineRPM,MassAirFlow,IntakeAirTemperature,VerticalAcceleration,FuelConsumptionAverage,drivingStyle
0,-2.299988,25.670519,13.223501,121.59269,-2.47698,0.3555,4.705883,68,106,1796,15.81,24,-0.1133,19.497335,EvenPaceStyle
1,-2.099976,24.094259,13.638919,120.422571,-1.57626,0.4492,10.588236,68,103,1689,14.65,22,-0.1289,19.515722,EvenPaceStyle
2,-1.5,22.743179,14.031043,118.456769,-1.35108,0.4258,27.450981,68,103,1599,11.85,21,-0.1328,19.441765,EvenPaceStyle
3,0.100037,22.29282,14.171073,117.571308,-0.450359,0.414,24.313726,69,104,1620,12.21,20,-0.0859,19.388769,EvenPaceStyle
4,0.099976,23.6439,14.328954,117.074149,1.35108,0.3945,20.0,69,104,1708,11.91,21,-0.0664,19.301638,EvenPaceStyle


In [25]:
# count how many rows fall into each category of the drivingStyle variable
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
corsa1['drivingStyle'].value_counts()

EvenPaceStyle      5751
AggressiveStyle    1287
Name: drivingStyle, dtype: int64

The drivingStyle variable has two categories.

In [24]:
# show the one hot encoding of the drivingStyle column: one indicator column per
# category, named with the 'drivingStyle' prefix (displayed only, not stored)
pd.get_dummies(corsa1.drivingStyle, prefix='drivingStyle')

Unnamed: 0,drivingStyle_AggressiveStyle,drivingStyle_EvenPaceStyle
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
7033,0,1
7034,0,1
7035,0,1
7036,0,1


### Descriptive Statistics
Now that the dataset has been loaded into a pandas DataFrame, I will analyse it further to learn more about it. The pandas `describe` function produces descriptive statistics for the dataset: the count, mean, standard deviation, minimum, 25th percentile (first quartile), 50th percentile (median), 75th percentile (third quartile) and maximum are produced for each of the 14 numeric variables.

In [27]:
# summarise each numeric column with describe(), then transpose the result so
# that every variable is a row and every statistic is a column
corsa1.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AltitudeVariation,7038.0,-0.675845,1.691601,-9.200012,-1.5,-0.399963,0.100037,5.200012
VehicleSpeedInstantaneous,7038.0,36.428319,32.901312,0.0,8.782019,29.273399,54.043198,124.749725
VehicleSpeedAverage,7038.0,36.723932,29.366391,0.0,16.698035,28.312631,47.595544,121.330733
VehicleSpeedVariance,7038.0,213.004353,205.717663,0.0,54.333652,144.864363,299.377339,1051.789888
VehicleSpeedVariation,7038.0,-0.029563,2.390997,-17.789218,-0.900722,0.0,0.900721,12.384899
LongitudinalAcceleration,7038.0,0.14353,0.744697,-2.38,-0.3398,0.1408,0.6836,2.36
EngineLoad,7038.0,26.487416,19.46275,0.0,13.725491,25.490196,34.901962,100.0
EngineCoolantTemperature,7038.0,77.924979,7.076616,40.0,79.0,80.0,81.0,85.0
ManifoldAbsolutePressure,7038.0,116.234157,20.660674,98.0,102.0,109.0,122.0,252.0
EngineRPM,7038.0,1569.145354,551.406613,752.0,936.0,1659.5,2033.0,3104.0


It is difficult to obtain a clear picture of the dataset from the raw descriptive statistics alone - they are not very informative in this state. Visualisation tools provide a much better way to analyse the dataset and give a better understanding of the data.

### Visualisations

Visual tools such as histograms, boxplots and scatterplots make it easier to visualise the distribution of variables.



In [28]:
# list the column names of corsa1
corsa1.columns.tolist()

['AltitudeVariation',
 'VehicleSpeedInstantaneous',
 'VehicleSpeedAverage',
 'VehicleSpeedVariance',
 'VehicleSpeedVariation',
 'LongitudinalAcceleration',
 'EngineLoad',
 'EngineCoolantTemperature',
 'ManifoldAbsolutePressure',
 'EngineRPM',
 'MassAirFlow',
 'IntakeAirTemperature',
 'VerticalAcceleration',
 'FuelConsumptionAverage',
 'drivingStyle']

In [29]:
# compute the pairwise Spearman rank correlation between the numeric variables.
# corsa1 still contains the string column drivingStyle, which corr() cannot
# handle on recent pandas versions, so only the numeric columns are selected.
corsa1.select_dtypes(include='number').corr(method='spearman')

Unnamed: 0,AltitudeVariation,VehicleSpeedInstantaneous,VehicleSpeedAverage,VehicleSpeedVariance,VehicleSpeedVariation,LongitudinalAcceleration,EngineLoad,EngineCoolantTemperature,ManifoldAbsolutePressure,EngineRPM,MassAirFlow,IntakeAirTemperature,VerticalAcceleration,FuelConsumptionAverage
AltitudeVariation,1.0,-0.319551,-0.250295,-0.014411,0.030279,-0.039759,0.053917,-0.033505,-0.221076,-0.250762,-0.245479,-0.062743,0.040391,0.169858
VehicleSpeedInstantaneous,-0.319551,1.0,0.705998,0.002011,0.079243,-0.229822,-0.02017,0.208261,0.780221,0.846458,0.8126,0.197765,0.250414,-0.453682
VehicleSpeedAverage,-0.250295,0.705998,1.0,0.086737,-0.083996,0.168101,0.012007,0.074315,0.468038,0.503367,0.508949,0.009103,-0.132582,-0.654158
VehicleSpeedVariance,-0.014411,0.002011,0.086737,1.0,0.040557,-0.094234,-0.04435,0.033921,0.031496,0.037355,0.059351,-0.11639,0.060504,-0.11799
VehicleSpeedVariation,0.030279,0.079243,-0.083996,0.040557,1.0,-0.293549,0.422436,-0.023603,0.419781,0.288914,0.315501,0.198793,0.250819,0.030391
LongitudinalAcceleration,-0.039759,-0.229822,0.168101,-0.094234,-0.293549,1.0,-0.082985,0.089289,-0.288621,-0.320015,-0.295146,-0.12328,-0.848036,-0.129249
EngineLoad,0.053917,-0.02017,0.012007,-0.04435,0.422436,-0.082985,1.0,0.036234,0.252521,0.058981,0.142899,0.148695,0.075072,-0.022496
EngineCoolantTemperature,-0.033505,0.208261,0.074315,0.033921,-0.023603,0.089289,0.036234,1.0,0.18378,0.173078,0.125216,0.182893,0.023757,-0.206247
ManifoldAbsolutePressure,-0.221076,0.780221,0.468038,0.031496,0.419781,-0.288621,0.252521,0.18378,1.0,0.855756,0.926622,0.175393,0.312765,-0.286088
EngineRPM,-0.250762,0.846458,0.503367,0.037355,0.288914,-0.320015,0.058981,0.173078,0.855756,1.0,0.876982,0.297947,0.319261,-0.332852
