In [3]:
# Pre-processing:
# It is the process of converting or mapping data from one “raw” form into another format for further analysis.
# It involves:
# > Identify and handle missing values.
# > Data Formatting
# > Data Normalization
# > Data Binning
# > Turning categorical values into numerical values

In [31]:
import pandas as pd
import numpy as np
# Column labels for the UCI "imports-85" automobile file; the read below uses header=None,
# i.e. the file's first line is treated as data, so the labels are supplied here.
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base",
    "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
# names= attaches the column labels while reading instead of assigning df.columns afterwards.
df = pd.read_csv(url, header=None, names=headers)
# Preview the first five rows.
df.head(5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [32]:
# pandas operations are normally applied column-wise; every row of a column is one sample,
# i.e. a different used car in the database.

# Select a single column by its name. Only the final expression of a cell is rendered,
# so this preview is evaluated but not shown.
df["symboling"].head()
# Arithmetic on a column is element-wise: this returns a new Series with 1 added to every
# "symboling" entry and does not modify df.
df["symboling"].add(1)

0      4
1      4
2      2
3      3
4      3
5      3
6      2
7      2
8      2
9      1
10     3
11     1
12     1
13     1
14     2
15     1
16     1
17     1
18     3
19     2
20     1
21     2
22     2
23     2
24     2
25     2
26     2
27     2
28     0
29     4
      ..
175    0
176    0
177    0
178    4
179    4
180    0
181    0
182    3
183    3
184    3
185    3
186    3
187    3
188    3
189    4
190    4
191    1
192    1
193    1
194   -1
195    0
196   -1
197    0
198   -1
199    0
200    0
201    0
202    0
203    0
204    0
Name: symboling, Length: 205, dtype: int64

In [33]:
# Dealing with missing values:
# Missing values occur when no data value is stored for a feature or for a particular observation.
# Usually missing values in a dataset appear as “?”, “N/A”, 0 or just a blank cell.

# We can deal with missing data in one of many ways:
# > The first is to check if the person or group that collected the data can go back and find what the actual value should be.
# > Another possibility is just to remove the data where that missing value is found. When you drop data, you can either
# drop the whole variable or just the single data entry with the missing value.

# If you’re removing data, you want to look to do something that has the least amount of impact. Replacing data is better, 
# since no data is wasted. However, it is less accurate since we need to replace missing data with a guess of what the data
# should be.

# One standard replacement technique is to replace missing values by the average value of the entire variable.
# In cases where the average is not possible, as with categorical variables, we can try using the mode - the most common.
# We can replace the missing data using other functions too. This is usually because the data gatherer knows something 
# additional about the missing data.
# Finally, in some cases we can leave the missing data as it is.

In [34]:
# Dropping missing values:
# axis=0 - drops the entire row
# axis=1 - drops the entire column
# subset - the column(s) in which to look for missing values.
# inplace=True - applies the change to the existing df itself.

# dropna() only recognises np.nan / pd.NaT, but this dataset marks missing values with "?",
# so the markers are converted to NaN first.
df.replace("?", np.nan, inplace=True)
# Remove every row that has no "price" value.
df.dropna(subset=["price"], axis=0, inplace=True)

# Replace with mean:
# The missing values in the "normalized-losses" column are replaced by the column mean.
# The column is read as type "object", so it is converted to "float" before averaging.
df["normalized-losses"] = df["normalized-losses"].astype("float")
nlMean = df["normalized-losses"].mean()  # mean() skips NaN entries by default
# Fill this column only. A frame-wide df.replace(np.nan, nlMean) would also write the
# normalized-losses mean into any other column that still contains NaN.
df["normalized-losses"] = df["normalized-losses"].fillna(nlMean)

In [35]:
# Data Formatting:
# Data formatting means bringing data into a common standard of expression that allows users to make meaningful comparisons.
# As a part of dataset cleaning, data formatting ensures that data is consistent and easily understandable.

# The "city-mpg" feature holds fuel consumption in miles per gallon. To express it in the
# metric unit L/100km, 235 is divided by each mpg value.
# rdiv(235) computes 235 / value for every entry of the column.
df["city-mpg"] = df["city-mpg"].rdiv(235)
# Relabel the converted column.
# NOTE(review): the values are L/100km, while the label reads "city-L/Km"; the label is kept
# as-is here so that anything referring to it keeps working.
df.rename({"city-mpg": "city-L/Km"}, axis="columns", inplace=True)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-L/Km,highway-mpg,price
0,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,11.190476,27,13495
1,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,11.190476,27,16500
2,1,122,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,12.368421,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,9.791667,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,13.055556,22,17450


In [36]:
# Data Normalisation:
# More often than not, some of the attributes in our dataset may vary in range. 
# Ex., in the used car data set, the feature “length” ranges from 150 to 250, while feature “width” and “height” ranges 
# from 50 to 100. The varying scale affects the impact of these attributes.
# +--------+-------------+------------+------------+
# |  Scale | [150 - 250] | [50 - 100] | [50 - 100] |
# +--------+-------------+------------+------------+
# | Impact |    Large    |    Small   |    Small   |
# +--------+-------------+------------+------------+
# We may want to normalize these variables so that the range of the values is consistent. This normalization can make some 
# statistical analyses easier down the road.
# By making the ranges consistent between variables, normalization enables a fairer comparison between the different 
# features. Making sure they have the same impact, it is also important for computational reasons.

# There are several ways to normalise data. 3 of those are:
# > Simple feature scaling:
# It just divides each value by the maximum value for that feature. x_new = x_old / x_max
# This makes the new values range between 0 and 1.
# > Min-Max:
# Takes each value, x_old, subtracts the minimum value of that feature, then divides by the range of that feature.
# x_new = (x_old - x_min) / (x_max - x_min)
# Again, the resulting new values range between 0 and 1.
# > Z-score (Standard score):
# For each value, you subtract the Mu which is the average of the feature, and then divide by the standard deviation (sigma).
# x_new = (x_old - mu) / sigma
# The resulting values hover around 0, and typically range between -3 and +3, but can be higher or lower.

# The three normalisation methods applied to the "length" column of df:
# simple feature scaling: x / x_max
lenSF = df["length"].div(df["length"].max())
# min-max scaling: (x - x_min) / (x_max - x_min)
lenMM = df["length"].sub(df["length"].min()).div(df["length"].max() - df["length"].min())
# z-score scaling: (x - mean) / standard deviation
lenZ = df["length"].sub(df["length"].mean()).div(df["length"].std())

In [38]:
# Binning:
# Binning is when you group values together into bins. For example, you can bin “age” into [0 to 5], 
# [6 to 10], [11 to 15] and so on.
# Converts numeric into categorical variables i.e., group a set of numeric values into bins.
# Sometimes, binning can improve accuracy of the predictive models.

# In the car dataset, "price" is a numerical variable ranging from 5118 to 45400.
# We categorize it into 3 bins: low, medium, and high-priced cars.
# First the type of price is changed from object to float.
df["price"] = df["price"].astype("float")
# 3 bins of equal width need 4 equally spaced dividers, which linspace() provides.
# pd.cut() requires the bin edges in increasing order, so they run from min to max;
# passing them from max to min is what made cut() raise a ValueError.
bins = np.linspace(df["price"].min(), df["price"].max(), 4)
# bins -> array([5118., 18545.33333333, 31972.66666667, 45400.])
binNames = ["Low", "Medium", "High"]
# pandas' cut() assigns each price to its bin and labels it with the matching name.
# include_lowest=True makes the first interval include its left edge, so the cheapest car
# falls into "Low" instead of becoming NaN.
df["price-binned"] = pd.cut(df["price"], bins, labels=binNames, include_lowest=True)

In [41]:
# Turning categorical variables into quantitative variables:
# Most statistical models cannot take objects or strings as input; for model training they accept only numbers.
# In the car dataset, the "fuel-type" feature is a categorical variable with two values, "gas" or "diesel", which are in string format.
# To convert these variables into some form of numeric format, we encode the values by adding new features corresponding
# to each unique element in the original feature we would like to encode.

# In the case where the feature “Fuel” has two unique values, gas and diesel, we create two new features ‘gas’ and ‘diesel.'
# When a value occurs in the original feature we set the corresponding value to 1 in the new feature; the rest of the 
# features are set to zero. In the fuel example, for car B, the fuel value is diesel. Therefore, we set the feature diesel 
# equal to one and the gas feature to zero. Similarly, for car D the fuel value is gas. Therefore we set the feature gas 
# equal to one and the feature diesel equal to zero. This technique is often called “one-hot encoding”.

# pandas' get_dummies() turns a categorical column into dummy (indicator) columns, one per
# distinct value. pipe() passes the "fuel-type" Series to it; the first five rows are shown.
df["fuel-type"].pipe(pd.get_dummies).head()

Unnamed: 0,diesel,gas
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
