In [None]:
# https://youtu.be/x08AN87G0mg?si=RyMj8-EFKiuAekFa

In [1]:
import numpy as np
import pandas as pd

In [3]:
dataset = pd.read_csv('futuristic_city_traffic.csv')
dataset.head(5)

Unnamed: 0,City,Vehicle Type,Weather,Economic Condition,Day Of Week,Hour Of Day,Speed,Is Peak Hour,Random Event Occurred,Energy Consumption,Traffic Density
0,SolarisVille,Drone,Snowy,Stable,Sunday,20,29.4268,0,0,14.7134,0.5241
1,AquaCity,Flying Car,Solar Flare,Recession,Wednesday,2,118.8,0,0,143.5682,0.3208
2,Neuroburg,Autonomous Vehicle,Solar Flare,Recession,Wednesday,16,100.3904,0,0,91.264,0.0415
3,Ecoopolis,Drone,Clear,Booming,Thursday,8,76.8,1,0,46.0753,0.1811
4,AquaCity,Autonomous Vehicle,Solar Flare,Stable,Saturday,16,45.2176,0,0,40.1934,0.4544


In [4]:
dataset.shape

(1219567, 11)

In [5]:
# Renaming the columns
dataset = dataset.rename(columns={'Vehicle Type':'Vehicle_Type','Day Of Week':'Day_Of_Week','Is Peak Hour':'Is_Peak_Hour','Random Event Occurred':'Random_Event_Occurred', 'Economic Condition':'Economic_Condition', 'Hour Of Day':'Hour_Of_Day', 'Energy Consumption':'Energy_Consumption', 'Traffic Density':'Traffic_Density'})

dataset.head(5)

Unnamed: 0,City,Vehicle_Type,Weather,Economic_Condition,Day_Of_Week,Hour_Of_Day,Speed,Is_Peak_Hour,Random_Event_Occurred,Energy_Consumption,Traffic_Density
0,SolarisVille,Drone,Snowy,Stable,Sunday,20,29.4268,0,0,14.7134,0.5241
1,AquaCity,Flying Car,Solar Flare,Recession,Wednesday,2,118.8,0,0,143.5682,0.3208
2,Neuroburg,Autonomous Vehicle,Solar Flare,Recession,Wednesday,16,100.3904,0,0,91.264,0.0415
3,Ecoopolis,Drone,Clear,Booming,Thursday,8,76.8,1,0,46.0753,0.1811
4,AquaCity,Autonomous Vehicle,Solar Flare,Stable,Saturday,16,45.2176,0,0,40.1934,0.4544


In [7]:
# extracting the independenet variables 
x = dataset[['Vehicle_Type','Weather','Economic_Condition','Day_Of_Week','Hour_Of_Day','Speed','Is_Peak_Hour','Random_Event_Occurred','Energy_Consumption']].values
x

array([['Drone', 'Snowy', 'Stable', ..., 0, 0, 14.7134],
       ['Flying Car', 'Solar Flare', 'Recession', ..., 0, 0, 143.5682],
       ['Autonomous Vehicle', 'Solar Flare', 'Recession', ..., 0, 0,
        91.264],
       ...,
       ['Autonomous Vehicle', 'Snowy', 'Booming', ..., 0, 0, 61.8568],
       ['Autonomous Vehicle', 'Clear', 'Booming', ..., 0, 0, 85.7116],
       ['Drone', 'Electromagnetic Storm', 'Stable', ..., 0, 1, 49.5233]],
      dtype=object)

In [9]:
# extracting the dependenet variable
y = dataset[['Traffic_Density']].values
y

array([[0.5241],
       [0.3208],
       [0.0415],
       ...,
       [0.0564],
       [0.5079],
       [0.3182]])

In [10]:
# Handling the missing data

from sklearn.impute import SimpleImputer # Replace missing values using a descriptive statistic (e.g. mean, median, or most frequent) along each column, or using a constant value.

In [11]:
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [24]:
imputer = imputer.fit(y[:, 0:1])
y[:, 0:1] = imputer.transform(y[:, 0:1])
y

array([[0.5241],
       [0.3208],
       [0.0415],
       ...,
       [0.0564],
       [0.5079],
       [0.3182]])

In [25]:
imputer = imputer.fit(x[:, 5:10])

In [26]:
x[:, 5:10] = imputer.transform(x[:, 5:10])
x

array([['Drone', 'Snowy', 'Stable', ..., 0.0, 0.0, 14.7134],
       ['Flying Car', 'Solar Flare', 'Recession', ..., 0.0, 0.0,
        143.5682],
       ['Autonomous Vehicle', 'Solar Flare', 'Recession', ..., 0.0, 0.0,
        91.264],
       ...,
       ['Autonomous Vehicle', 'Snowy', 'Booming', ..., 0.0, 0.0, 61.8568],
       ['Autonomous Vehicle', 'Clear', 'Booming', ..., 0.0, 0.0, 85.7116],
       ['Drone', 'Electromagnetic Storm', 'Stable', ..., 0.0, 1.0,
        49.5233]], dtype=object)

In [27]:
# Converting the string data into numerical values
# And removing the missing values from the attributes which contains string values

from sklearn.preprocessing import LabelEncoder

In [28]:
label_encoder_x = LabelEncoder()

In [31]:
x[:, 0] = label_encoder_x.fit_transform(x[:, 0])
x[:, 1] = label_encoder_x.fit_transform(x[:, 1])
x[:, 2] = label_encoder_x.fit_transform(x[:, 2])
x[:, 3] = label_encoder_x.fit_transform(x[:, 3])
x[:, 4] = label_encoder_x.fit_transform(x[:, 4])
x

array([[2, 3, 2, ..., 0.0, 0.0, 14.7134],
       [3, 4, 1, ..., 0.0, 0.0, 143.5682],
       [0, 4, 1, ..., 0.0, 0.0, 91.264],
       ...,
       [0, 3, 0, ..., 0.0, 0.0, 61.8568],
       [0, 0, 0, ..., 0.0, 0.0, 85.7116],
       [2, 1, 2, ..., 0.0, 1.0, 49.5233]], dtype=object)

In [40]:
# Creating dummy variables 
# It is used to create for the string values which we have converted into the numaric values so that the ML model can understand that it is a categorical value and no other value is greater than some other value. It is in the form of (0 and 1).

# eg in the case of above dataset by converting the string value into numerical value 'Drone is 1','snoy is 2' and so on. So the ML model con understand the snoy has higher priority than Drone, which leads to inefficent results. Hence we create the dummy variables.

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()

In [43]:
# in case of single attribute
# onehotencoder.fit_transform(dataset.City.values.reshape(-1,1)).toarray()

onehotencoder.fit_transform(x[:, 0:4]).toarray()  # in case of multiple attributes.

array([[1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 1., 0.]])

In [44]:
# Splitting the dataset for training and testing the model

from sklearn.model_selection import train_test_split


In [45]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.7, random_state = 0)

In [46]:
x_train

array([[1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       ...,
       [0., 1., 1., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.]])

In [47]:
x_test

array([[0., 1., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       ...,
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.]])

In [48]:
y_train

array([[0.232 ],
       [0.5677],
       [0.0807],
       ...,
       [0.2229],
       [0.1337],
       [0.2276]])

In [49]:
y_test

array([[0.2827],
       [0.1855],
       [0.2825],
       ...,
       [0.3608],
       [0.0174],
       [0.2884]])

In [50]:
# Scaling features

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

In [52]:
x_train


array([[ 0.78221578, -0.78221578, -0.78221578, ...,  0.26247895,
         0.26247895, -0.26247895],
       [ 0.78221578, -0.78221578, -0.78221578, ...,  0.26247895,
         0.26247895, -0.26247895],
       [ 0.78221578, -0.78221578, -0.78221578, ...,  0.26247895,
         0.26247895, -0.26247895],
       ...,
       [-1.27841962,  1.27841962,  1.27841962, ...,  0.26247895,
         0.26247895, -0.26247895],
       [-1.27841962,  1.27841962,  1.27841962, ...,  0.26247895,
         0.26247895, -0.26247895],
       [-1.27841962,  1.27841962,  1.27841962, ...,  0.26247895,
         0.26247895, -0.26247895]])

In [53]:
x_test

array([[-1.27841962,  1.27841962,  1.27841962, ..., -3.80982932,
        -3.80982932,  3.80982932],
       [ 0.78221578, -0.78221578, -0.78221578, ...,  0.26247895,
         0.26247895, -0.26247895],
       [ 0.78221578, -0.78221578, -0.78221578, ...,  0.26247895,
         0.26247895, -0.26247895],
       ...,
       [ 0.78221578, -0.78221578, -0.78221578, ...,  0.26247895,
         0.26247895, -0.26247895],
       [ 0.78221578, -0.78221578, -0.78221578, ...,  0.26247895,
         0.26247895, -0.26247895],
       [-1.27841962,  1.27841962,  1.27841962, ...,  0.26247895,
         0.26247895, -0.26247895]])