# Dummy Variables and Multi-collinearity

Nipun Batra  
2024-01-27

<figure>
<a
href="https://colab.research.google.com/github/nipunbatra/ml-teaching/blob/master/notebooks/dummy-variables-multi-colinearity.ipynb"><img
src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
<figcaption>Open In Colab</figcaption>
</figure>

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np

In [2]:
x1 = np.array([1, 2, 3])
x2 = 2*x1

y = np.array([4, 6, 8])

In [3]:
all_ones = np.ones(x1.shape[0])
X = np.array([all_ones, x1, x2]).T

In [4]:
X.shape

(3, 3)

In [5]:
X

array([[1., 1., 2.],
       [1., 2., 4.],
       [1., 3., 6.]])

In [6]:
def solve_normal_equation(X, y):
    try:
        theta = np.linalg.inv(X.T @ X) @ X.T @ y
        return theta
    except np.linalg.LinAlgError:
        print('The matrix is singular')
        print("X.T @ X = \n", X.T @ X)
        return None
    
### Assignment question: Use np.linalg.solve instead of inv. Why is this better?

In [7]:
solve_normal_equation(X, y)

The matrix is singular
X.T @ X = 
 [[ 3.  6. 12.]
 [ 6. 14. 28.]
 [12. 28. 56.]]

In [8]:
np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T @ X)

(2, 2)

In [9]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

data = np.array([x1, x2]).T

lr.fit(data, y)
lr.coef_, lr.intercept_


# Assignment question: figure why sklearn is able to solve the problem

(array([0.4, 0.8]), 2.0)

In [10]:
# Regularization

eps = 1e-5
X = np.array([all_ones, x1, x2]).T
X = np.eye(3)*eps + X
X

array([[1.00001, 1.     , 2.     ],
       [1.     , 2.00001, 4.     ],
       [1.     , 3.     , 6.00001]])

In [11]:
np.linalg.matrix_rank(X)

3

In [12]:
solve_normal_equation(X, y)

array([2.00023248, 1.19987743, 0.40001887])

In [13]:
# Drop variables
X = np.array([all_ones, x1]).T
print(X)

[[1. 1.]
 [1. 2.]
 [1. 3.]]

In [14]:
solve_normal_equation(X, y)

array([2., 2.])

In [15]:
# Dummy variables

## dataset
num_records = 12
windspeed = np.random.randint(0, 10, num_records)
vehicles = np.random.randint(100, 500, num_records)
direction = np.random.choice(['N', 'S', 'E', 'W'], num_records)
pollution = np.random.randint(0, 100, num_records)

df = pd.DataFrame({'windspeed': windspeed, 'vehicles': vehicles, 'direction': direction, 'pollution': pollution})
df

In [16]:
def fit_data(df, X, y):
    try:
        lr = LinearRegression()
        lr.fit(X, y)
        rep = f"y = {lr.intercept_:0.2f}"
        for i, coef in enumerate(lr.coef_):
            rep += f" + {coef:0.2f}*{df.columns[i]}"
        return rep
    except Exception as e:
        print(e)
        return None
        

In [17]:
fit_data(df, df[df.columns[:-1]], df['pollution'])

could not convert string to float: 'W'

In [18]:
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

In [19]:
enc = OrdinalEncoder()

In [20]:
df2 = df.copy()
df2['direction'] = enc.fit_transform(df[['direction']]).flatten()
df2

In [21]:
fit_data(df2, df2[df2.columns[:-1]], df2['pollution'])

'y = 26.49 + 1.49*windspeed + 0.03*vehicles + 1.02*direction'

In [22]:
pd.Series({x: i for i, x in enumerate(enc.categories_[0])})

E    0
N    1
S    2
W    3
dtype: int64

In [23]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)

In [24]:
direction_ohe = ohe.fit_transform(df[['direction']])
direction_ohe

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [25]:
col_names_ohe = [f"Is it {x}?" for x in enc.categories_[0]]

In [26]:
direction_ohe_df = pd.DataFrame(direction_ohe, columns=col_names_ohe)
direction_ohe_df

In [27]:
# Confirm that we can write Is it W? as a linear combination of the other columns
1-direction_ohe_df[["Is it N?", "Is it S?", "Is it E?"]].sum(axis=1) - direction_ohe_df["Is it W?"]


0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
dtype: float64

In [28]:
X = np.hstack([df[['windspeed', 'vehicles']].values, direction_ohe])

In [29]:
X

array([[  0., 355.,   0.,   0.,   0.,   1.],
       [  2., 367.,   0.,   0.,   1.,   0.],
       [  2., 447.,   0.,   0.,   1.,   0.],
       [  1., 223.,   1.,   0.,   0.,   0.],
       [  1., 272.,   0.,   0.,   1.,   0.],
       [  9., 394.,   0.,   0.,   1.,   0.],
       [  0., 333.,   0.,   1.,   0.,   0.],
       [  3., 308.,   0.,   0.,   0.,   1.],
       [  7., 480.,   0.,   1.,   0.,   0.],
       [  9., 360.,   0.,   1.,   0.,   0.],
       [  0., 125.,   0.,   0.,   1.,   0.],
       [  9., 401.,   0.,   0.,   1.,   0.]])

In [30]:
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])

In [31]:
X_aug

array([[  1.,   0., 355.,   0.,   0.,   0.,   1.],
       [  1.,   2., 367.,   0.,   0.,   1.,   0.],
       [  1.,   2., 447.,   0.,   0.,   1.,   0.],
       [  1.,   1., 223.,   1.,   0.,   0.,   0.],
       [  1.,   1., 272.,   0.,   0.,   1.,   0.],
       [  1.,   9., 394.,   0.,   0.,   1.,   0.],
       [  1.,   0., 333.,   0.,   1.,   0.,   0.],
       [  1.,   3., 308.,   0.,   0.,   0.,   1.],
       [  1.,   7., 480.,   0.,   1.,   0.,   0.],
       [  1.,   9., 360.,   0.,   1.,   0.,   0.],
       [  1.,   0., 125.,   0.,   0.,   1.,   0.],
       [  1.,   9., 401.,   0.,   0.,   1.,   0.]])

In [33]:
X_aug.shape

(12, 7)

In [34]:
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape

(6, 6, (7, 7))

In [35]:
pd.DataFrame(X_aug.T @ X_aug)

In [36]:
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.fit_transform(df[['direction']])


array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [37]:
direction_ohe_n_1 = ohe.fit_transform(df[['direction']])
col_names_ohe_n_1 = [f"Is it {x}?" for x in enc.categories_[0][1:]]
df_ohe_n_1 = pd.DataFrame(direction_ohe_n_1, columns=col_names_ohe_n_1)
df_ohe_n_1

In [38]:
X = np.hstack([df[['windspeed', 'vehicles']].values, df_ohe_n_1.values])
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])

X_aug

array([[  1.,   0., 355.,   0.,   0.,   1.],
       [  1.,   2., 367.,   0.,   1.,   0.],
       [  1.,   2., 447.,   0.,   1.,   0.],
       [  1.,   1., 223.,   0.,   0.,   0.],
       [  1.,   1., 272.,   0.,   1.,   0.],
       [  1.,   9., 394.,   0.,   1.,   0.],
       [  1.,   0., 333.,   1.,   0.,   0.],
       [  1.,   3., 308.,   0.,   0.,   1.],
       [  1.,   7., 480.,   1.,   0.,   0.],
       [  1.,   9., 360.,   1.,   0.,   0.],
       [  1.,   0., 125.,   0.,   1.,   0.],
       [  1.,   9., 401.,   0.,   1.,   0.]])

In [39]:
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape

(6, 6, (6, 6))

In [40]:
# Interepeting dummy variables

## dataset

X = np.array(['F', 'F', 'F', 'M', 'M'])
y = np.array([5, 5.2, 5.4, 5.8, 6])

In [41]:
from sklearn.preprocessing import LabelBinarizer
l = LabelBinarizer()
l.fit_transform(X)

array([[0],
       [0],
       [0],
       [1],
       [1]])

In [42]:
X_binary = 1 - l.fit_transform(X)

In [43]:
X_binary    

array([[1],
       [1],
       [1],
       [0],
       [0]])

In [44]:
lr = LinearRegression()
lr.fit(X_binary, y)

In [45]:
lr.coef_, lr.intercept_

(array([-0.7]), 5.8999999999999995)

In [219]:
y[(X_binary==0).flatten()].mean()

5.9

In [220]:
y[(X_binary==1).flatten()].mean()

5.2