In [None]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np

In [None]:
# Toy data: x2 is an exact multiple of x1, so the two features are perfectly
# collinear.
x1 = np.arange(1, 4)
x2 = x1 * 2

# One target value per sample.
y = np.asarray([4, 6, 8])

In [None]:
# Design matrix: a column of ones followed by x1 and x2, one row per sample.
all_ones = np.ones(len(x1))
X = np.column_stack((all_ones, x1, x2))

In [None]:
X.shape

(3, 3)

In [None]:
X

array([[1., 1., 2.],
       [1., 2., 4.],
       [1., 3., 6.]])

Q2) Use np.linalg.solve instead of np.linalg.inv for the same problem. Compare and contrast their usage: which one is better, and why?

Ans: Here we use np.linalg.solve instead of np.linalg.inv to solve the normal equation.

Using np.linalg.solve instead of np.linalg.inv for solving the normal equation is better. Here are some reasons:

1. Numerical Stability: np.linalg.solve is generally more numerically stable than np.linalg.inv. Forming the inverse explicitly and then multiplying by it adds an extra step in which rounding errors accumulate, and those errors are amplified when the matrix is ill-conditioned (nearly singular). np.linalg.solve avoids this by solving the linear system directly from a factorization of the matrix, without ever forming the inverse.

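2. Efficiency: Both routines have cubic cost in the matrix size, but np.linalg.solve does less work: it factorizes the matrix once and solves only for the given right-hand side, whereas np.linalg.inv has to solve for every column of the identity matrix and still needs a matrix-vector product afterwards. For large matrices, this can result in significant performance improvements.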

3. Memory Usage: np.linalg.inv materializes the full inverse as a new matrix in memory, which can be problematic for large matrices due to memory constraints. np.linalg.solve returns only the solution (the same shape as the right-hand side), so the explicit inverse is never allocated.

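4. Error Handling: np.linalg.solve raises np.linalg.LinAlgError when the matrix is exactly singular, which allows the failure to be caught and reported for debugging. Note that np.linalg.inv raises the same error in that case, and that neither function raises an error for a matrix that is only nearly singular; the result is then simply inaccurate, so checking the rank or condition number is still worthwhile.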

So np.linalg.solve is the better choice, given the points above.

In [None]:
def solve_normal_equation(X, y):
    """Solve the normal equation ``(X.T @ X) theta = X.T @ y`` for ``theta``.

    Returns the solution computed by ``np.linalg.solve``. If NumPy raises
    ``LinAlgError``, a message and the matrix ``X.T @ X`` are printed and
    ``None`` is returned instead.
    """
    try:
        gram = X.T @ X
        moment = X.T @ y
        return np.linalg.solve(gram, moment)
    except np.linalg.LinAlgError:
        # Only the solve call can raise LinAlgError, so `gram` is always
        # bound by the time we get here.
        print('The matrix is singular')
        print("X.T @ X = \n", gram)
        return None


In [None]:
solve_normal_equation(X,y)

The matrix is singular
X.T @ X = 
 [[ 3.  6. 12.]
 [ 6. 14. 28.]
 [12. 28. 56.]]


In [None]:
np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T @ X)

(2, 2)

In [None]:
from sklearn.linear_model import LinearRegression

# Assignment question: figure out why sklearn is able to solve the problem
# even though x2 is an exact multiple of x1.

# Two-column feature matrix (no ones column is added here).
data = np.column_stack((x1, x2))
lr = LinearRegression().fit(data, y)

lr.coef_, lr.intercept_

(array([0.4, 0.8]), 2.0)

In [None]:
# Regularization
# Rebuild the design matrix and add a small eps to its diagonal. Note that
# this perturbs X itself (which is 3x3 here), not X.T @ X.

eps = 1e-5
X = np.column_stack((all_ones, x1, x2)) + eps * np.eye(3)
X

array([[1.00001, 1.     , 2.     ],
       [1.     , 2.00001, 4.     ],
       [1.     , 3.     , 6.00001]])

In [None]:
np.linalg.matrix_rank(X)

3

In [None]:
solve_normal_equation(X, y)

array([1.999972  , 1.20003383, 0.39998709])

In [None]:
# Drop variables: keep only the ones column and x1, leaving x2 out.
X = np.column_stack((all_ones, x1))
print(X)

[[1. 1.]
 [1. 2.]
 [1. 3.]]


In [None]:
solve_normal_equation(X, y)

array([2., 2.])

In [None]:
# Dummy variables

## dataset
# Seed NumPy's global RNG so the synthetic dataset (and every result derived
# from it below) is identical on each Restart & Run All.
np.random.seed(42)

num_records = 12
windspeed = np.random.randint(0, 10, num_records)
vehicles = np.random.randint(100, 500, num_records)
# 'direction' is the only categorical (string) column.
direction = np.random.choice(['N', 'S', 'E', 'W'], num_records)
pollution = np.random.randint(0, 100, num_records)

# 'pollution' is placed last so later cells can take df.columns[:-1] as features.
df = pd.DataFrame({'windspeed': windspeed, 'vehicles': vehicles, 'direction': direction, 'pollution': pollution})
df

Unnamed: 0,windspeed,vehicles,direction,pollution
0,6,438,S,12
1,9,397,W,47
2,7,158,E,12
3,8,388,N,48
4,2,471,W,39
5,2,203,S,43
6,1,358,S,25
7,3,243,W,16
8,0,101,W,91
9,5,179,W,34


In [None]:
def fit_data(df, X, y):
    """Fit a linear regression of ``y`` on ``X`` and describe it as an equation.

    Parameters
    ----------
    df : DataFrame
        Fallback source of feature names (taken by position from
        ``df.columns``) when ``X`` has no ``columns`` attribute.
    X : feature matrix passed to ``LinearRegression.fit``.
    y : target values passed to ``LinearRegression.fit``.

    Returns
    -------
    str or None
        ``"y = <intercept> + <coef>*<name> ..."`` with two decimals per
        number, or ``None`` if any step raises; the exception is printed in
        that case rather than propagated.
    """
    try:
        model = LinearRegression()
        model.fit(X, y)
        # Label each coefficient with the column it was actually fitted on.
        # Falling back to df.columns keeps plain-array inputs working as before.
        names = getattr(X, "columns", None)
        if names is None:
            names = df.columns
        rep = f"y = {model.intercept_:0.2f}"
        for i, coef in enumerate(model.coef_):
            rep += f" + {coef:0.2f}*{names[i]}"
        return rep
    except Exception as e:
        # Deliberately broad: the notebook uses this to show the fit error
        # instead of stopping with a traceback.
        print(e)
        return None


In [None]:
fit_data(df, df[df.columns[:-1]], df['pollution'])

could not convert string to float: 'S'


In [None]:
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

In [None]:
enc = OrdinalEncoder()

In [None]:
# Encode 'direction' with the ordinal encoder, then store the codes in a copy
# so the original df keeps its string labels.
direction_codes = enc.fit_transform(df[['direction']]).flatten()
df2 = df.copy()
df2['direction'] = direction_codes
df2

Unnamed: 0,windspeed,vehicles,direction,pollution
0,6,438,2.0,12
1,9,397,3.0,47
2,7,158,0.0,12
3,8,388,1.0,48
4,2,471,3.0,39
5,2,203,2.0,43
6,1,358,2.0,25
7,3,243,3.0,16
8,0,101,3.0,91
9,5,179,3.0,34


In [None]:
fit_data(df2, df2[df2.columns[:-1]], df2['pollution'])

'y = 51.91 + 1.65*windspeed + -0.10*vehicles + 5.94*direction'

In [None]:
pd.Series({x: i for i, x in enumerate(enc.categories_[0])})

E    0
N    1
S    2
W    3
dtype: int64

In [None]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)

In [None]:
direction_ohe = ohe.fit_transform(df[['direction']])
direction_ohe

array([[0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [None]:
# Read the labels from the fitted one-hot encoder itself so the names follow
# the column order of direction_ohe, instead of relying on the separate
# OrdinalEncoder having ordered the categories the same way.
col_names_ohe = [f"Is it {x}?" for x in ohe.categories_[0]]

In [None]:
direction_ohe_df = pd.DataFrame(direction_ohe, columns=col_names_ohe)
direction_ohe_df

Unnamed: 0,Is it E?,Is it N?,Is it S?,Is it W?
0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0
5,0.0,0.0,1.0,0.0
6,0.0,0.0,1.0,0.0
7,0.0,0.0,0.0,1.0
8,0.0,0.0,0.0,1.0
9,0.0,0.0,0.0,1.0


In [None]:
# Confirm that "Is it W?" is a linear combination of the other columns:
# W = 1 - (N + S + E), so the difference below should be zero in every row.
other_directions = direction_ohe_df[["Is it N?", "Is it S?", "Is it E?"]].sum(axis=1)
1 - other_directions - direction_ohe_df["Is it W?"]

0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
dtype: float64

In [None]:
# Numeric features side by side with all four one-hot direction columns.
X = np.column_stack((df[['windspeed', 'vehicles']].values, direction_ohe))

In [None]:
X

array([[  6., 438.,   0.,   0.,   1.,   0.],
       [  9., 397.,   0.,   0.,   0.,   1.],
       [  7., 158.,   1.,   0.,   0.,   0.],
       [  8., 388.,   0.,   1.,   0.,   0.],
       [  2., 471.,   0.,   0.,   0.,   1.],
       [  2., 203.,   0.,   0.,   1.,   0.],
       [  1., 358.,   0.,   0.,   1.,   0.],
       [  3., 243.,   0.,   0.,   0.,   1.],
       [  0., 101.,   0.,   0.,   0.,   1.],
       [  5., 179.,   0.,   0.,   0.,   1.],
       [  9., 218.,   0.,   1.,   0.,   0.],
       [  4., 243.,   0.,   0.,   1.,   0.]])

In [None]:
# Prepend a column of ones to X.
X_aug = np.column_stack((np.ones(len(X)), X))

In [None]:
X_aug

array([[  1.,   6., 438.,   0.,   0.,   1.,   0.],
       [  1.,   9., 397.,   0.,   0.,   0.,   1.],
       [  1.,   7., 158.,   1.,   0.,   0.,   0.],
       [  1.,   8., 388.,   0.,   1.,   0.,   0.],
       [  1.,   2., 471.,   0.,   0.,   0.,   1.],
       [  1.,   2., 203.,   0.,   0.,   1.,   0.],
       [  1.,   1., 358.,   0.,   0.,   1.,   0.],
       [  1.,   3., 243.,   0.,   0.,   0.,   1.],
       [  1.,   0., 101.,   0.,   0.,   0.,   1.],
       [  1.,   5., 179.,   0.,   0.,   0.,   1.],
       [  1.,   9., 218.,   0.,   1.,   0.,   0.],
       [  1.,   4., 243.,   0.,   0.,   1.,   0.]])

In [None]:
X_aug.shape

(12, 7)

In [None]:
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape

(6, 6, (7, 7))

In [None]:
pd.DataFrame(X_aug.T @ X_aug)

Unnamed: 0,0,1,2,3,4,5,6
0,12.0,56.0,3397.0,1.0,2.0,4.0,5.0
1,56.0,370.0,16675.0,7.0,17.0,13.0,19.0
2,3397.0,16675.0,1124039.0,158.0,606.0,1242.0,1391.0
3,1.0,7.0,158.0,1.0,0.0,0.0,0.0
4,2.0,17.0,606.0,0.0,2.0,0.0,0.0
5,4.0,13.0,1242.0,0.0,0.0,4.0,0.0
6,5.0,19.0,1391.0,0.0,0.0,0.0,5.0


In [None]:
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.fit_transform(df[['direction']])

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [None]:
direction_ohe_n_1 = ohe.fit_transform(df[['direction']])
# Name the columns from the one-hot encoder's own categories, skipping the
# first one (the category removed by drop='first'), rather than reading them
# from the unrelated OrdinalEncoder.
col_names_ohe_n_1 = [f"Is it {x}?" for x in ohe.categories_[0][1:]]
df_ohe_n_1 = pd.DataFrame(direction_ohe_n_1, columns=col_names_ohe_n_1)
df_ohe_n_1

Unnamed: 0,Is it N?,Is it S?,Is it W?
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,0.0,1.0,0.0
6,0.0,1.0,0.0
7,0.0,0.0,1.0
8,0.0,0.0,1.0
9,0.0,0.0,1.0


In [None]:
# Numeric features plus the n-1 dummy columns, then a leading column of ones.
X = np.column_stack((df[['windspeed', 'vehicles']].values, df_ohe_n_1.values))
X_aug = np.column_stack((np.ones(len(X)), X))

X_aug

array([[  1.,   6., 438.,   0.,   1.,   0.],
       [  1.,   9., 397.,   0.,   0.,   1.],
       [  1.,   7., 158.,   0.,   0.,   0.],
       [  1.,   8., 388.,   1.,   0.,   0.],
       [  1.,   2., 471.,   0.,   0.,   1.],
       [  1.,   2., 203.,   0.,   1.,   0.],
       [  1.,   1., 358.,   0.,   1.,   0.],
       [  1.,   3., 243.,   0.,   0.,   1.],
       [  1.,   0., 101.,   0.,   0.,   1.],
       [  1.,   5., 179.,   0.,   0.,   1.],
       [  1.,   9., 218.,   1.,   0.,   0.],
       [  1.,   4., 243.,   0.,   1.,   0.]])

In [None]:
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape

(6, 6, (6, 6))

In [None]:
# Interpreting dummy variables

## dataset
# One categorical feature ('F' / 'M') and one numeric target per sample.

X = np.asarray(['F', 'F', 'F', 'M', 'M'])
y = np.asarray([5, 5.2, 5.4, 5.8, 6])

In [None]:
from sklearn.preprocessing import LabelBinarizer
l = LabelBinarizer()
l.fit_transform(X)

array([[0],
       [0],
       [0],
       [1],
       [1]])

In [None]:
X_binary = 1 - l.fit_transform(X)

In [None]:
X_binary

array([[1],
       [1],
       [1],
       [0],
       [0]])

In [None]:
lr = LinearRegression()
lr.fit(X_binary, y)

In [None]:
lr.coef_, lr.intercept_

(array([-0.7]), 5.8999999999999995)

In [None]:
y[(X_binary==0).flatten()].mean()

5.9

In [None]:
y[(X_binary==1).flatten()].mean()

5.2