#Principal Component Analysis (PCA)

#Importing all the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

#Loading the dataset

In [None]:
cu=pd.read_csv("cars-used.csv")
cu

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0
...,...,...,...,...,...,...,...,...,...
10663,A3,2020,16999,Manual,4018,Petrol,145,49.6,1.0
10664,A3,2020,16999,Manual,1978,Petrol,150,49.6,1.0
10665,A3,2020,17199,Manual,609,Petrol,150,49.6,1.0
10666,Q3,2017,19499,Automatic,8646,Petrol,150,47.9,1.4


#Plotting continues numerical features

In [None]:
cu.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0


In [None]:
cu.tail()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
10663,A3,2020,16999,Manual,4018,Petrol,145,49.6,1.0
10664,A3,2020,16999,Manual,1978,Petrol,150,49.6,1.0
10665,A3,2020,17199,Manual,609,Petrol,150,49.6,1.0
10666,Q3,2017,19499,Automatic,8646,Petrol,150,47.9,1.4
10667,Q3,2016,15999,Manual,11855,Petrol,150,47.9,1.4


In [None]:
cu.isnull()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
10663,False,False,False,False,False,False,False,False,False
10664,False,False,False,False,False,False,False,False,False
10665,False,False,False,False,False,False,False,False,False
10666,False,False,False,False,False,False,False,False,False


In [None]:
cu.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [None]:
cu.columns

Index(['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize'],
      dtype='object')

In [None]:
cu.items()

<generator object DataFrame.items at 0x78984767b990>

In [None]:
cu.shape

(10668, 9)

#Separate features and target variable

In [None]:
X=cu.drop(columns=['price'])
y=cu['price']

In [None]:
print(X)

      model  year transmission  mileage fuelType  tax   mpg  engineSize
0        A1  2017       Manual    15735   Petrol  150  55.4         1.4
1        A6  2016    Automatic    36203   Diesel   20  64.2         2.0
2        A1  2016       Manual    29946   Petrol   30  55.4         1.4
3        A4  2017    Automatic    25952   Diesel  145  67.3         2.0
4        A3  2019       Manual     1998   Petrol  145  49.6         1.0
...     ...   ...          ...      ...      ...  ...   ...         ...
10663    A3  2020       Manual     4018   Petrol  145  49.6         1.0
10664    A3  2020       Manual     1978   Petrol  150  49.6         1.0
10665    A3  2020       Manual      609   Petrol  150  49.6         1.0
10666    Q3  2017    Automatic     8646   Petrol  150  47.9         1.4
10667    Q3  2016       Manual    11855   Petrol  150  47.9         1.4

[10668 rows x 8 columns]


In [None]:
print(y)

0        12500
1        16500
2        11000
3        16800
4        17300
         ...  
10663    16999
10664    16999
10665    17199
10666    19499
10667    15999
Name: price, Length: 10668, dtype: int64


# Preprocessing pipeline

In [None]:
numeric_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
categorical_features = ['model', 'transmission', 'fuelType']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

#TruncatedSVD Model

In [None]:
from sklearn.decomposition import TruncatedSVD

# TruncatedSVD model
svd = TruncatedSVD(n_components=2)  # You can adjust the number of components as per your requirement

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('svd', svd)])

# Fit and transform the data
X_svd = pipeline.fit_transform(X)

# Now X_svd contains the transformed data with reduced dimensions

In [None]:
X_svd

array([[ 0.26822614, -0.92763861],
       [ 1.67842597, -0.0462653 ],
       [ 1.51682039, -1.11581316],
       ...,
       [-0.70020483, -2.13801926],
       [-0.49497408, -0.7395222 ],
       [ 0.0059189 , -0.62356798]])

#Mean Centering/ Normalize data and dropping the unnecessary columns & Standardize data

In [None]:
X1=cu.drop(columns=['model','transmission','fuelType'],axis=1)
y1=cu.drop(columns=['model','transmission','fuelType'],axis=1)

In [None]:
# Create a StandardScaler instance

scaler = StandardScaler()

# Fit the scaler to the data

X_scaled = scaler.fit_transform(X1)

#Principal Component Analysis (PCA) object


In [None]:
#PCA model
pca = PCA(n_components=2)  # You can adjust the number of components as per your requirement

#Principal Component Analysis(PCA) from scratch

In [None]:
# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('pca', pca)])
# Fit and transform the data
X_reduced = pca.fit_transform(X1)

# Now X_pca contains the transformed data with reduced dimensions

In [None]:
print(X_reduced)

[[ -5503.71511322 -12667.5747791 ]
 [ 12781.52384022  -2638.30857587]
 [  8491.31229296  -9779.29388897]
 ...
 [-21342.34937223 -12786.32733557]
 [-14384.05903167  -8153.23587224]
 [-10263.37645795 -10512.81201821]]


#Compute and plotting explained in the covariance matrix in the ratio

In [None]:
# Compute covariance matrix
cov = np.cov(X_reduced.T)
cov = np.round(cov, 2)
print("Covariance matrix ", cov.shape, "\n")

Covariance matrix  (2, 2) 



#Plotting explained variance and compute the eigen vectors of the covariance matrix

In [None]:
# Perform eigen decomposition of covariance matrix
eig_val, eig_vec = np.linalg.eig(cov)
print("Eigen vectors ", eig_vec)
print("Eigen values ", eig_val, "\n")

Eigen vectors  [[1. 0.]
 [0. 1.]]
Eigen values  [5.99507776e+08 9.02275022e+07] 



In [None]:
# Sort eigen values and corresponding eigen vectors in descending order
indices = np.arange(0,len(eig_val), 1)
indices = ([x for _,x in sorted(zip(eig_val, indices))])[::-1]
eig_val = eig_val[indices]
eig_vec = eig_vec[:,indices]
print("Sorted Eigen vectors ", eig_vec)
print("Sorted Eigen values ", eig_val, "\n")

Sorted Eigen vectors  [[1. 0.]
 [0. 1.]]
Sorted Eigen values  [5.99507776e+08 9.02275022e+07] 



#Compute the explained variance and select N components and plotting cumulative explained variance

In [None]:
# Get explained variance
sum_eig_val = np.sum(eig_val)
explained_variance = eig_val/ sum_eig_val
print(explained_variance)
cumulative_variance = np.cumsum(explained_variance)
print(cumulative_variance)

[0.86918532 0.13081468]
[0.86918532 1.        ]


#Transform data using eigen vectors and covariance matrix of principal components

In [None]:
# Take transpose of eigen vectors with data
pca_data = np.dot(X_reduced, eig_vec)
print("Transformed data ", pca_data.shape)

Transformed data  (10668, 2)
