# 1. Principal Component Analysis

In [1]:
# Dimensionality reduction, feature scaling and train/test splitting utilities.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
# Classification metrics used to report the SVM results further down.
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
import matplotlib.pyplot as plt
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the credit-card dataset and impute missing values with per-column means.
cc = pd.read_csv('CC.csv')
# numeric_only=True restricts the means to numeric columns. Without it, pandas 2.x
# raises a TypeError as soon as the frame holds a non-numeric column
# (CUST_ID is presumably a string identifier — verify against the CSV).
# Reassigning instead of inplace=True keeps the cell idempotent on re-run.
cc = cc.fillna(cc.mean(numeric_only=True))

In [3]:
# Separate the TENURE column from the feature matrix.
# CUST_ID is removed together with TENURE so neither ends up in the features.
y = cc['TENURE']
X = cc.drop(['TENURE', 'CUST_ID'], axis=1)

### a. Apply PCA on CC dataset.

In [4]:
# Project the (unscaled) features onto their first two principal components.
# random_state pins the projection: PCA's default 'auto' solver may pick the
# randomized SVD for data of this size, which is stochastic when left unseeded.
pca2 = PCA(n_components=2, random_state=0)
principalComponents = pca2.fit_transform(X)
# Wrap the component scores in a labelled frame.
principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])
# Re-attach TENURE so the reduced frame still carries that column from cc.
finalcc = pd.concat([principalDf, cc[['TENURE']]], axis=1)
finalcc.head()  # preview of the first five rows

Unnamed: 0,principal component 1,principal component 2,TENURE
0,-4326.383979,921.566882,12
1,4118.916665,-2432.846346,12
2,1497.907641,-1997.578694,12
3,1394.548536,-1488.743453,12
4,-3743.351896,757.342657,12


### b. Apply the k-means algorithm to the PCA result and report whether the silhouette score has improved

In [5]:
# Number of clusters shared by every k-means run in this section.
nclusters = 3
# Split the reduced frame into the TENURE column and the two PCA features.
y_pca = finalcc['TENURE']
X_pca = finalcc.drop(columns='TENURE')

### Silhouette score without PCA

In [6]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Baseline: cluster the original, unreduced feature matrix.
# random_state fixes the centroid initialisation; without it the labels and
# therefore the silhouette score change from one run to the next.
km = KMeans(n_clusters=nclusters, random_state=0)
y_pred = km.fit_predict(X)  # fit the model and return each row's cluster label
score = silhouette_score(X, y_pred)
print(score)

0.4654853574746247


### Silhouette score with PCA

In [7]:
# Cluster the two PCA components computed from the unscaled features.
# KMeans and silhouette_score are already imported by the baseline cell.
# random_state fixes the centroid initialisation so the score is reproducible.
km = KMeans(n_clusters=nclusters, random_state=0)
y_pred = km.fit_predict(X_pca)  # fit the model and return each row's cluster label
score = silhouette_score(X_pca, y_pred)
print(score)

0.5720003159007088


### Silhouette score after scaling and applying PCA

In [8]:
# Standardise the features so that no single large-valued column dominates PCA.
scaler = StandardScaler()
X_scaled_array = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled_array, columns=X.columns)  # restore column labels

# Reduce the scaled features to two principal components.
# random_state pins the projection in case the 'auto' solver selects the
# randomized SVD.
pca2 = PCA(n_components=2, random_state=0)
principalComponents = pca2.fit_transform(X_scaled)
principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])
# Note: finalcc, X_pca and y_pca are rebound here and now hold the scaled variant.
finalcc = pd.concat([principalDf, cc[['TENURE']]], axis=1)
X_pca = finalcc.drop('TENURE', axis=1)
y_pca = finalcc['TENURE']

# Cluster the scaled PCA components; the seed makes the reported score reproducible.
# KMeans and silhouette_score are already imported by the baseline cell.
km = KMeans(n_clusters=nclusters, random_state=0)
y_cluster_kmeans = km.fit_predict(X_pca)
score = silhouette_score(X_pca, y_cluster_kmeans)
print(score)

0.4533711725391876


# 2. Use pd_speech_features.csv

In [9]:
# Load the speech-feature table and separate the 'class' label from the predictors.
speech = pd.read_csv('pd_speech_features.csv')
y = speech['class']
# Every column except 'class' is kept as a feature.
# NOTE(review): if the file contains an identifier column it probably should
# not be used as a predictor — confirm against the CSV header.
X = speech.drop('class', axis=1)

### a. Perform Scaling

In [10]:
# Standardise every feature column, then rebuild a DataFrame so the column
# labels from X are preserved on the scaled values.
scaler = StandardScaler()
X_scaled_array = scaler.fit_transform(X)
X_scaled = pd.DataFrame(data=X_scaled_array, columns=X.columns)

### b. Apply PCA (k=3)

In [11]:
# Reduce the scaled speech features to three principal components.
# random_state pins the projection: with this many rows and columns the default
# 'auto' solver may use the randomized SVD, which varies between unseeded runs.
pca3 = PCA(n_components=3, random_state=0)
principalComponents = pca3.fit_transform(X_scaled)
# Wrap the component scores in a labelled frame.
principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2','principal component 3'])
# Re-attach the 'class' label next to the reduced features.
finalspeech = pd.concat([principalDf, speech[['class']]], axis=1)
finalspeech.head()  # preview of the first five rows

Unnamed: 0,principal component 1,principal component 2,principal component 3,class
0,-10.047372,1.471075,-6.846405,1
1,-10.637725,1.583749,-6.830976,1
2,-13.516185,-1.253543,-6.818699,1
3,-9.155084,8.833597,15.290903,1
4,-6.76447,4.611464,15.637121,1


### c. Use SVM to report performance

In [12]:
# Separate the label from the three PCA features.
y_pca = finalspeech['class']
X_pca = finalspeech.drop(columns='class')
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
# NOTE(review): the scaler and PCA were fit on the full dataset before this
# split, so the held-out rows influenced the features — consider fitting them
# on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y_pca, test_size=0.2, random_state=0
)

In [13]:
from sklearn.svm import SVC

# Train an SVM with default hyper-parameters on the training split and
# evaluate it on the held-out rows.
classifier = SVC()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# accuracy_score is already imported in the first cell. Its documented argument
# order is (y_true, y_pred), matching the two calls above.
print('accuracy is', accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.42      0.52        38
           1       0.83      0.93      0.88       114

    accuracy                           0.80       152
   macro avg       0.75      0.68      0.70       152
weighted avg       0.79      0.80      0.79       152

[[ 16  22]
 [  8 106]]
accuracy is 0.8026315789473685


# 3. Apply Linear Discriminant Analysis (LDA) on Iris.csv dataset to reduce dimensionality of data to k=2

In [14]:
# Load the Iris table; 'Species' is the label, and 'Id' is excluded from the
# features together with it.
iris = pd.read_csv('Iris.csv')
y = iris['Species']
X = iris.drop(['Species', 'Id'], axis=1)

In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA is supervised, so it is fitted with the species labels as well as the features.
lda = LinearDiscriminantAnalysis(n_components=2)
principalComponents = lda.fit(X, y).transform(X)
# Wrap the two projected axes in a labelled frame.
# NOTE(review): the columns are named "principal component" although they come
# from LDA, not PCA; the labels are kept as-is here.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Place the Species label next to the projected features and preview the result.
finaliris = pd.concat([principalDf, iris[['Species']]], axis=1)
finaliris.head()

Unnamed: 0,principal component 1,principal component 2,Species
0,8.084953,0.328454,Iris-setosa
1,7.147163,-0.755473,Iris-setosa
2,7.511378,-0.238078,Iris-setosa
3,6.837676,-0.642885,Iris-setosa
4,8.157814,0.540639,Iris-setosa


# 4. Briefly identify the difference between PCA and LDA

The differences between PCA and LDA are described in the accompanying Word document; please refer to it.