In [40]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset
df = pd.read_csv("C:/Users/gowth/Desktop/UCM/ML/Assignment 5/datasets/CC_GENERAL.csv")

# Drop the irrelevant columns
df = df.drop(["CUST_ID"], axis=1)

# Handle the missing values
df = df.dropna()

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Apply k-means algorithm on the original data
kmeans_orig = KMeans(n_clusters=3)
kmeans_orig.fit(scaled_data)

# Compute the silhouette score on the original data
silhouette_score_orig = silhouette_score(scaled_data, kmeans_orig.labels_)

# Perform PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Analyze the variance explained by each principal component
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)

# Apply k-means algorithm on the PCA data
kmeans = KMeans(n_clusters=3)
kmeans.fit(pca_data)

# Compute the silhouette score on the PCA data
silhouette_score_pca = silhouette_score(pca_data, kmeans.labels_)

# Compare the silhouette score
print("Silhouette score of Original Data:", silhouette_score_orig)
print("Silhouette score of k-means + PCA data:", silhouette_score_pca)
if silhouette_score_pca > silhouette_score_orig:
    print("Silhouette score improved after applying k-means on PCA data")
else:
    print("Silhouette score did not improve after applying k-means on PCA data")
    

# Perform Scaling + PCA + K-Means
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

kmeans = KMeans(n_clusters=3)
kmeans.fit(pca_data)

silhouette_score_scaled_pca_kmeans = silhouette_score(pca_data, kmeans.labels_)


# Compare the silhouette score
print("Silhouette Score with Scaling + PCA + K-Means:", silhouette_score_scaled_pca_kmeans)
print("Silhouette score of k-means + PCA data:", silhouette_score_pca)
if silhouette_score_scaled_pca_kmeans > silhouette_score_pca:
    print("Silhouette score improved after applying Scaling + PCA + K-Means")
else:
    print("Silhouette score did not improve after applying Scaling + PCA + K-Means")

Explained Variance Ratio: [0.27231177 0.20374308]
Silhouette score of Original Data: 0.24778944198785616
Silhouette score of k-means + PCA data: 0.4446068833433421
Silhouette score improved after applying k-means on PCA data
Silhouette Score with Scaling + PCA + K-Means: 0.4446069857480921
Silhouette score of k-means + PCA data: 0.4446068833433421
Silhouette score improved after applying Scaling + PCA + K-Means


In [36]:
#2. Use pd_speech_features.csv
    #a. Perform Scaling
    #b. Apply PCA (k=3)
    #c. Use SVM to report performance

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv('C:/Users/gowth/Desktop/UCM/ML/Assignment 5/datasets/pd_speech_features.csv', skiprows=1)

# Split the dataset into features and target
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Scale the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA with k=3
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

# Split the dataset into training and testing sets
split = int(0.8 * len(X))
X_train = X_pca[:split]
X_test = X_pca[split:]
y_train = y[:split]
y_test = y[split:]

# Train an SVM classifier on the training set
svm = SVC(gamma='scale')
svm.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = svm.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.68


In [38]:
# 3. Apply Linear Discriminant Analysis (LDA) on Iris.csv dataset to reduce dimensionality of data tok=2.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Load the dataset from a CSV file
iris_df = pd.read_csv('C:/Users/gowth/Desktop/UCM/ML/Assignment 5/datasets/Iris.csv')
X = iris_df.drop(['Id', 'Species'], axis=1).values
y = iris_df['Species'].values

print("Iris.csv data before LDA Transformation:\n", iris_df.head())

# Standardize the data
sc = StandardScaler()
X_std = sc.fit_transform(X)

# Apply LDA to the standardized data
lda = LDA(n_components=2)
X_lda = lda.fit_transform(X_std, y)

# Save the LDA-transformed data to a CSV file
iris_lda_df = pd.DataFrame(data=X_lda, columns=['LDA1', 'LDA2'])
iris_lda_df['Species'] = y
iris_lda_df.to_csv('Iris_LDA.csv', index=False)

# Display the LDA-transformed data
print("Iris.csv data after LDA Transformation:\n",iris_lda_df.head())


Iris.csv data before LDA Transformation:
    Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
Iris.csv data after LDA Transformation:
        LDA1      LDA2      Species
0 -8.084953  0.328454  Iris-setosa
1 -7.147163 -0.755473  Iris-setosa
2 -7.511378 -0.238078  Iris-setosa
3 -6.837676 -0.642885  Iris-setosa
4 -8.157814  0.540639  Iris-setosa
