<a href="https://colab.research.google.com/github/Swathi04/CMPE255_Dimensionality_Reduction_Assignment/blob/main/CMPE255_Dimensionality_Reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA MINING ASSIGNMENT FOR DIMENSIONALITY REDUCTION

##TABULAR DATA PCA

###Necessary Imports

In [108]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

###Kaggle setup to load dataset

In [109]:
# Remove any existing ~/.kaggle directory, then list the working directory.
!rm -rf ~/.kaggle
!ls

sample_data


In [None]:
# Install the Kaggle CLI and open Colab's file-upload dialog
# (presumably for kaggle.json, which a later cell copies into ~/.kaggle).
!pip install kaggle
from google.colab import files
files.upload()



In [None]:
# List the working directory contents.
!ls

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the wine customer-segmentation dataset archive with the Kaggle CLI.
!kaggle datasets download sadeghjalalian/wine-customer-segmentation

In [None]:
!ls
!unzip wine-customer-segmentation.zip

###Load dataset

In [None]:
# Read the extracted CSV into a DataFrame and preview the first rows.
data_frame = pd.read_csv('Wine.csv')
data_frame.head()

In [None]:
# Summary statistics for the columns of the loaded frame.
data_frame.describe()

In [None]:
# (rows, columns) of the loaded frame.
data_frame.shape

In [None]:
# Feature matrix: the first four columns of the frame (positions 0-3).
# NOTE(review): a later cell hard-codes four principal-component labels to
# match this slice; confirm that restricting PCA to these four columns is intended.
X_variables = data_frame.iloc[:,0:4]
X_variables

In [None]:
# Target labels, selected by name. The positional slice `iloc[:, 5]` depended on
# column order and did not reliably pick the 'Customer_Segment' column that the
# plots in this section colour by.
y_variable = data_frame['Customer_Segment']
y_variable

###Calculate the covariance matrix

In [None]:
# Standardise the selected features, then compute the covariance matrix
# between the standardised feature columns.
sc = StandardScaler()
sc.fit(X_variables)
transformed_df = sc.transform(X_variables)
# rowvar=False treats columns as variables, which is the same as transposing first.
covariance_matrix = np.cov(transformed_df, rowvar=False)
covariance_matrix

###Calculate eigen values

In [None]:
# Eigen-decomposition of the covariance matrix; column i of `eigen_vectors`
# is paired with `eigen_values[i]` by the next cell.
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)

###Calculate Eigen pairs

In [None]:
# Pair each eigenvalue magnitude with its eigenvector (a column of eigen_vectors).
eigen_pairs = [(np.abs(eigen_values[i]), eigen_vectors[:, i]) for i in range(len(eigen_values))]
# np.linalg.eig does not guarantee any ordering, so sort explicitly (largest
# first) to make the heading printed below true.
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
print('Eigenvalues arranged in descending order:')
for eigen_value, eigen_vector in eigen_pairs:
    print(eigen_value)

###PCA

In [None]:
# Fit a PCA that keeps every component on the standardised features and show
# the fraction of variance captured by each component.
pca = PCA().fit(transformed_df)
explained_variance = pca.explained_variance_ratio_
explained_variance

In [None]:
# Bar chart of the explained-variance ratio per principal component.
# Labels are generated from the fitted PCA so the cell keeps working when the
# number of input features changes (a hard-coded list of four labels raises on
# any length mismatch).
component_labels = ['PC' + str(i + 1) for i in range(len(pca.explained_variance_ratio_))]
dataframe = pd.DataFrame({'variance': pca.explained_variance_ratio_,
                          'Principal Component': component_labels})
sns.barplot(x='Principal Component', y="variance",
            data=dataframe, color="b");

In [None]:
# Cumulative explained variance versus the number of components retained.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..n: the default x values start at 0, which shifts the curve
# one position left of the "number of components" the axis label promises.
component_counts = np.arange(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance)
plt.xticks(component_counts)
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [None]:
# Project the data onto the first two principal components.
pca_2 = PCA(n_components=2)
pca_2 = pca_2.fit(transformed_df)
# Transform the same standardised matrix the PCA was fitted on. Passing the raw
# X_variables would apply axes learned on scaled data to unscaled values.
pca_2d = pca_2.transform(transformed_df)

In [None]:
# Re-code the segment labels as consecutive integers starting at 0
# (1, 2, 3 -> 0, 1, 2). factorize(sort=True) gives the same codes on every run,
# whereas replace({1:0, 2:1, 3:2}) remaps the already-shifted labels again
# when the cell is re-executed.
data_frame['Customer_Segment'] = pd.factorize(data_frame['Customer_Segment'], sort=True)[0]

In [None]:
# Scatter of the two PCA coordinates, coloured by customer segment.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(pca_2d[:, 0], pca_2d[:, 1], c=data_frame['Customer_Segment'])
plt.show()


##TABULAR DATA SVD

###Necessary Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

###Load Dataset

In [None]:
# Reload the CSV so this section starts from the unmodified file, then preview it.
data_frame = pd.read_csv('Wine.csv')
data_frame.head()

In [None]:
# (rows, columns) of the reloaded frame.
data_frame.shape

In [None]:
# Integer-encode only the class label.
# The feature columns are already numeric (the PCA section standardises them
# as-is). Running LabelEncoder over them replaces each value with its rank among
# the column's unique values, discarding the magnitudes that the scaler and SVD
# below operate on.
encoder = LabelEncoder()
data_frame['Customer_Segment'] = encoder.fit_transform(data_frame['Customer_Segment'])
data_frame.head()

In [None]:
# Features: every column except the class label. Target: the class label.
# Selecting by name replaces the positional slices `iloc[:, 1:23]` / `iloc[:, 0]`,
# which skip the first column, sweep in every later column (label included if it
# is not first), and treat whatever sits in column 0 as the target.
X_variables = data_frame.drop(columns='Customer_Segment')
y_label = data_frame['Customer_Segment']

In [None]:
# Standardise the feature columns before the decomposition.
scaler = StandardScaler()
X_features = scaler.fit_transform(X_variables)

In [None]:
# Singular value decomposition of the standardised features; `s` holds the
# singular values.
# NOTE(review): np.linalg.svd returns the right singular vectors already
# transposed, so `v` here is V^T rather than V.
u, s, v = np.linalg.svd(X_features, full_matrices=True)

In [None]:
# Shape of the left singular-vector matrix.
u.shape

In [None]:
# Shape of the singular-value array.
s.shape

In [None]:
# Shape of the third array returned by the SVD call.
v.shape

In [None]:
# Share of the total variance carried by each singular value
# (s_i^2 / sum of s^2), rounded to three decimals.
variance_explained = np.round(s**2/np.sum(s**2), decimals=3)

sns.barplot(x=list(range(1,len(variance_explained)+1)),
            y=variance_explained, color="blue")
plt.xlabel('SVs', fontsize=16)
# The plotted values are fractions in [0, 1], not percentages, so label them as such.
plt.ylabel('Fraction of the variance explained', fontsize=15)

In [None]:
# Collect the first three left singular vectors into a frame, carrying each
# row's segment label along as a 'Class' column.
col_labels = ['SV%d' % i for i in range(1, 4)]
svd_df = (
    pd.DataFrame(u[:, 0:3], index=data_frame["Customer_Segment"].tolist(), columns=col_labels)
    .reset_index()
    .rename(columns={'index': 'Class'})
)
svd_df.head()

In [None]:
# Give the encoded segments (0, 1, 2) readable legend names. The previous names
# ('SV1'..'SV3') duplicated the singular-vector column names, so the legend read
# as if the classes were singular vectors.
svd_df['Class'] = svd_df['Class'].replace({0:'Segment 1', 1:'Segment 2', 2:'Segment 3'})

In [None]:
# First two singular-vector coordinates, coloured by class.
sns.scatterplot(x="SV1", y="SV2", hue="Class", data=svd_df, s=105, alpha=0.5)
# Fix the label to one decimal: multiplying the rounded fraction by 100 can
# otherwise print floating-point noise such as 36.199999999999996%.
plt.xlabel('SV 1: {0:.1f}%'.format(variance_explained[0]*100), fontsize=15)
plt.ylabel('SV 2: {0:.1f}%'.format(variance_explained[1]*100), fontsize=15)

##TABULAR DATA LLE

###Necessary Import

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

###Check the dimensions

In [None]:
# (rows, columns) of the frame before dimensionality reduction.
data_frame.shape

### Specify the number of dimensions to reduce the data to

In [None]:
# Fit a 2-component LLE on the feature columns only. The class label is dropped
# so the embedding is not computed from the column used to colour the plot.
embedding = LocallyLinearEmbedding(n_components=2).fit(data_frame.drop(columns='Customer_Segment'))

In [None]:
# Display the fitted estimator.
embedding

###Transform the data frame to the given number of dimensions

In [None]:
# Reuse the coordinates learned when `embedding` was fitted. Calling
# fit_transform here would repeat the entire LLE fit a second time.
lle = embedding.embedding_

###Checking the new dimensions

In [None]:
# Shape of the embedded coordinates.
lle.shape

###Visualisation of data

In [None]:
# 2-D LLE embedding, coloured by customer segment.
plt.figure(figsize=(10,10))
vis_x = lle[:, 0]
vis_y = lle[:, 1]
segments = data_frame['Customer_Segment']
n_segments = segments.nunique()
# Size the discrete colormap and colour limits from the labels actually present.
# The fixed 10-colour map with limits -0.5..9.5 squeezed the few segments into
# one end of the scale. plt.get_cmap replaces plt.cm.get_cmap, which
# Matplotlib 3.9 removed.
plt.scatter(vis_x, vis_y, c=segments, cmap=plt.get_cmap("jet", n_segments), marker='.')
plt.colorbar(ticks=sorted(segments.unique()))
plt.clim(segments.min() - 0.5, segments.max() + 0.5)
plt.show()

##TABULAR DATA T-SNE

###Necessary Imports

In [None]:
from sklearn.manifold import TSNE

###Checking the dimensions of original data

In [None]:
# (rows, columns) of the frame before t-SNE.
data_frame.shape

###Specifying the dimensions to which it needs to be reduced

In [None]:
# 2-D t-SNE. A fixed random_state makes the stochastic layout reproducible
# across kernel restarts.
model=TSNE(n_components=2, random_state=42)

###Transform the data to the reduced number of dimensions

In [None]:
# Embed the feature columns only. The class label is excluded so it cannot
# influence the layout that is later coloured by that same label.
tsne_data=model.fit_transform(data_frame.drop(columns='Customer_Segment'))

###Check the new dimensions

In [None]:
# Shape of the t-SNE coordinates.
tsne_data.shape

The "perplexity" parameter (loosely) controls how t-SNE balances attention between local and global aspects of the data. It is, in a sense, a guess about the number of close neighbors each point has.
###Perplexity 100

In [None]:
# t-SNE with perplexity 100 on the feature columns only (class label excluded).
# random_state is fixed so the layout is reproducible.
model=TSNE(perplexity=100, random_state=42)
tsne_data=model.fit_transform(data_frame.drop(columns='Customer_Segment'))

In [None]:
# Scatter the current t-SNE coordinates with one colour per customer segment
# and a legend built from the scatter's own colour classes.
fig, ax = plt.subplots()
scatter = ax.scatter(tsne_data[:, 0], tsne_data[:, 1], c=data_frame['Customer_Segment'])
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper right", title="Customer Segment")
ax.add_artist(legend1)

###Perplexity 200

In [None]:
# Embed the feature columns only (class label excluded).
tsne_features = data_frame.drop(columns='Customer_Segment')
# scikit-learn requires perplexity < n_samples and raises otherwise, so cap the
# requested value of 200 when the data set has fewer rows than that.
model=TSNE(perplexity=min(200, len(tsne_features) - 1), random_state=42)
tsne_data=model.fit_transform(tsne_features)

In [None]:
# Scatter the current t-SNE coordinates with one colour per customer segment
# and a legend built from the scatter's own colour classes.
fig, ax = plt.subplots()
scatter = ax.scatter(tsne_data[:, 0], tsne_data[:, 1], c=data_frame['Customer_Segment'])
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper right", title="Customer Segment")
ax.add_artist(legend1)

###Perplexity 10000

In [None]:
# Embed the feature columns only (class label excluded).
tsne_features = data_frame.drop(columns='Customer_Segment')
# scikit-learn requires perplexity < n_samples and raises otherwise, so cap the
# requested value of 10000 at the largest value the data set allows.
model=TSNE(perplexity=min(10000, len(tsne_features) - 1), random_state=42)
tsne_data=model.fit_transform(tsne_features)

In [None]:
# Scatter the current t-SNE coordinates with one colour per customer segment
# and a legend built from the scatter's own colour classes.
fig, ax = plt.subplots()
scatter = ax.scatter(tsne_data[:, 0], tsne_data[:, 1], c=data_frame['Customer_Segment'])
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper right", title="Customer Segment")
ax.add_artist(legend1)

##TABULAR DATA ISOMAP

###Necessary Imports

In [None]:
from sklearn.manifold import Isomap

###Checking the original shape of data frame

In [None]:
# (rows, columns) of the frame before Isomap.
data_frame.shape

###Applying Isomap transformation to the required number of dimensions

In [None]:
# 2-D Isomap embedding of the feature columns only; the class label is dropped
# so it does not shape the embedding it is later used to colour.
# Note: `model` holds the embedded coordinates (an array), not an estimator.
model= Isomap(n_components=2).fit_transform(data_frame.drop(columns='Customer_Segment'))

###Plotting the graph

In [None]:
# Scatter the Isomap coordinates with one colour per customer segment and a
# legend built from the scatter's own colour classes.
fig, ax = plt.subplots()
scatter = ax.scatter(model[:, 0], model[:, 1], c=data_frame['Customer_Segment'])
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper left", title="Customer Segment")
ax.add_artist(legend1)

##TABULAR DATA UMAP

###Installing necessary libraries

In [None]:
# Install umap-learn together with its optional "plot" extras.
!pip install umap-learn[plot]

###Necessary Imports

In [None]:
import umap

###Checking the dimensions of the original data frame

In [None]:
# (rows, columns) of the frame before UMAP.
data_frame.shape

###Using UMAP to reduce the number of dimensions

In [None]:
# UMAP on the feature columns only (class label excluded). random_state is
# fixed so the stochastic embedding is reproducible between runs.
reducer=umap.UMAP(random_state=42)
embedding=reducer.fit_transform(data_frame.drop(columns='Customer_Segment'))
embedding.shape

###Plotting the reduced dimensions graph

In [None]:
# Scatter the two UMAP coordinates with equal axis scaling.
umap_x = embedding[:, 0]
umap_y = embedding[:, 1]
plt.scatter(umap_x, umap_y)
current_axes = plt.gca()
current_axes.set_aspect('equal', 'datalim')
plt.title('UMAP PROJECTION OF THE WINE DATA', fontsize=24)

##IMAGE DATASET PCA

###Necessary Imports

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

###Kaggle setup to load dataset

In [None]:
# Download the Kannada-MNIST competition files with the Kaggle CLI.
!kaggle competitions download -c Kannada-MNIST

In [None]:
!ls
!unzip Dig-MNIST.csv.zip

###Load dataset

In [None]:
# Read the extracted digits CSV into a DataFrame and preview the first rows.
data_frame_image = pd.read_csv('Dig-MNIST.csv')
data_frame_image.head()

In [None]:
# (rows, columns) of the image frame.
data_frame_image.shape

In [None]:
# Summary statistics for the columns of the image frame.
data_frame_image.describe()

In [None]:
# Project the pixel columns onto two principal components. The 'label' column
# is excluded so the digit class is not part of the data being decomposed.
pixel_values = data_frame_image.drop(columns='label')
pca = PCA(2)
projected_value = pca.fit_transform(pixel_values)
# Map the 2-D projection back to pixel space.
projected = pca.inverse_transform(projected_value)
print(data_frame_image.shape)
print(projected_value.shape)

In [None]:
# Render the first ten rows as 28x28 grayscale images in a single row of panels.
image_vec = data_frame_image.drop(columns='label')
plt.figure(figsize=(14, 14))
for digit_num, pixel_row in enumerate(image_vec.iloc[:10].to_numpy()):
    plt.subplot(1, 10, digit_num + 1)
    grid_data = pixel_row.reshape(28, 28)
    plt.imshow(grid_data, interpolation="none", cmap="gray")
    # Hide tick marks; they carry no information for an image thumbnail.
    plt.xticks([])
    plt.yticks([])
plt.show()

In [None]:
# Fit a full PCA on the pixel columns only (digit label excluded) and plot the
# cumulative explained variance.
pca = PCA().fit(data_frame_image.drop(columns='label'))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')

In [None]:
# 2-D PCA projection coloured by digit label.
# 'tab10' provides ten distinct colours; 'Accent' defines only eight, so ten
# classes could not all be told apart. plt.get_cmap replaces plt.cm.get_cmap,
# which Matplotlib 3.9 removed.
plt.scatter(projected_value[:, 0], projected_value[:, 1], c=data_frame_image['label'],
            edgecolors='none', alpha=0.5, cmap=plt.get_cmap('tab10', 10))
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.colorbar()

##IMAGE DATASET SVD

###Necessary Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

###Load Dataset

In [None]:
# Reload the digits CSV so this section starts from the unmodified file.
data_frame_image = pd.read_csv('Dig-MNIST.csv')
data_frame_image.head()

In [None]:
# (rows, columns) of the reloaded image frame.
data_frame_image.shape

In [None]:
# Integer-encode only the class label.
# The other columns are numeric pixel values (they are reshaped and drawn as
# images earlier in the notebook). Running LabelEncoder over them replaces each
# intensity with its rank among the column's unique values, distorting the data
# that the scaler and SVD below operate on.
encoder = LabelEncoder()
data_frame_image['label'] = encoder.fit_transform(data_frame_image['label'])
data_frame_image.head()

In [None]:
# Features: every pixel column. Target: the digit label.
# The positional slice `iloc[:, 1:23]` kept only 22 columns of each image,
# although every non-label column is a pixel of the 28x28 grid drawn earlier.
X_variables = data_frame_image.drop(columns='label')
y_label = data_frame_image['label']

In [None]:
# Standardise the feature columns before the decomposition.
scaler = StandardScaler()
X_features = scaler.fit_transform(X_variables)

In [None]:
# Thin SVD: full_matrices=False keeps `u` at (n_samples, k) with
# k = min(n_samples, n_features), instead of allocating a square
# (n_samples, n_samples) matrix. Only the leading columns of `u` are used later
# and they are unaffected. `s` holds the singular values; `v` is V^T.
u, s, v = np.linalg.svd(X_features, full_matrices=False)

In [None]:
# Shape of the left singular-vector matrix.
u.shape

In [None]:
# Shape of the singular-value array.
s.shape

In [None]:
# Shape of the third array returned by the SVD call.
v.shape

In [None]:
# Share of the total variance carried by each singular value
# (s_i^2 / sum of s^2), rounded to three decimals.
variance_explained = np.round(s**2/np.sum(s**2), decimals=3)

# Show at most the 30 leading components so the chart stays readable when the
# decomposition yields many singular values; shorter arrays are shown in full.
n_shown = min(len(variance_explained), 30)
sns.barplot(x=list(range(1, n_shown+1)),
            y=variance_explained[:n_shown], color="blue")
plt.xlabel('SVs', fontsize=16)
# The plotted values are fractions in [0, 1], not percentages, so label them as such.
plt.ylabel('Fraction of the variance explained', fontsize=15)

In [None]:
# Collect the first ten left singular vectors into a frame, carrying each
# row's digit label along as a 'Class' column.
col_labels = ['SV%d' % i for i in range(1, 11)]
svd_df = (
    pd.DataFrame(u[:, 0:10], index=data_frame_image['label'].tolist(), columns=col_labels)
    .reset_index()
    .rename(columns={'index': 'Class'})
)
svd_df.head()

In [None]:
# Give the integer classes 0-9 readable legend names. The previous names
# ('SV1'..'SV10') duplicated the singular-vector column names, so the legend
# read as if the classes were singular vectors.
# Assumes the class codes equal the digit values — confirm against the label encoding.
svd_df['Class'] = svd_df['Class'].replace({digit: 'Digit {0}'.format(digit) for digit in range(10)})

In [None]:
# First two singular-vector coordinates, coloured by class.
sns.scatterplot(x="SV1", y="SV2", hue="Class", data=svd_df, s=105, alpha=0.5)
# Fix the label to one decimal: multiplying the rounded fraction by 100 can
# otherwise print floating-point noise in the axis label.
plt.xlabel('SV 1: {0:.1f}%'.format(variance_explained[0]*100), fontsize=15)
plt.ylabel('SV 2: {0:.1f}%'.format(variance_explained[1]*100), fontsize=15)

##IMAGE DATASET LLE

###Necessary Import

In [None]:
from sklearn.manifold import LocallyLinearEmbedding

###Check the dimensions

In [None]:
# (rows, columns) of the image frame before dimensionality reduction.
data_frame_image.shape

### Specify the number of dimensions to reduce the data to

In [None]:
# Fit a 2-component LLE on the pixel columns only. The digit label is dropped
# so the embedding is not computed from the column used to colour the plot.
embedding = LocallyLinearEmbedding(n_components=2).fit(data_frame_image.drop(columns='label'))

In [None]:
# Display the fitted estimator.
embedding

###Transform the data frame to the given number of dimensions

In [None]:
# Reuse the coordinates learned when `embedding` was fitted. Calling
# fit_transform here would repeat the entire (expensive) LLE fit a second time.
lle = embedding.embedding_

###Checking the new dimensions

In [None]:
# Shape of the embedded coordinates.
lle.shape

###Visualisation of data

In [None]:
# 2-D LLE embedding coloured by digit label, with a 10-step discrete colour bar.
plt.figure(figsize=(10,10))
vis_x = lle[:, 0]
vis_y = lle[:, 1]
# plt.get_cmap replaces plt.cm.get_cmap, which Matplotlib 3.9 removed.
plt.scatter(vis_x, vis_y, c=data_frame_image['label'], cmap=plt.get_cmap("jet", 10), marker='.')
plt.colorbar(ticks=range(10))
# Centre each of the ten colour bands on an integer label.
plt.clim(-0.5, 9.5)
plt.show()

##IMAGE DATASET T-SNE

###Necessary Imports

In [None]:
from sklearn.manifold import TSNE

###Checking the dimensions of original data

In [None]:
# (rows, columns) of the image frame before t-SNE.
data_frame_image.shape

###Specifying the dimensions to which it needs to be reduced

In [None]:
# 2-D t-SNE. A fixed random_state makes the stochastic layout reproducible
# across kernel restarts.
model=TSNE(n_components=2, random_state=42)

###Transform the data to the reduced number of dimensions

In [None]:
# Embed the pixel columns only. The digit label is excluded so it cannot
# influence the layout that is later coloured by that same label.
tsne_data=model.fit_transform(data_frame_image.drop(columns='label'))

###Check the new dimensions

In [None]:
# Shape of the t-SNE coordinates.
tsne_data.shape

The "perplexity" parameter (loosely) controls how t-SNE balances attention between local and global aspects of the data. It is, in a sense, a guess about the number of close neighbors each point has.
###Perplexity 100

In [None]:
# t-SNE with perplexity 100 on the pixel columns only (digit label excluded).
# random_state is fixed so the layout is reproducible.
model=TSNE(perplexity=100, random_state=42)
tsne_data=model.fit_transform(data_frame_image.drop(columns='label'))

In [None]:
# Scatter the current t-SNE coordinates with one colour per digit and a legend
# built from the scatter's own colour classes.
fig, ax = plt.subplots()
scatter = ax.scatter(tsne_data[:, 0], tsne_data[:, 1], c=data_frame_image['label'])
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper right", title="Digits")
ax.add_artist(legend1)

###Perplexity 200

In [None]:
# t-SNE with perplexity 200 on the pixel columns only (digit label excluded).
# random_state is fixed so the layout is reproducible.
model=TSNE(perplexity=200, random_state=42)
tsne_data=model.fit_transform(data_frame_image.drop(columns='label'))

In [None]:
# Scatter the current t-SNE coordinates with one colour per digit and a legend
# built from the scatter's own colour classes.
fig, ax = plt.subplots()
scatter = ax.scatter(tsne_data[:, 0], tsne_data[:, 1], c=data_frame_image['label'])
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper right", title="Digits")
ax.add_artist(legend1)

###Perplexity 10000

In [None]:
# t-SNE with perplexity 10000 on the pixel columns only (digit label excluded).
# random_state is fixed so the layout is reproducible.
# NOTE(review): scikit-learn requires perplexity < n_samples; confirm the data
# set has more than 10000 rows, and expect this cell to be slow.
model=TSNE(perplexity=10000, random_state=42)
tsne_data=model.fit_transform(data_frame_image.drop(columns='label'))

In [None]:
# Scatter the current t-SNE coordinates with one colour per digit and a legend
# built from the scatter's own colour classes.
fig, ax = plt.subplots()
scatter = ax.scatter(tsne_data[:, 0], tsne_data[:, 1], c=data_frame_image['label'])
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper right", title="Digits")
ax.add_artist(legend1)

##IMAGE DATASET ISOMAP

###Necessary Imports

In [None]:
from sklearn.manifold import Isomap

###Checking the original shape of data frame

In [None]:
# (rows, columns) of the image frame before Isomap.
data_frame_image.shape

###Applying Isomap transformation to the required number of dimensions

In [None]:
# 2-D Isomap embedding of the pixel columns only; the digit label is dropped so
# it does not shape the embedding it is later used to colour.
# Note: `model` holds the embedded coordinates (an array), not an estimator.
model= Isomap(n_components=2).fit_transform(data_frame_image.drop(columns='label'))

###Plotting the graph

In [None]:
# Scatter the Isomap coordinates with one colour per digit and a legend built
# from the scatter's own colour classes.
fig, ax = plt.subplots()
scatter = ax.scatter(model[:, 0], model[:, 1], c=data_frame_image['label'])
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper left", title="Digits")
ax.add_artist(legend1)

##IMAGE DATASET UMAP

###Installing necessary libraries

In [None]:
# Install umap-learn together with its optional "plot" extras.
!pip install umap-learn[plot]

###Necessary Imports

In [None]:
import umap

###Checking the dimensions of the original data frame

In [None]:
# (rows, columns) of the image frame before UMAP.
data_frame_image.shape

###Using UMAP to reduce the number of dimensions

In [None]:
# UMAP on the pixel columns only (digit label excluded). random_state is fixed
# so the stochastic embedding is reproducible between runs.
reducer=umap.UMAP(random_state=42)
embedding=reducer.fit_transform(data_frame_image.drop(columns='label'))
embedding.shape

###Plotting the reduced dimensions graph

In [None]:
# Scatter the two UMAP coordinates with equal axis scaling.
umap_x = embedding[:, 0]
umap_y = embedding[:, 1]
plt.scatter(umap_x, umap_y)
current_axes = plt.gca()
current_axes.set_aspect('equal', 'datalim')
plt.title('UMAP PROJECTION OF THE KANNADA DIGITS DATA', fontsize=24)