# ADS ASSIGNMENT VI

Name:     Sanidhya \
SID:    22106024 \
Branch: Computer Science Engineering (Data Science)

In [1]:
import pandas as pd
import numpy

import scipy.sparse
from scipy.linalg import svd

import sklearn.datasets as datasets
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits


## Q1: Load the digits dataset using `load_digits` from sklearn.datasets.
#### This dataset is made up of 1797 8x8 images, each of a hand-written digit. Each training example therefore has 64 features (the 8x8 pixel values), so the size of the dataset is 1797x64. Implement SVD in the following two ways:
#### (a) Step-by-step

In [2]:
# Q1(a): reduce the digits data by decomposing its feature covariance matrix
# and keeping enough leading directions to retain 95% of the total variance.

# Load the dataset
digits = load_digits()
X = digits.data

# Step 1: Compute the covariance matrix of the features (rows of X.T).
# numpy.cov subtracts each feature's mean internally, so the directions found
# below describe the mean-centred data.
cov_matrix = numpy.cov(X.T)

# Step 2: Perform Singular Value Decomposition (SVD) on the covariance matrix
U, S, Vt = numpy.linalg.svd(cov_matrix)

# Step 3: Singular values in descending order.
# numpy.linalg.svd already returns S in descending order, matching the column
# order of U, so this sort does not reorder anything; it is kept so that
# `sorted_singular_values` stays available under its original name.
sorted_singular_values = numpy.sort(S)[::-1]

# Step 4: Compute the cumulative sum of singular values
cumulative_sum = numpy.cumsum(sorted_singular_values)

# Step 5: Determine the number of components to keep (e.g., 95% variance retained)
threshold_variance = 0.95
total_variance = cumulative_sum[-1]
# searchsorted gives the first index whose cumulative sum reaches the target;
# +1 converts that zero-based index into a component count.
num_components = numpy.searchsorted(cumulative_sum, threshold_variance * total_variance) + 1

# Step 6: Project data onto the selected components.
# The covariance matrix was built from centred data, so the data must be
# centred with the same per-feature means before projecting; otherwise every
# projected coordinate carries a constant offset of mean @ U_reduced.
X_centered = X - X.mean(axis=0)
U_reduced = U[:, :num_components]
X_reduced = numpy.dot(X_centered, U_reduced)

print("Original data shape:", X.shape)
print("Reduced data shape:", X_reduced.shape)


Original data shape: (1797, 64)
Reduced data shape: (1797, 29)


#### (b) Using inbuilt SVD function in python

In [3]:
# Q1(b): factorise the digits matrix directly with NumPy's built-in SVD.
digits = load_digits()
X = digits.data

# full_matrices=False requests the reduced ("economy") factorisation.
U, S, Vt = numpy.linalg.svd(X, full_matrices=False)

# Report the shape of the input next to the shape of the left factor U.
for label, shape in (
    ("Original data shape:", X.shape),
    ("SVD components shape:", U.shape),
):
    print(label, shape)


Original data shape: (1797, 64)
SVD components shape: (1797, 64)


## Q2: Download the IMDB dataset of movie reviews from the following link
#### IMDB dataset having 50K movie reviews for natural language processing or Text analytics. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets.


In [4]:
# (a) Read the IMDB reviews CSV into a dataframe and preview its first ten rows.

data = pd.read_csv("IMDB Dataset.csv")
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [5]:
# (b) Remove the sentiment label and keep only the first 1000 reviews.

final_data = data.drop(columns=['sentiment']).head(1000)
final_data.head()

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."


In [6]:
# (c) The corpus is the review text column of the 1000-row frame.

corpus = final_data.loc[:, 'review']
corpus

0      One of the other reviewers has mentioned that ...
1      A wonderful little production. <br /><br />The...
2      I thought this was a wonderful way to spend ti...
3      Basically there's a family where a little boy ...
4      Petter Mattei's "Love in the Time of Money" is...
                             ...                        
995    Nothing is sacred. Just ask Ernie Fosselius. T...
996    I hated it. I hate self-aware pretentious inan...
997    I usually try to be professional and construct...
998    If you like me is going to see this in a film ...
999    This is like a zoology textbook, given that it...
Name: review, Length: 1000, dtype: object

In [7]:
# (d) Build the binary bag-of-words matrix of size m x n, where m = 1000 is the
# number of review documents and n is the number of unique terms found in them.
# Entry (i, j) is 1 when term j occurs in review i and 0 otherwise
# (binary=True records presence rather than counts).

vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(raw_documents=corpus)

# Show the sparse row for the first review.
X[0]

<1x17922 sparse matrix of type '<class 'numpy.int64'>'
	with 186 stored elements in Compressed Sparse Row format>

In [8]:
# (e) Compute the n x n co-occurrence matrix whose (i, j) entry is the number of
# documents in which terms i and j both occur, using the binary BOW matrix X.

# `@` is always a matrix product. `*` is only a matrix product for the legacy
# scipy.sparse *matrix* classes and is element-wise for sparse *arrays*, so
# `@` gives the intended result for either container.
co_occurrence_matrix = X.T @ X

# With binary X the diagonal holds each term's own document frequency, not a
# co-occurrence between two different terms, so it is zeroed out.
co_occurrence_matrix.setdiag(0)

# setdiag(0) overwrites the stored diagonal entries with explicit zeros;
# drop them so the sparse structure only holds genuine non-zero counts.
co_occurrence_matrix.eliminate_zeros()

In [9]:
# (f) Using the Truncated SVD method, find the reduced matrix of the
# co-occurrence matrix with the number of components set to 100.

# TruncatedSVD uses a randomized solver by default, so a fixed random_state is
# needed for the embeddings to be reproducible across runs (42 matches the seed
# used for the train/test split later in this notebook).
# NOTE: the name `svd` rebinds the `scipy.linalg.svd` function imported in the
# first cell; that import is no longer reachable under this name afterwards.
svd = TruncatedSVD(n_components=100, random_state=42)
reduced_matrix = svd.fit_transform(co_occurrence_matrix)

In [10]:
# (g) Each of the n rows of the n x 100 reduced matrix is a 100-dimensional
# word embedding for one vocabulary term.

embedding_shape = reduced_matrix.shape
print("Shape of reduced matrix:", embedding_shape)

Shape of reduced matrix: (17922, 100)


## Q3: Implement LDA (step-by-step) on IRIS dataset.

In [11]:
# Q3: fit a Linear Discriminant Analysis classifier on the Iris data.
iris = datasets.load_iris()

X = iris.data
y = iris.target

# Split the dataset into training and testing sets BEFORE scaling, so that the
# scaler's mean and variance are estimated from the training rows only and no
# information from the held-out rows leaks into preprocessing.
# (Same test_size and random_state as before.)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: fit on the training split, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so the name `X_scaled` remains defined for any later cell; it is the
# full dataset transformed with the training-split statistics.
X_scaled = scaler.transform(X)

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

In [12]:
# Classify the held-out samples with the fitted LDA model.
y_pred = lda.predict(X_test)

# Fraction of held-out samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
