# AIM :- Singular Value Decomposition

Dimensionality reduction using Singular Value Decomposition.

# Packages Used

In [None]:
# Importing necessary libraries

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity as cs

# Importing data from json file

In [None]:
# The file holds one JSON record per line (JSON Lines), hence lines=True.
press_release_path = 'Department of Justice 2009-2018 Press Releases.json'
df = pd.read_json(press_release_path, lines=True)
df.head()

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13087 entries, 0 to 13086
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12810 non-null  object
 1   title       13087 non-null  object
 2   contents    13087 non-null  object
 3   date        13087 non-null  object
 4   topics      13087 non-null  object
 5   components  13087 non-null  object
dtypes: object(6)
memory usage: 613.6+ KB


# Data Pre-processing

In [None]:
def toLower(sentence):
    """Return `sentence` converted to lowercase."""
    lowered = sentence.lower()
    return lowered

def tokenizer(sentence):
    """Return the distinct NLTK word tokens found in `sentence`.

    Duplicates are removed by passing the tokens through a set, so the order
    of the returned list is not defined.
    """
    unique_tokens = set(nltk.word_tokenize(sentence))
    return list(unique_tokens)

# Punctuation marks and contraction fragments that are filtered out in
# addition to the NLTK English stop words.
_EXTRA_STOP_TOKENS = (
    ',', '?', '""', "''", '.', '!', "'", '"', "'d", "'ll", '[', ']', '--', ':', ';', '///', '@', '``',
    '#', '$', '%', '&', "'re", "'s", '(', ')', '*', '**', '**the', '-', '/', '//',
    '§', '§§', '...', '–', '—', '‘', '’', '“', '”', '•', '─', "'m", "'ve", '***',
)

# Lazily-built combined stop-word set; None until the first call.
_STOP_WORD_CACHE = None


def _stop_word_set():
    """Build the combined stop-word set once and reuse it on later calls."""
    global _STOP_WORD_CACHE
    if _STOP_WORD_CACHE is None:
        english = nltk.corpus.stopwords.words('english')
        _STOP_WORD_CACHE = frozenset(english).union(_EXTRA_STOP_TOKENS)
    return _STOP_WORD_CACHE


def stopwords_removal(tokens):
    """Drop English stop words and punctuation tokens from `tokens`.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter. Matching is exact (case-sensitive).

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order and with
        duplicates preserved.
    """
    # Set membership is O(1); the original rebuilt a list on every call and
    # scanned it linearly for each token.
    stop_words = _stop_word_set()
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    """Run NLTK's Porter stemmer over every token and return the stems as a list."""
    porter = nltk.stem.porter.PorterStemmer()
    return list(map(porter.stem, tokens))

def pre_process(text):
    """Tokenize `text`, remove stop words and return the stemmed tokens.

    The text is split into sentences, each sentence into word tokens, and the
    flattened token list is passed through `stopwords_removal` and `stemming`.
    """
    words = []
    for sentence in nltk.sent_tokenize(text):
        words.extend(nltk.word_tokenize(sentence))
    return stemming(stopwords_removal(words))

# Creating TF-IDF vector

In [None]:
# Vectorizer settings: cap the vocabulary at 1000 features, ignore stems seen
# in fewer than 5 documents or in more than 95% of them, and tokenize with
# pre_process.
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,
    min_df=5,
    max_df=0.95,
    tokenizer=pre_process,
)

# Fit on the first 1000 press-release bodies and vectorize them in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['contents'][:1000])

print(tfidf_matrix.shape)

(1000, 1000)


In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    # .tolist() yields plain Python strings so the printed list looks the same
    # as with the old API.
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
print(terms[500:600])

['iraq', 'isi', 'island', 'issu', 'item', 'j.', 'jackson', 'jame', 'jan.', 'januari', 'jason', 'jeff', 'jersey', 'john', 'johnson', 'join', 'joint', 'joseph', 'jr.', 'judg', 'judgment', 'juli', 'june', 'juri', 'jurisdict', 'justic', 'juvenil', 'k.', 'keep', 'kenneth', 'key', 'kickback', 'kill', 'know', 'known', 'l.', 'la', 'labor', 'land', 'lanni', 'larg', 'largest', 'last', 'later', 'launch', 'launder', 'law', 'lawsuit', 'lead', 'leader', 'leadership', 'learn', 'least', 'led', 'legal', 'legisl', 'lesli', 'level', 'life', 'like', 'limit', 'line', 'list', 'litig', 'live', 'lo', 'local', 'locat', 'long', 'look', 'loretta', 'loss', 'lynch', 'm.', 'made', 'mail', 'maintain', 'major', 'make', 'man', 'manag', 'mandatori', 'mani', 'manufactur', 'march', 'mari', 'mark', 'market', 'marshal', 'maryland', 'mass', 'massachusett', 'materi', 'matter', 'matthew', 'maximum', 'may', 'mean', 'media', 'medic']


# Implementing SVD for dimension reduction

In [None]:
# Show the container type of the TF-IDF matrix before decomposing it.
print(type(tfidf_matrix))

<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
# Take a full-range slice of the TF-IDF matrix, then show its dense form and shape.
A = tfidf_matrix[:]
for summary in (A.toarray(), A.shape):
    print(summary)

[[0.         0.         0.         ... 0.         0.03361153 0.        ]
 [0.0575137  0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.03660433 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(1000, 1000)


In [None]:
# A has one row per document and one column per term. The query fold-in used
# further down (q . U_k . Sigma_k^-1, compared against rows of V_k) needs U to
# index terms and V to index documents, so decompose the term-document matrix
# A.T. (With 1000 documents and 1000 features the shapes are the same either
# way, which hides the mismatch.)
U, s, VT = svds(A.T, k=500)

# svds does not guarantee the order of the singular values (they typically come
# back ascending). Sort descending so the leading columns of U / rows of VT are
# the strongest components.
order = np.argsort(s)[::-1]
U, s, VT = U[:, order], s[order], VT[order, :]

print(U)
print()
print(s)
print()
print(VT)

[[-0.03207787 -0.00141767  0.03337429 ... -0.02455407 -0.02166643
   0.02733934]
 [-0.00155395  0.02390624 -0.00220712 ...  0.01506205 -0.01897435
   0.02452758]
 [ 0.0359787  -0.03712263 -0.08938989 ...  0.03070949 -0.02230946
   0.02034995]
 ...
 [-0.01652088 -0.05701718 -0.01534954 ...  0.00170475 -0.03007071
   0.02276049]
 [-0.0066499  -0.0219085  -0.00069967 ... -0.00393887 -0.02607862
   0.02186175]
 [-0.00616041  0.02154413 -0.04202505 ...  0.00649058 -0.02787535
   0.01996485]]

[ 0.48165118  0.48274497  0.48561584  0.48706087  0.48733693  0.48875606
  0.48959054  0.49158655  0.49423379  0.49520584  0.49565786  0.49598863
  0.49794388  0.49892476  0.50112651  0.5025861   0.50381426  0.5069102
  0.50783039  0.50852032  0.50920669  0.50972133  0.51165689  0.51426271
  0.51490325  0.51669964  0.5171164   0.51868817  0.52145026  0.52229675
  0.52396516  0.52440278  0.52636889  0.52766542  0.52903957  0.53009376
  0.5323325   0.53338839  0.53418718  0.53522491  0.53784     0.539490

In [None]:
# Shapes of the left singular vectors, singular values and right singular vectors.
print(U.shape,s.shape,VT.shape)

(1000, 500) (500,) (500, 1000)


In [None]:
# TruncatedSVD uses a randomized solver by default; fix the seed so the
# reduced matrix is reproducible across kernel restarts.
svd = TruncatedSVD(n_components=500, random_state=42)
# fit_transform fits the model and returns the reduced matrix in one pass.
result = svd.fit_transform(A)
print(result)

[[ 0.30003669 -0.16095952  0.11879024 ...  0.03291008  0.01354934
  -0.01580087]
 [ 0.26917892 -0.14096012 -0.07286874 ... -0.00452893 -0.00675109
  -0.00701817]
 [ 0.22333133 -0.16573662 -0.14856959 ... -0.00443479 -0.01413006
   0.0137724 ]
 ...
 [ 0.24978592 -0.22339477 -0.0082474  ...  0.00672511 -0.00157481
  -0.0123192 ]
 [ 0.23992262 -0.19373762  0.01905586 ... -0.01274408 -0.03863373
  -0.00073805]
 [ 0.219105   -0.20708548 -0.03140082 ...  0.00077568 -0.01969546
   0.0059317 ]]


In [None]:
# Shape of the matrix returned by TruncatedSVD.transform.
print(result.shape)

(1000, 500)


# Concept Space

In [None]:
# Concept space: keep only the k strongest singular components.
k = 5

# Select the k largest singular values explicitly; svds does not guarantee
# the order in which it returns them.
top_k = np.argsort(s)[::-1][:k]

U_k = U[:, top_k]    # left singular vectors of the kept components
V_k = VT[top_k, :]   # right singular vectors of the kept components
# Sigma_k: diagonal matrix of the kept singular values. Its inverse is what the
# query fold-in needs; a slice of the TruncatedSVD output is not Sigma_k.
result_k = np.diag(s[top_k])

print(U_k.shape)
print(V_k.shape)
print(result_k.shape)

(1000, 5)
(5, 1000)
(5, 5)


In [None]:
# Vectorize the query with the already-fitted TF-IDF vocabulary.
query = ['Department of Justice']
query_tfidf = tfidf_vectorizer.transform(query)
query_matrix = query_tfidf.toarray()
print(query_matrix.shape)

(1, 1000)


In [None]:
# Fold the query into the k-dimensional space: q_k = q . U_k . result_k^-1.
q_t = query_matrix[:]
inverse = np.linalg.inv(result_k)
print(inverse.shape)
q_k = q_t @ U_k @ inverse
print(q_k.shape)

(5, 5)
(1, 5)


In [None]:
# Transpose V_k so that each row is one k-dimensional vector.
V_k_final = V_k.T
print(V_k_final.shape)

(1000, 5)


In [None]:
# Cosine similarity between every row of V_k_final and the projected query,
# computed in one call instead of a Python loop over a hard-coded 1000 rows
# (which would break or silently truncate if the row count changed).
# cs returns an (n_rows, 1) array; list() splits it into one length-1 array
# per row, the same structure the row-by-row loop produced.
similarity = list(cs(V_k_final, q_k))

In [None]:
# Sort the scores ascending in place; the extremes then sit at the two ends.
similarity.sort()
lowest, highest = similarity[0], similarity[-1]
print('Highest similarity', highest)
print('Lowest similarity', lowest)

Highest similarity [0.9748728]
Lowest similarity [-0.99144246]


# Learning Outcomes

    1. In this practical, we learned how to perform dimensionality reduction using SVD.
    2. We learned how to project a matrix into a lower-dimensional concept space.
    3. We learned that TF-IDF produces a sparse matrix.
    4. We learned how SVD works.
    5. Dimensionality reduction techniques such as PCA and SVD can improve the performance of an IR system.
    6. With SVD, retrieval time improved by nearly 4 times compared with using the full TF-IDF matrix.