In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from string import digits, punctuation

##**Question 1.**

In [19]:
data = fetch_20newsgroups()
news = fetch_20newsgroups(subset = 'train')['data']
news[:3]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [4]:
# pre-processing function

stop_words = {"'ll", 'a', 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act', 'actually', 'added', 'adj', 'affected', 'affecting', 'affects', 'after', 'afterwards', 'again', 'against', 'ah', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'announce', 'another', 'any', 'anybody', 'anyhow', 'anymore', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apparently', 'approximately', 'are', 'aren', 'arent', 'arise', 'around', 'as', 'aside', 'ask', 'asking', 'at', 'auth', 'available', 'away', 'awfully', 'b', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'begin', 'beginning', 'beginnings', 'begins', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'between', 'beyond', 'biol', 'both', 'brief', 'briefly', 'but', 'by', 'c', 'ca', 'came', 'can', "can't", 'cannot', 'cause', 'causes', 'certain', 'certainly', 'co', 'com', 'come', 'comes', 'contain', 'containing', 'contains', 'could', 'couldnt', 'd', 'date', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'don', "don't", 'done', 'down', 'downwards', 'due', 'during', 'e', 'each', 'ed', 'edu', 'effect', 'eg', 'eight', 'eighty', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'especially', 'et', 'et-al', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'except', 'f', 'far', 'few', 'ff', 'fifth', 'first', 'five', 'fix', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'found', 'four', 'from', 'further', 'furthermore', 'g', 'gave', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'giving', 'go', 'goes', 'gone', 'got', 'gotten', 'h', 'had', 'happens', 'hardly', 'has', "hasn't", 'have', "haven't", 'having', 'he', 'hed', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'heres', 'hereupon', 'hers', 'herself', 'hes', 'hi', 'hid', 'him', 'himself', 'his', 'hither', 'home', 'how', 'howbeit', 'however', 'hundred', 'i', "i'll", "i've", 'id', 'ie', 'if', 'im', 'immediate', 'immediately', 'importance', 'important', 'in', 'inc', 'indeed', 'index', 'information', 'instead', 'into', 'invention', 'inward', 'is', "isn't", 'it', "it'll", 'itd', 'its', 'itself', 'j', 'just', 'k', 'keep \tkeeps', 'kept', 'kg', 'km', 'know', 'known', 'knows', 'l', 'largely', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'line', 'little', 'look', 'looking', 'looks', 'ltd', 'm', 'made', 'mainly', 'make', 'makes', 'many', 'may', 'maybe', 'me', 'mean', 'means', 'meantime', 'meanwhile', 'merely', 'mg', 'might', 'million', 'miss', 'ml', 'more', 'moreover', 'most', 'mostly', 'mr', 'mrs', 'much', 'mug', 'must', 'my', 'myself', 'n', 'na', 'name', 'namely', 'nay', 'nd', 'near', 'nearly', 'necessarily', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'ninety', 'no', 'nobody', 'non', 'none', 'nonetheless', 'noone', 'nor', 'normally', 'nos', 'not', 'noted', 'nothing', 'now', 'nowhere', 'o', 'obtain', 'obtained', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'omitted', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'ord', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'owing', 'own', 'p', 'page', 'pages', 'part', 'particular', 'particularly', 'past', 'per', 'perhaps', 'placed', 'please', 'plus', 'poorly', 'possible', 'possibly', 'potentially', 'pp', 'predominantly', 'present', 'previously', 'primarily', 'probably', 'promptly', 'proud', 'provides', 'put', 'q', 'que', 'quickly', 'quite', 'qv', 'r', 'ran', 'rather', 'rd', 're', 'readily', 'really', 'recent', 'recently', 'ref', 'refs', 'regarding', 'regardless', 'regards', 'related', 'relatively', 'research', 'respectively', 'resulted', 'resulting', 'results', 'right', 'run', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 'sec', 'section', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sent', 'seven', 'several', 'shall', 'she', "she'll", 'shed', 'shes', 'should', "shouldn't", 'show', 'showed', 'shown', 'showns', 'shows', 'significant', 'significantly', 'similar', 'similarly', 'since', 'six', 'slightly', 'so', 'some', 'somebody', 'somehow', 'someone', 'somethan', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specifically', 'specified', 'specify', 'specifying', 'still', 'stop', 'strongly', 'sub', 'substantially', 'successfully', 'such', 'sufficiently', 'suggest', 'sup', 'sure', 't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'you', 'your', 'yours', 'yourself', 'yourselves'}
dig = tuple(digits)
punctuation = punctuation.replace("'", "").replace("_","")
def preprocess(text:str):
  text = str(text).lower() if text.__class__ is not str else text.lower()
  for pun in punctuation:
    text = text.replace(pun, " ")
  word_list = [word for word in text.split()
              if word not in stop_words
               and not word.isnumeric()
               and not word.startswith(dig)
               and not word.endswith(dig)]
  return word_list

# pre-processing the corpus:

preprocessed: list[str] = []
for document in news:
  pps = preprocess(document)
  preprocessed.append(" ".join(pps))

In [6]:
preprocessed[:3]

["lerxst wam umd where's thing subject car nntp posting host wam umd organization university maryland college park lines wondering enlighten car day door sports car looked late early called bricklin doors small addition front bumper separate rest body tellme model engine specs years production car history whatever info funky car mail thanks il brought neighborhood lerxst",
 'guykuo carson u washington guy kuo subject si clock poll final call summary final call si clock reports keywords si acceleration clock upgrade article shelley organization university washington lines nntp posting host carson u washington fair number brave souls upgraded si clock oscillator shared experiences poll send message detailing experiences procedure top speed attained cpu rated speed add cards adapters heat sinks hour usage day floppy disk functionality floppies requested summarizing two days add network knowledge base clock upgrade answered poll thanks guy kuo guykuo u washington',
 "twillis ec ecn purdue 

In [12]:
# TF-IDF matrix:

vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 1000)
tfidf_matrix = pd.DataFrame(vectorizer.fit_transform(news).toarray(), columns = vectorizer.get_feature_names_out())
tfidf_matrix

Unnamed: 0,00,000,01,02,03,04,0d,0t,10,100,...,write,writes,written,wrong,wrote,year,years,yes,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.116702,0.000000,0.00000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.061716,0.0,0.000000,0.133857,0.0,0.000000,0.000000,0.00000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.054855,0.0,0.000000,0.000000,0.0,0.000000,0.120463,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.085945,0.000000,0.11998,0.0
11310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0
11311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0
11312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.040713,0.0,0.088302,0.000000,0.0,0.000000,0.000000,0.00000,0.0


##**Question 2.**

In [17]:
query='university'
query_vector = vectorizer.transform([query])
query_vector.shape
X_vector = vectorizer.transform(news)

In [18]:
cosineSimilarities = cosine_similarity(X_vector,query_vector).flatten()
cosineSimilarities, len(cosineSimilarities), max(cosineSimilarities)

(array([0.07139339, 0.04888636, 0.03987126, ..., 0.        , 0.        ,
        0.        ]),
 11314,
 0.4371073805073975)

##**Question 3.**

In [20]:
related_docs_indices = cosineSimilarities.argsort()[:-10:-1]
print(related_docs_indices)

for i in related_docs_indices:
    data1 = [data.data[i]]
    print(data1)
    print(cosineSimilarities[i])

[3472 6294 5603 8282 6101 6324  194 7961 6452]
['From: swdwan@napier.uwaterloo.ca (Donald Wan)\nSubject: just testing\nOrganization: University of Waterloo\nLines: 3\n\nhello testing\n\n\n']
0.4371073805073975
['From: lynn@granitt.uio.no (Malcolm Lynn)\nSubject: Re: Sexual Proposition = Sexual Harassment?\nLines: 2\nNntp-Posting-Host: pcgeo23.uio.no\nOrganization: University of Oslo\nLines: 3\n\n\nthis is a tesrt\ns\n']
0.43309670719816157
['From: sera@zuma.UUCP (Serdar Argic)\nSubject: A moment of silence for the perpetrators of the Turkish Genocide?\nReply-To: sera@zuma.UUCP (Serdar Argic)\nDistribution: world\nLines: 115\n\nIn article <48299@sdcc12.ucsd.edu> ma170saj@sdcc14.ucsd.edu (System Operator) writes:\n\n>    April 24th is approaching, and Armenians around the world\n>are getting ready to remember the massacres of their family members\n\nCelebrating in joy the cold-blooded genocide of 2.5 million Muslim \npeople by your criminal grandparents between 1914 and 1920? Did you \nt