<a href="https://colab.research.google.com/github/VikramMadhavSarvagyam/IT609-Assignment2/blob/main/IT609_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
def tfidf_vectorize(documents):
    """Fit a TF-IDF model on ``documents`` and vectorize them in one pass.

    Args:
        documents: The corpus to index, passed straight to
            ``TfidfVectorizer.fit_transform``.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted ``TfidfVectorizer`` and
        the value its ``fit_transform`` call returned for ``documents``.
    """
    vectorizer = TfidfVectorizer()
    return vectorizer, vectorizer.fit_transform(documents)

def calculate_cosine_similarity(tfidf_vectorizer, tfidf_matrix, document1, document2):
    """Return the cosine similarity between two raw text documents.

    Each document is projected with the already-fitted ``tfidf_vectorizer``
    and the two resulting vectors are compared with ``cosine_similarity``.

    Args:
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Not used by this function; kept in the signature so
            existing callers keep working.
        document1: Text of the first document.
        document2: Text of the second document.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    # Vectorize document1 first, then document2 (one ``transform`` call each).
    first_vector, second_vector = (
        tfidf_vectorizer.transform([text]) for text in (document1, document2)
    )
    pairwise = cosine_similarity(first_vector, second_vector)
    return pairwise[0][0]

def document_similarity_search(query_document, documents, tfidf_vectorizer, tfidf_matrix, top_k=100):
    """Rank ``documents`` by TF-IDF cosine similarity to ``query_document``.

    The query is vectorized once and compared against every corpus row in a
    single ``cosine_similarity`` call, rather than re-vectorizing the query
    and one document per loop iteration.

    Args:
        query_document: Raw text of the query.
        documents: Sequence of corpus texts. ``documents[i]`` is expected to
            correspond to row ``i`` of ``tfidf_matrix``; only the row count is
            checked here, not the correspondence itself.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Precomputed TF-IDF rows for ``documents``. When it is
            ``None`` or its row count differs from ``len(documents)``, the
            documents are vectorized again in one batch instead.
        top_k: Maximum number of results to return (default 100, the value
            that used to be hard-coded). ``None`` returns every document.

    Returns:
        A ``(ranked_documents, ranked_similarities)`` pair of lists, ordered
        from most to least similar. Both are empty when ``documents`` is empty.
    """
    document_count = len(documents)
    if document_count == 0:
        # Nothing to rank; also avoids handing a zero-row matrix to sklearn.
        return [], []

    if tfidf_matrix is not None and tfidf_matrix.shape[0] == document_count:
        document_vectors = tfidf_matrix
    else:
        # The supplied matrix cannot describe these documents; rebuild it in one batch.
        document_vectors = tfidf_vectorizer.transform(documents)

    query_vector = tfidf_vectorizer.transform([query_document])
    # Result has shape (1, n_documents); flatten to one score per document.
    similarities = np.asarray(cosine_similarity(query_vector, document_vectors)).ravel()

    # argsort is ascending, so reverse it for descending similarity, then truncate.
    ranked_indices = np.argsort(similarities)[::-1][:top_k]
    ranked_documents = [documents[i] for i in ranked_indices]
    ranked_similarities = [similarities[i] for i in ranked_indices]

    return ranked_documents, ranked_similarities

# Build the TF-IDF index over the 20 Newsgroups training split, requesting that
# headers, footers and quotes be removed from each post.
documents = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes')).data
tfidf_vectorizer, tfidf_matrix = tfidf_vectorize(documents)

# Read the query text; the context manager closes the file handle even if the read fails.
file_name = input("Enter file name: ")
with open(file_name, 'r') as query_file:
    file_content = query_file.read()

ranked_documents, ranked_similarities = document_similarity_search(file_content, documents, tfidf_vectorizer, tfidf_matrix)

print("Top 100 Similar Documents in sklearn's fetch_20newsgroups dataset:")
print("Document Number | Similarity Score")
print("_"*30)
# The first column is the rank (1 = most similar), not the document's index in the dataset.
for i, sim in enumerate(ranked_similarities, start=1):
    print(f"{i}\t\t{sim:.4f}")

# Bind the best match outside the loop so an empty result cannot raise NameError below.
most_similar = ranked_documents[0] if ranked_documents else ""

print('_'*30)
print("Most similar document in sklearn's fetch_20newsgroups dataset: ")
print('_'*30)
print(most_similar)


Enter file name: /content/input text file.txt
Top 100 Similar Documents in sklearn's fetch_20newsgroups dataset:
Document Number | Similarity Score
______________________________
1		0.4066
2		0.3916
3		0.3634
4		0.3591
5		0.3578
6		0.3569
7		0.3492
8		0.3294
9		0.3159
10		0.3110
11		0.3074
12		0.3019
13		0.3009
14		0.2991
15		0.2955
16		0.2925
17		0.2916
18		0.2916
19		0.2902
20		0.2897
21		0.2883
22		0.2759
23		0.2600
24		0.2595
25		0.2586
26		0.2561
27		0.2501
28		0.2472
29		0.2452
30		0.2394
31		0.2382
32		0.2370
33		0.2367
34		0.2360
35		0.2354
36		0.2329
37		0.2320
38		0.2294
39		0.2272
40		0.2260
41		0.2257
42		0.2247
43		0.2238
44		0.2226
45		0.2223
46		0.2203
47		0.2202
48		0.2185
49		0.2175
50		0.2166
51		0.2161
52		0.2158
53		0.2143
54		0.2142
55		0.2137
56		0.2137
57		0.2132
58		0.2129
59		0.2127
60		0.2125
61		0.2118
62		0.2118
63		0.2108
64		0.2089
65		0.2089
66		0.2088
67		0.2079
68		0.2078
69		0.2071
70		0.2069
71		0.2058
72		0.2057
73		0.2052
74		0.2052
75		0.2049
76		0