In [1]:
# Loading Packages
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [2]:
# Displaying the different topics of train data
topics = fetch_20newsgroups(subset="train")
topics.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
# Importing both train and test data
X_train =  pd.DataFrame(fetch_20newsgroups(random_state=1,subset="train",remove=('headers', 'footers', 'quotes')).data)
X_test = pd.DataFrame(fetch_20newsgroups(random_state=1,subset="test",remove=('headers', 'footers', 'quotes')).data)

In [4]:
X_train.shape

(11314, 1)

In [5]:
X_test.shape

(7532, 1)

In [6]:
# Displaying the first 5 rows
X_train.head(5)

Unnamed: 0,0
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."


In [7]:
X_train[0][0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [8]:
X_test.head()

Unnamed: 0,0
0,: In article <34592@oasys.dt.navy.mil> odell@o...
1,Ithaca technical support can be reached at:\n\...
2,Devorski unfortunately helped to taint an othe...
3,"\nI would further add that a 486/50,S3/928,8mb..."
4,A rep at the dealer (actually it's a universit...


In [9]:
# Tf-idf
vectorizer = TfidfVectorizer(max_df = 0.7,stop_words='english')

In [10]:
# SVD
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=500, 
                          random_state=42)

In [11]:
# Building the pipeline
from sklearn.pipeline import Pipeline
svd_transformer = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])
preprocessing_model = svd_transformer.fit(X_train[0])
svd_matrix_train = preprocessing_model.transform(X_train[0])

In [12]:
svd_matrix_train.shape

(11314, 500)

In [13]:
svd_matrix_train

array([[ 0.08690072, -0.05274129, -0.01616815, ...,  0.00982699,
         0.03411385,  0.01496907],
       [ 0.12571505, -0.03759298,  0.01870855, ...,  0.01486599,
         0.01532102, -0.00847011],
       [ 0.11543529, -0.05215308, -0.02240145, ...,  0.02234933,
        -0.01390385, -0.0074297 ],
       ...,
       [ 0.06512122, -0.01967664, -0.02061429, ..., -0.010884  ,
        -0.00857213, -0.00100804],
       [ 0.03353651,  0.02204399, -0.00068135, ..., -0.0118142 ,
        -0.01324368,  0.00595057],
       [ 0.17359084, -0.03098295, -0.04834866, ...,  0.02708178,
         0.00909409, -0.02168712]])

In [14]:
vectorizer.vocabulary_

{'sure': 86350,
 'story': 85365,
 'nad': 64707,
 'did': 33485,
 'biased': 22639,
 'disagree': 33782,
 'statement': 84947,
 'media': 60543,
 'ruin': 79344,
 'israels': 51134,
 'reputation': 77452,
 'rediculous': 76603,
 'pro': 73080,
 'israeli': 51126,
 'world': 96983,
 'having': 45783,
 'lived': 56869,
 'europe': 38092,
 'realize': 76261,
 'incidences': 49393,
 'described': 32982,
 'letter': 56182,
 'occured': 67346,
 'try': 90153,
 'ignore': 48697,
 'subsidizing': 85826,
 'existance': 38536,
 'europeans': 38094,
 'degree': 32529,
 'think': 88487,
 'reason': 76307,
 'report': 77367,
 'clearly': 27763,
 'atrocities': 20206,
 'shame': 81809,
 'austria': 20446,
 'daily': 31623,
 'reports': 77376,
 'inhuman': 49918,
 'acts': 16707,
 'commited': 28551,
 'soldiers': 83598,
 'blessing': 23136,
 'received': 76389,
 'government': 43957,
 'makes': 59163,
 'holocaust': 46998,
 'guilt': 44670,
 'away': 20732,
 'look': 57244,
 'jews': 52045,
 'treating': 89864,
 'races': 75591,
 'got': 43914,
 'pow

In [15]:
# Querying a document related to one of the document in test data
query = preprocessing_model.transform(X_test.iloc[2])

In [32]:
X_test[0][2]

"Devorski unfortunately helped to taint an otherwise brilliant display\nby MacLean.  The Canucks tied up the Jets so tightly that I thought that\nthey were mailing them.\n\nBTW, Greg...next time, don't fall asleep in geography class, it's pretty\nsad when a fellow in Norway can spell Winnipeg properly and a guy in\nNorth America can't.\n\nOne more thing...how LONG has Vancouver been in the NHL?  How many\nchampionships do they have?  \n\nOh yeah...and I CAN go to the Arena and see not one, not two, but\n*six* championship banners hanging from the rafters.  3 Stanley Cup\nbanners, and 3 Avco Cup banners.  My NHL guide says that Vancouver has\nwon the Cup once (as many times as the rockin' town of Kenora has won it!)"

In [17]:
query.shape

(1, 500)

In [18]:
query

array([[ 6.70776812e-02, -2.13308001e-02, -3.66787818e-02,
         1.35004159e-03, -5.39209418e-02, -2.84242175e-02,
         2.27992511e-02, -1.87605971e-02, -2.86120636e-02,
        -1.86597133e-02,  1.08582297e-02,  7.24349705e-03,
         1.22538605e-03,  1.19576105e-03, -9.95182588e-03,
        -5.81118918e-03, -2.10846215e-03,  5.32914320e-03,
         7.36690105e-03,  1.39568937e-02,  4.01187776e-04,
        -7.05393467e-03, -2.46277700e-03, -1.59074203e-02,
        -5.73822567e-05,  8.97679018e-03, -4.73598948e-03,
        -7.10031581e-03, -1.91122757e-02, -1.18174578e-03,
        -1.86358294e-02, -7.00502265e-03, -1.41995253e-02,
        -4.69208871e-03,  6.84998824e-03,  1.44047380e-02,
         1.63741131e-02, -9.60423187e-03, -1.94316445e-03,
         1.46567761e-02, -7.12920614e-03, -4.39824897e-03,
        -6.95460889e-03, -1.40301592e-02, -8.84213884e-03,
        -2.48615507e-02,  1.25617655e-02,  6.46663767e-03,
        -5.49384981e-03,  1.83170149e-02, -2.68150679e-0

In [19]:
# Calculating cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
distance_matrix = cosine_similarity(svd_matrix_train,query)

In [20]:
print(distance_matrix)

[[0.01704803]
 [0.14591162]
 [0.01539193]
 ...
 [0.03049251]
 [0.01768989]
 [0.08865318]]


In [21]:
flat = distance_matrix.flatten()

In [22]:
flat

array([0.01704803, 0.14591162, 0.01539193, ..., 0.03049251, 0.01768989,
       0.08865318])

In [23]:
sort_indices = np.argsort(flat)

In [24]:
sort_indices

array([11041,  5527,   883, ...,  4275,  7731,  4931])

In [25]:
sort_indices[-5:]

array([1597, 1684, 4275, 7731, 4931])

In [26]:
# Write a  Function to return indices of top n elemets





In [41]:
distance_matrix[4931]

array([0.41983427])

In [33]:
X_test[0][2]

"Devorski unfortunately helped to taint an otherwise brilliant display\nby MacLean.  The Canucks tied up the Jets so tightly that I thought that\nthey were mailing them.\n\nBTW, Greg...next time, don't fall asleep in geography class, it's pretty\nsad when a fellow in Norway can spell Winnipeg properly and a guy in\nNorth America can't.\n\nOne more thing...how LONG has Vancouver been in the NHL?  How many\nchampionships do they have?  \n\nOh yeah...and I CAN go to the Arena and see not one, not two, but\n*six* championship banners hanging from the rafters.  3 Stanley Cup\nbanners, and 3 Avco Cup banners.  My NHL guide says that Vancouver has\nwon the Cup once (as many times as the rockin' town of Kenora has won it!)"

In [34]:
X_train[0][4931]

'Ten years ago, the number of Europeans in the NHL was roughly a quarter\nof what it is now. Going into the 1992/93 season, the numbers of Euros on\nNHL teams have escalated to the following stats:\n\nCanadians: 400\nAmericans: 100\nEuropeans: 100\n\n   Please note that these numbers are rounded off, and taken from the top\n25 players on each of the 24 teams. My source is the Vancouver Sun.\n\n   Here\'s the point: there are far too many Europeans in the NHL. I am sick\nof watching a game between an American and a Canadian team (let\'s say, the\nRed Wings and the Canucks) and seeing names like "Bure" "Konstantinov" and\n"Borshevshky". Is this North America or isn\'t it? Toronto, Detriot, Quebec,\nand Edmonton are particularly annoying, but the numbers of Euros on other\nteams is getting worse as well. \n\n    I live in Vancouver and if I hear one more word about "Pavel Bure, the\nRussian Rocket" I will completely throw up. As it is now, every time I see\nthe Canucks play I keep hoping 