# 20 News Group Dataset

## Objective :  Parse, Compute Pairwaise Similarity Matrices,  Train and Test using KNN Classification Algorithm

### Loading relevant libraries 

In [21]:
#importing the relevant libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import time
import mnist 
import collections
% matplotlib inline

In [22]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [23]:
#getting the train  & test data
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset = 'test')

#### Selecting the vectorizer 

In [24]:
tfidf_vectorizer = TfidfVectorizer()
count_vectorizer = CountVectorizer()
train_tfidf_matrix = tfidf_vectorizer.fit_transform(newsgroups_train.data)
train_tf_matrix = count_vectorizer.fit_transform(newsgroups_train.data)
test_tfidf_matrix = tfidf_vectorizer.transform(newsgroups_test.data)
test_tf_matrix = count_vectorizer.transform(newsgroups_test.data)

In [25]:
print(train_tf_matrix.shape, test_tf_matrix.shape)

(11314, 130107) (7532, 130107)


In [26]:
print(train_tfidf_matrix.shape, test_tfidf_matrix.shape)

(11314, 130107) (7532, 130107)


#### Shape of the matrix

In [27]:
print("The Shape of tf sparse matrix:",train_tf_matrix.shape)
print("The Shape of tf idf sparse matrix:",train_tfidf_matrix.shape)
print("The Shape of tf sparse matrix:",test_tf_matrix.shape)
print("The Shape of tf idf sparse matrix:",test_tfidf_matrix.shape)

The Shape of tf sparse matrix: (11314, 130107)
The Shape of tf idf sparse matrix: (11314, 130107)
The Shape of tf sparse matrix: (7532, 130107)
The Shape of tf idf sparse matrix: (7532, 130107)


Each row represent the documents and the colunms are word indices. 

In tf matrix the word frequency per documents is stored in matrix. 

In tfidf matrix the tfidf (tf * idf) value of each word per document is stored in the matrix.

In [28]:
#lets look at some raw data of the sparse matrix
print(train_tf_matrix[0:2,0:10].todense())

[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]


In [29]:
#lets look at some raw data of the sparse matrix
print(train_tfidf_matrix[50:51,0:100].todense())

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


Sparsity is a major challenge while dealing with text data

### Cosine Similarity for Train Data

In [30]:
#importing the relevant library
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
#start time 
start = time.time()

#compute the cosine similarity of each doucment with one another
ng_tf_cs = cosine_similarity(train_tf_matrix,train_tf_matrix)

#end time 
end = time.time()

#total time 
print("\nThe total Running time for computing cosine simlarity of tf matrix in seconds:",end-start)


The total Running time for computing cosine simlarity of tf matrix in seconds: 19.37595224380493


In [32]:
#start time 
start = time.time()

#compute the cosine similarity of each doucment with one another
ng_tfidf_cs = cosine_similarity(train_tfidf_matrix,train_tfidf_matrix)

#end time 
end = time.time()

#total time 
print("\nThe total Running time  for computing cosine simlarity of tfidf matrix in seconds:",end-start)


The total Running time  for computing cosine simlarity of tfidf matrix in seconds: 15.087325096130371


### Cosine Similarity for Test Data 

In [33]:
#start time 
start = time.time()

#compute the cosine similarity of each doucment with one another
ng_test_tf_cs = cosine_similarity(test_tf_matrix,train_tf_matrix)

#end time 
end = time.time()

#total time 
print("\nThe total Running time for computing cosine simlarity of tf matrix in seconds:",end-start)


The total Running time for computing cosine simlarity of tf matrix in seconds: 12.637705326080322


In [34]:
#start time 
start = time.time()

#compute the cosine similarity of each doucment with one another
ng_test_tfidf_cs = cosine_similarity(test_tfidf_matrix,train_tfidf_matrix)

#end time 
end = time.time()

#total time 
print("\nThe total Running time for computing cosine simlarity of tf matrix in seconds:",end-start)


The total Running time for computing cosine simlarity of tf matrix in seconds: 12.823270082473755


### Euclidean Distance for Similarity

In [35]:
#importing the relevant library
from sklearn.metrics.pairwise import euclidean_distances

In [36]:
#start time 
start = time.time()

#compute the cosine similarity of each doucment with one another
ng_tf_ed = euclidean_distances(train_tf_matrix)

#end time 
end = time.time()

#total time 
print("\nThe total Running time for computing similarity by euclidean distance of tf matrix in seconds:",end-start)


The total Running time for computing similarity by euclidean distance of tf matrix in seconds: 20.618422746658325


In [37]:
#start time 
start = time.time()

#compute the cosine similarity of each doucment with one another
ng_tfidf_ed = euclidean_distances(train_tfidf_matrix)

#end time 
end = time.time()

#total time 
print("\nThe total Running time for similarity by euclidean distance of tfidf matrix in seconds:",end-start)


The total Running time for similarity by euclidean distance of tfidf matrix in seconds: 23.99633550643921


### Euclidean distances for Test Data 

In [40]:
#start time 
start = time.time()

#compute the cosine similarity of each doucment with one another
ng_test_tf_ed = euclidean_distances(test_tf_matrix,train_tf_matrix)

#end time 
end = time.time()

#total time 
print("\nThe total Running time for computing similarity by euclidean distance of tf matrix in seconds:",end-start)


The total Running time for computing similarity by euclidean distance of tf matrix in seconds: 15.15920352935791


In [41]:
#start time 
start = time.time()

#compute the cosine similarity of each doucment with one another
ng_test_tfidf_ed = euclidean_distances(test_tfidf_matrix,train_tfidf_matrix)

#end time 
end = time.time()

#total time 
print("\nThe total Running time for similarity by euclidean distance of tfidf matrix in seconds:",end-start)


The total Running time for similarity by euclidean distance of tfidf matrix in seconds: 14.546274185180664


### Training Accuracy

In [42]:
#accuracy on training data using cosine similarity and tfidf vector
k = 5
sum = 0
for i in range(0,len(newsgroups_train.data)):
    similar_index =  np.argsort(ng_tfidf_cs[i])[:-(k+1):-1].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_train.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_train.data))

training accuracy 0.9165635495845855


In [43]:
#accuracy on training data using cosine similarity and tfidf vector
k = 10
sum = 0
for i in range(0,len(newsgroups_train.data)):
    similar_index =  np.argsort(ng_tfidf_cs[i])[:-(k+1):-1].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_train.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_train.data))

training accuracy 0.8680395969595192


In [44]:
#accuracy on training data using cosine similarity and tf vector
k = 5
sum = 0
for i in range(0,len(newsgroups_train.data)):
    similar_index =  np.argsort(ng_tf_cs[i])[:-(k+1):-1].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_train.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_train.data))

training accuracy 0.8656531730599257


In [45]:
#accuracy on training data using euclidean distances and tfidf vector
k = 5
sum = 0
for i in range(0,len(newsgroups_train.data)):
    similar_index =  np.argsort(ng_tfidf_ed[i])[:k].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_train.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_train.data))

training accuracy 0.9165635495845855


In [46]:
#accuracy on training data using euclidean distances and tf vector
k = 5
sum = 0
for i in range(0,len(newsgroups_train.data)):
    similar_index =  np.argsort(ng_tf_ed[i])[:k].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_train.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_train.data))

training accuracy 0.8793530139649991


### Test Accuracy

In [47]:
#accuracy on test data using cosine similarity and tfidf vector
k = 5
sum = 0
for i in range(0,len(newsgroups_test.data)):
    similar_index =  np.argsort(ng_test_tfidf_cs[i])[:-(k+1):-1].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_test.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_test.data))

training accuracy 0.6755177907594264


In [48]:
#accuracy on test data using cosine similarity and tf vector
k = 5
sum = 0
for i in range(0,len(newsgroups_test.data)):
    similar_index =  np.argsort(ng_test_tf_cs[i])[:-(k+1):-1].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_test.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_test.data))

training accuracy 0.4305629314922995


In [49]:
#accuracy on test data using euclidean distances and tfidf vector
k = 5
sum = 0
for i in range(0,len(newsgroups_test.data)):
    similar_index =  np.argsort(ng_test_tfidf_ed[i])[:k].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_test.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_test.data))

training accuracy 0.6755177907594264


In [50]:
#accuracy on test data using euclidean distances and tf vector
k = 5
sum = 0
for i in range(0,len(newsgroups_test.data)):
    similar_index =  np.argsort(ng_test_tf_ed[i])[:k].tolist()
    l = newsgroups_train.target[similar_index].tolist()
    label = max(l,key=l.count)
    actual_label = newsgroups_test.target[i]
    if label == actual_label:
        sum += 1
print("training accuracy",sum/len(newsgroups_test.data))

training accuracy 0.38382899628252787


In [51]:
#Print labels
similar_index = np.argsort(ng_test_tf_cs[123])[:5].tolist()
print(similar_index)
l = newsgroups_train.target[similar_index].tolist()
print(l)
for i in l :
    print(newsgroups_train.target_names[i])
label = max(l,key=l.count)
print(label)
actual_label = newsgroups_train.target[i]
print(actual_label)

[8665, 4772, 9080, 2931, 4495]
[2, 2, 2, 2, 2]
comp.os.ms-windows.misc
comp.os.ms-windows.misc
comp.os.ms-windows.misc
comp.os.ms-windows.misc
comp.os.ms-windows.misc
2
4
