In [1]:
import re
import glob
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def retrieve_docs_and_clean():
    documents=[]
    for file in glob.glob('Dataset'+"//*.csv"):
        text = ""
        with open(file,'r',encoding='latin-1') as nf:
            items = nf.readlines()
            items = items[1:]
            for item in items:
                row_text = item.split(',',6)
                text=row_text[6]
        documents.append(text)
    # Clean Paragraphs
    documents_clean = []
    for d in documents:
        # Remove Unicode
        document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
        # Remove Mentions
        document_test = re.sub(r'@\w+', '', document_test)
        # Lowercase the document
        document_test = document_test.lower()
        # Remove punctuations
        document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
        # Lowercase the numbers
        document_test = re.sub(r'[0-9]', '', document_test)
        # Remove the doubled space
        document_test = re.sub(r'\s{2,}', ' ', document_test)
        documents_clean.append(document_test)

    return documents_clean

In [3]:
docs = retrieve_docs_and_clean()
docs


[' i think it will try to repeal obamacare i think it will try to repeal obamaca re without any i think it will try to repeal obamacare without any idea of how to cover the million americans who have been added and who have gotten security by it i think it will try to ta ke security by it i think it will try to take america out of the climate change chords without any idea of what to do about serious issues ',
 ' gecko oh dear vo geico fifteen minutes could save you or more on car insurance i d say it s taken us for a ride honestly what thanks do we owe progress we re up to our necks in landfill and down to the wire in resources and climate change is out to get us ',
 ' reason why we need to continue china s massive renewables programme was one sign of the world s determination to carry on tackling climate change china says it will show leadership if the us does pull out of the un climate deal but even the us itself has a boom in wind and solar power the election of president trump is 

In [4]:
# Create Term-Document Matrix with TF-IDF weighting

vectorizer = TfidfVectorizer()# Instantiate a TfidfVectorizer object
X = vectorizer.fit_transform(docs)# fit the data and transform it as a vector

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names())# Convert the X as transposed matrix and Create a DataFrame and set the vocabulary as the index
print(df.head())
print(df.shape)

                  0    1    2    3         4        5    6         7
about      0.070131  0.0  0.0  0.0  0.126598  0.00000  0.0  0.000000
added      0.083681  0.0  0.0  0.0  0.000000  0.00000  0.0  0.000000
address    0.000000  0.0  0.0  0.0  0.000000  0.00000  0.0  0.128429
advising   0.000000  0.0  0.0  0.0  0.302116  0.00000  0.0  0.000000
affecting  0.000000  0.0  0.0  0.0  0.000000  0.11938  0.0  0.000000
(246, 8)


In [5]:
df

Unnamed: 0,0,1,2,3,4,5,6,7
about,0.070131,0.000000,0.000000,0.00000,0.126598,0.000000,0.0,0.000000
added,0.083681,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.000000
address,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.128429
advising,0.000000,0.000000,0.000000,0.00000,0.302116,0.000000,0.0,0.000000
affecting,0.000000,0.000000,0.000000,0.00000,0.000000,0.119380,0.0,0.000000
...,...,...,...,...,...,...,...,...
working,0.000000,0.000000,0.000000,0.28872,0.000000,0.000000,0.0,0.000000
world,0.000000,0.000000,0.119764,0.00000,0.000000,0.100050,0.0,0.000000
would,0.000000,0.000000,0.000000,0.00000,0.151058,0.000000,0.0,0.000000
you,0.000000,0.117861,0.000000,0.00000,0.218488,0.086335,0.0,0.000000


In [9]:
#Calculate the similarity using cosine similarity.
# def get_similar_articles(q, df):
#     print("query:", q)
#     # Convert the query to a vector
#     q = [q]
#     q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
#     sim = {}
#     # Calculate the similarity
#     for i in range(10):
#         sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
#     sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    
#     # Print the articles and their similarity values
#     for k, v in sim_sorted:
#         if v != 0.0:
#             print("Similarity value:", v)
#             print(docs[k])
#             print()


# q1 = 'you'
# get_similar_articles(q1, df)
# print('-'*100)

import numpy as np
def get_similar_articles(q, df):
  print("query:", q)
  print("Here are the articles with the highest  similarity values: ")
  # Convert the query become a vector
  q = [q]
  q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
  sim = {}
  # Calculate the similarity
  for i in range(10):
    sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
  # Sort the values 
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  # Print the articles and their similarity values
  for k, v in sim_sorted:
    if v != 0.0:
      print("Similaritas:", v)
      print(documents_clean[k])
      print()
# Add The Query
q1 = input()
# Call the function
get_similar_articles(q1, df)


about
query: about
Here are the articles with the highest  similarity values: 


KeyError: ignored