Installing gensim

In [None]:
!pip install --upgrade gensim
from gensim.models import Word2Vec

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/5c/4e/afe2315e08a38967f8a3036bbe7e38b428e9b7a90e823a83d0d49df1adf5/gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 152kB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3


In [None]:
# importing needed libs 
import os
import re
import nltk
import pickle
import scipy
import numpy as np
from bs4 import BeautifulSoup as bs
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

# downloading needed data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Loading Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! mkdir data
! cp 'drive/MyDrive/IRLAB/A3/FIRE_Dataset_EN_2010.rar' './data/FIRE_Dataset_EN_2010.rar'
! unrar x data/FIRE_Dataset_EN_2010.rar data
! tar -xvf  './data/FIRE_Dataset_EN_2010/English-Data.tgz' -C './data/FIRE_Dataset_EN_2010/'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TELEGRAPH_UTF8/2007_utf8/sports/1070225_sports_story_7438352.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070621_sports_story_7952331.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070207_sports_story_7360730.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070919_sports_story_8334184.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070218_sports_story_7407969.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070317_sports_story_7529504.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070411_sports_story_7632626.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070507_sports_story_7743729.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070318_sports_story_7533511.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070625_sports_story_7969700.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070521_sports_story_7807303.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070517_sports_story_7787900.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070814_sports_story_8191386.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070908_sports_story_8291527.utf8
TELEGRAPH_U

In [None]:
class DataReader:
  def read_and_process(self, data_dir):

    # stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    
    # wordnet lemmatizer
    stemmer = nltk.stem.PorterStemmer()

    file_names = []
    text_tokens = []

    i = 0
    # iterating over 2004, 2005, 2006, 2007 etc dirs
    for dir in os.listdir(data_dir):
      dir_name = os.path.join(data_dir,dir)

      # iterating over bengal, business, foreign etc dirs
      for sub_dir in os.listdir(dir_name):
        sub_dir_name = os.path.join(dir_name,sub_dir)
        data_files = os.listdir(sub_dir_name)

        for f in data_files:
          f_name = os.path.join(sub_dir_name,f)
          
          with open(f_name,'r') as fobj:
            content = fobj.read()

          soup = bs(content, "lxml")

          # find text tag
          temp_text_data = soup.find('text').text

          # converting text to lower case
          temp_text_data = temp_text_data.lower()

          # removing numbers and special chars
          temp_text_data = re.sub(r'[^\w\s]', '', temp_text_data)
          temp_text_data = re.sub(r'\d+', '', temp_text_data)

          # tokens
          tokens = nltk.word_tokenize(temp_text_data)

          # removing stopwords
          tokens = [token for token in tokens if token not in stopwords]

          # lemmatizing
          tokens = list(map(stemmer.stem,tokens))

          # removing empty files
          if len(tokens) > 0:
            text_tokens.append(tokens)
            file_names.append(f)
            
            print(i, ' - ', f)
            i += 1

    # list of tokens, list of file names
    return text_tokens, file_names

In [None]:
data_dir = "./data/FIRE_Dataset_EN_2010/TELEGRAPH_UTF8/"

dr = DataReader()
text_tokens, file_names = dr.read_and_process(data_dir)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
120516  -  1061005_calcutta_story_6785681.utf8
120517  -  1060918_calcutta_story_6758534.utf8
120518  -  1060530_calcutta_story_5791882.utf8
120519  -  1061229_calcutta_tv.utf8
120520  -  1060906_calcutta_story_6703735.utf8
120521  -  1061030_calcutta_index.utf8
120522  -  1060401_calcutta_story_6030420.utf8
120523  -  1060206_calcutta_story_5793248.utf8
120524  -  1060811_calcutta_story_6529864.utf8
120525  -  1060613_calcutta_story_6343452.utf8
120526  -  1060209_calcutta_story_5821031.utf8
120527  -  1060603_calcutta_story_6306790.utf8
120528  -  1060106_calcutta_story_5684631.utf8
120529  -  1060304_calcutta_story_5923637.utf8
120530  -  1061206_calcutta_story_7097106.utf8
120531  -  1060317_calcutta_story_5973991.utf8
120532  -  1060724_calcutta_story_6494268.utf8
120533  -  1060927_calcutta_story_6797703.utf8
120534  -  1060414_calcutta_story_6096715.utf8
120535  -  1060918_calcutta_story_6751487.utf8
120536  -  106

In [None]:
# pickling the objects for future use - i.e to avoid recalculation
with open('./drive/MyDrive/IRLAB/A5/text_tokens', 'wb') as fileobj:
  pickle.dump(text_tokens, fileobj)

with open('./drive/MyDrive/IRLAB/A5/file_names', 'wb') as fileobj:
  pickle.dump(file_names, fileobj)

In [None]:
class QueryReader:

  def read_queries(self,data_dir):
    with open(data_dir,'r') as f:
      content = f.read()
    bs_content = bs(content, "lxml")

    qNum = [int(num.text) for num in bs_content.find_all('num')]

    queries = [re.sub(r'[^a-zA-Z\s]', '', desc.text.strip().lower()) for desc in bs_content.find_all('desc')]

    queries_tokens = [nltk.word_tokenize(query) for query in queries]

    queries = [' '.join(query) for query in queries_tokens]
    
    return qNum,queries_tokens

In [None]:
class QrelsReader:
  def read_qrels(self,file_path):
    qrels = [set() for _ in range(50)]
    with open(file_path) as f:
      lines = f.readlines()
    for line in lines:
      line_split = line.strip().split()
      if line_split[-1]=='1':
        qrels[int(line_split[0])-76].add(line_split[2].strip())
    return qrels

# Skipgram model

In [None]:
# train skipgram model using text tokens 
skipgram_model = Word2Vec(sentences=text_tokens, size=350, window=10, min_count=1, workers=4, sg=1, hs=1, negative=0, iter=5)

In [None]:
with open("./drive/MyDrive/IRLAB/A5/skipgram_model", "wb") as fp:   #Pickling
    pickle.dump(skipgram_model, fp)

In [None]:
with open('/content/drive/MyDrive/IR LAB/Assignment 5/data/skipgram_model','rb') as file:
  skipgram_model = pickle.load(file)

In [None]:
docs_vec = np.zeros((len(text_tokens),350))

In [None]:
for doc_no in range(len(text_tokens)):
  for token in text_tokens[doc_no]:
    docs_vec[doc_no,:] += skipgram_model.wv[token]

In [None]:
for doc_no in range(len(text_tokens)):
    docs_vec[doc_no,:] = docs_vec[doc_no,:]/len(text_tokens[doc_no])

In [None]:
# read queries
query_reader = QueryReader()
qNum,queries_tokens = query_reader.read_queries('./data/FIRE_Dataset_EN_2010/en.topics.76-125.2010.txt')
queries_tokens[0]

['reasons',
 'behind',
 'the',
 'protests',
 'by',
 'meena',
 'leaders',
 'against',
 'the',
 'inclusion',
 'of',
 'gurjars',
 'in',
 'the',
 'scheduled',
 'tribes']

In [None]:
query_vec = np.zeros((len(queries_tokens),350))
query_vec.shape

(50, 350)

In [None]:
vocab = set(skipgram_model.wv.vocab)
for q_no in range(len(queries_tokens)):
  count = 0
  for token in queries_tokens[q_no]:
    if token in vocab:
      query_vec[q_no,:] += skipgram_model.wv[token]
      count += 1
  query_vec[q_no,:] = query_vec[q_no,:]/count

In [None]:
top_10_list = []
for query_no in range(query_vec.shape[0]):
  temp_query_cosine = []
  for doc_no in range(docs_vec.shape[0]):
    cosine = cosine_similarity(query_vec[query_no].reshape(1,350),docs_vec[doc_no].reshape(1,350))
    temp_query_cosine.append((cosine,doc_no))
  temp_query_cosine.sort(reverse=True)
  top_10_list.append(temp_query_cosine[:10])
  print(query_no,end='  ')

0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  

In [None]:
file_name = './drive/MyDrive/IRLAB/A5/top_10_list_skipgram_model'
with open(file_name, 'wb') as obj:
  pickle.dump(top_10_list, obj)

In [None]:
with open('./drive/MyDrive/IRLAB/A5/top_10_list_skipgram_model','rb') as file:
  top_10_list = pickle.load(file)

In [None]:
#get relevent documents for each query
qrels_reader = QrelsReader()
qrels = qrels_reader.read_qrels("./data/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.txt")
print(qrels[0])

{'1070611_nation_story_7906812.utf8', '1070603_nation_story_7869357.utf8', '1070602_nation_story_7865940.utf8', '1070530_nation_story_7849973.utf8'}


In [None]:
# 0,1's for checking reliability
relavancy = np.zeros((50,10))

# check document is relevent or not
for i in range(50):
  for j in range(10):
    if file_names[top_10_list[i][j][1]] in qrels[i]:
      relavancy[i,j] = 1

# each row stores precision list of each query for top 10 documents
P = np.zeros((50,10))

# each row stores relevence list of each query for top 10 documents 
R = np.zeros((50,10)) 

# average precision for each query
AP = np.zeros(50) 

for i in range(50):
  n = 0
  for j in range(10):
    if relavancy[i,j]==1:
      n += 1
    P[i,j] = n/(j+1)
    if relavancy[i,j]==1:
      AP[i] +=  P[i,j]
    
      
  # compute AP for query
  if n == 0:
    continue;
  AP[i] /= n

  count = 0
  for j in range(10):
    if relavancy[i,j]==1:
      count += 1
    R[i,j] = count/n

# compute mAP
MAP = sum(AP)/len(AP)
print("MAP :",MAP)

MAP : 0.2769532879818594


# CBOW Model

In [None]:
CBOW_model = Word2Vec(sentences=text_tokens, size=350, window=10, min_count=1, workers=4, sg=0, hs=1, negative=0, iter=5)

In [None]:
with open("./drive/MyDrive/IRLAB/A5/CBOW_model", "wb") as fp:   #Pickling
    pickle.dump(CBOW_model, fp)

In [None]:
with open('./drive/MyDrive/IRLAB/A5/CBOW_model','rb') as file:
  CBOW_model = pickle.load(file)

In [None]:
docs_vec = np.zeros((len(text_tokens),350))

In [None]:
for doc_no in range(len(text_tokens)):
  for token in text_tokens[doc_no]:
    docs_vec[doc_no,:] += CBOW_model.wv[token]

In [None]:
for doc_no in range(len(text_tokens)):
    docs_vec[doc_no,:] = docs_vec[doc_no,:]/len(text_tokens[doc_no])

In [None]:
query_reader = QueryReader()
qNum,queries_tokens = query_reader.read_queries('./data/FIRE_Dataset_EN_2010/en.topics.76-125.2010.txt')
queries_tokens[0]

['reasons',
 'behind',
 'the',
 'protests',
 'by',
 'meena',
 'leaders',
 'against',
 'the',
 'inclusion',
 'of',
 'gurjars',
 'in',
 'the',
 'scheduled',
 'tribes']

In [None]:
query_vec = np.zeros((len(queries_tokens),350))
query_vec.shape

(50, 350)

In [None]:
vocab = set(CBOW_model.wv.vocab)
for q_no in range(len(queries_tokens)):
  count = 0
  for token in queries_tokens[q_no]:
    if token in vocab:
      query_vec[q_no,:] += CBOW_model.wv[token]
      count += 1
  query_vec[q_no,:] = query_vec[q_no,:]/count

In [None]:
top_10_list2 = []
for query_no in range(query_vec.shape[0]):
  temp_query_cosine = []
  for doc_no in range(docs_vec.shape[0]):
    cosine = cosine_similarity(query_vec[query_no].reshape(1,350),docs_vec[doc_no].reshape(1,350))
    temp_query_cosine.append((cosine,doc_no))
  temp_query_cosine.sort(reverse=True)
  top_10_list2.append(temp_query_cosine[:10])
  print(query_no,end='  ')

0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  

In [None]:
name = './drive/MyDrive/IRLAB/A5/top_10_list_CBOW_model'
with open(name, 'wb') as obj:
  pickle.dump(top_10_list2, obj)

In [None]:
# load top_10_list from disk
with open('/content/drive/MyDrive/IR LAB/Assignment 5/data/top_10_list_CBOW_model','rb') as file:
  top_10_list2 = pickle.load(file)

In [None]:
#get relevent documents for each query
qrels_reader = QrelsReader()
qrels = qrels_reader.read_qrels("./data/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.txt")
print(qrels[0])

{'1070611_nation_story_7906812.utf8', '1070603_nation_story_7869357.utf8', '1070602_nation_story_7865940.utf8', '1070530_nation_story_7849973.utf8'}


In [None]:
relavancy = np.zeros((50,10))

for i in range(50):
  for j in range(10):
    if file_names[top_10_list2[i][j][1]] in qrels[i]:
      relavancy[i,j] = 1

P = np.zeros((50,10)) 
R = np.zeros((50,10)) 
AP = np.zeros(50) 

for i in range(50):
  n = 0
  for j in range(10):
    if relavancy[i,j]==1:
      n += 1
    P[i,j] = n/(j+1)
    if relavancy[i,j]==1:
      AP[i] +=  P[i,j]
    
  if n == 0:
    continue;
  AP[i] /= n

  count = 0
  for j in range(10):
    if relavancy[i,j]==1:
      count += 1
    R[i,j] = count/n

# calculate mAP
MAP = sum(AP)/len(AP)
print("MAP :",MAP)

MAP : 0.29170986394557824
