# Assignment 3 - submitted by Tarang Ranpara (202011057)

In [None]:
# mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# unzip the data
! mkdir data
! cp 'drive/MyDrive/IRLAB/A3/FIRE_Dataset_EN_2010.rar' './data/FIRE_Dataset_EN_2010.rar'
! unrar x data/FIRE_Dataset_EN_2010.rar data
! tar -xvf  './data/FIRE_Dataset_EN_2010/English-Data.tgz' -C './data/FIRE_Dataset_EN_2010/'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TELEGRAPH_UTF8/2007_utf8/sports/1070225_sports_story_7438352.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070621_sports_story_7952331.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070207_sports_story_7360730.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070919_sports_story_8334184.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070218_sports_story_7407969.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070317_sports_story_7529504.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070411_sports_story_7632626.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070507_sports_story_7743729.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070318_sports_story_7533511.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070625_sports_story_7969700.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070521_sports_story_7807303.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070517_sports_story_7787900.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070814_sports_story_8191386.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070908_sports_story_8291527.utf8
TELEGRAPH_U

In [None]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import TfidfVectorizer

# downloading essentials
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Reading and Processing Docs

In [None]:
class DataReader:
  def read_and_process(self, data_dir):

    # stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    
    # wordnet lemmatizer
    lemmatizer = nltk.stem.WordNetLemmatizer()

    file_names = []
    text_tokens = []

    i = 0
    # iterating over 2004, 2005, 2006, 2007 etc dirs
    for dir in os.listdir(data_dir):
      dir_name = os.path.join(data_dir,dir)

      # iterating over bengal, business, foreign etc dirs
      for sub_dir in os.listdir(dir_name):
        sub_dir_name = os.path.join(dir_name,sub_dir)
        data_files = os.listdir(sub_dir_name)

        for f in data_files:
          f_name = os.path.join(sub_dir_name,f)
          
          with open(f_name,'r') as fobj:
            content = fobj.read()

          soup = bs(content, "lxml")

          # find text tag
          temp_text_data = soup.find('text').text

          # converting text to lower case
          temp_text_data = temp_text_data.lower()

          # removing numbers and special chars
          temp_text_data = re.sub(r'[^\w\s]', '', temp_text_data)
          temp_text_data = re.sub(r'\d+', '', temp_text_data)

          # tokens
          tokens = nltk.word_tokenize(temp_text_data)

          # removing stopwords
          tokens = [token for token in tokens if token not in stopwords]

          # lemmatizing
          tokens = list(map(lemmatizer.lemmatize,tokens))

          # removing empty files
          if len(tokens) > 0:
            text_tokens.append(tokens)
            file_names.append(f)
            
            print(i, ' - ', f)
            i += 1

    # list of tokens, list of file names
    return text_tokens, file_names

In [None]:
# reading and preprocessing the docs 
data_dir = "./data/FIRE_Dataset_EN_2010/TELEGRAPH_UTF8/"

dr = DataReader()
text_tokens, file_names = dr.read_and_process(data_dir)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
120516  -  1051222_business_story_5631325.utf8
120517  -  1050412_business_story_4604410.utf8
120518  -  1050105_business_story_4212811.utf8
120519  -  1050617_business_story_4878814.utf8
120520  -  1051010_business_index.utf8
120521  -  1051119_business_story_5494594.utf8
120522  -  1050406_business_story_4579700.utf8
120523  -  1051018_business_story_5366372.utf8
120524  -  1050422_business_index.utf8
120525  -  1050218_business_story_4393241.utf8
120526  -  1050607_business_story_4836123.utf8
120527  -  1050409_business_story_4591030.utf8
120528  -  1050204_business_story_4335445.utf8
120529  -  1050121_business_story_4278649.utf8
120530  -  1050201_business_story_4321596.utf8
120531  -  1051017_business_story_5363505.utf8
120532  -  1051003_business_story_5312894.utf8
120533  -  1050210_business_story_4359915.utf8
120534  -  1050504_business_story_4693196.utf8
120535  -  1050924_business_index.utf8
120536  -  1050226_

In [None]:
# pickling the objects for future use - i.e to avoid recalculation
with open('text_tokens', 'wb') as fileobj:
  pickle.dump(text_tokens, fileobj)

with open('file_names', 'wb') as fileobj:
  pickle.dump(file_names, fileobj)

In [None]:
# reading pickled objects
with open('text_tokens', 'rb') as fileobj:
  text_tokens = pickle.load(fileobj)

with open('file_names', 'rb') as fileobj:
  file_names = pickle.load(fileobj)

In [None]:
# token of 0th doc
print(text_tokens[0])

['telegraph', 'calcutta', 'metro', 'tv', 'schedule', 'container', 'backgroundcolorefff', 'color', 'border', 'px', 'solid', 'cfdd', 'paddingleftem', 'paddingrightem', 'container', 'ul', 'liststyledisc', 'container', 'nolist', 'liststylenone', 'paddingleft', 'marginleft', 'link', 'backgroundcolorffffff', 'color', 'border', 'px', 'solid', 'daee', 'paddingpx', 'em', 'px', 'em', 'link', 'ul', 'liststylenone', 'paddingleft', 'marginleft', 'link', 'nolist', 'liststylenone', 'paddingleft', 'marginleft', 'margintop', 'link', 'textdecorationnone', 'color', 'link', 'ahover', 'color', 'tv', 'schedule', 'quick', 'link']


In [None]:
# name of 0th file
print(file_names[0])

1070417_calcutta_tv.utf8


# Calculating TF-IDF for docs

In [None]:
# list of dicts - each dict contains {word, TF}
token_freq = []

# unique words in corpora
unique_words = set()

for doc in text_tokens:
  words, counts = np.unique(doc, return_counts=True)
  token_freq.append(dict(zip(words,[count/len(doc) for count in counts])))
  
  for token in words:
    if token not in unique_words:
      unique_words.add(token)

print('word - freq boW Total: ', len(token_freq))
print('word - freq boW:', token_freq[0])
print('unique words:', len(unique_words))

word - freq boW Total:  125516
word - freq boW: {'ahover': 0.018518518518518517, 'backgroundcolorefff': 0.018518518518518517, 'backgroundcolorffffff': 0.018518518518518517, 'border': 0.037037037037037035, 'calcutta': 0.018518518518518517, 'cfdd': 0.018518518518518517, 'color': 0.07407407407407407, 'container': 0.05555555555555555, 'daee': 0.018518518518518517, 'em': 0.037037037037037035, 'link': 0.1111111111111111, 'liststyledisc': 0.018518518518518517, 'liststylenone': 0.05555555555555555, 'marginleft': 0.05555555555555555, 'margintop': 0.018518518518518517, 'metro': 0.018518518518518517, 'nolist': 0.037037037037037035, 'paddingleft': 0.05555555555555555, 'paddingleftem': 0.018518518518518517, 'paddingpx': 0.018518518518518517, 'paddingrightem': 0.018518518518518517, 'px': 0.05555555555555555, 'quick': 0.018518518518518517, 'schedule': 0.037037037037037035, 'solid': 0.037037037037037035, 'telegraph': 0.018518518518518517, 'textdecorationnone': 0.018518518518518517, 'tv': 0.03703703703

In [None]:
with open('unique_words', 'wb') as fileobj:
  pickle.dump(unique_words, fileobj)

In [None]:
with open('unique_words', 'rb') as fileobj:
  unique_words = pickle.load(fileobj)

In [None]:
# calculating IDF
idf = dict(zip(unique_words,[0] * len(unique_words)))

for doc in token_freq:
  for token in doc:
    idf[token] += 1
for token in idf:
  idf[token] = np.log10(len(token_freq)/idf[token])

In [None]:
with open('idf', 'wb') as fileobj:
  pickle.dump(idf, fileobj)

In [None]:
with open('idf', 'rb') as fileobj:
  idf = pickle.load(fileobj)

In [None]:
# Calculating TF-IDF 
tf_idf = []
for doc in token_freq:
  temp_tf_idf = {}
  
  for token in doc:
    temp_tf_idf[token] = doc[token]*idf[token]
  
  tf_idf.append(temp_tf_idf)

# len of tfidf
print(len(tf_idf))

# tfidf of 0th doc
print(len(tf_idf[0]))

125516
29


In [None]:
print(tf_idf[0])

{'ahover': 0.04833608100015304, 'backgroundcolorefff': 0.04833608100015304, 'backgroundcolorffffff': 0.04833608100015304, 'border': 0.057296487977092005, 'calcutta': 0.0, 'cfdd': 0.04833608100015304, 'color': 0.1657159021962719, 'container': 0.12691397193204382, 'daee': 0.04833608100015304, 'em': 0.06540210457460582, 'link': 0.17412138428622076, 'liststyledisc': 0.04833608100015304, 'liststylenone': 0.14500824300045914, 'marginleft': 0.14500824300045914, 'margintop': 0.04833608100015304, 'metro': 0.012502022129130189, 'nolist': 0.09667216200030608, 'paddingleft': 0.14500824300045914, 'paddingleftem': 0.04833608100015304, 'paddingpx': 0.04435940208129717, 'paddingrightem': 0.04833608100015304, 'px': 0.13288785045399723, 'quick': 0.030578351102943574, 'schedule': 0.05905556192478755, 'solid': 0.07087866295594251, 'telegraph': 0.0, 'textdecorationnone': 0.04833608100015304, 'tv': 0.05358307485094943, 'ul': 0.09570930880920084}


# Reading and Processing Queries

In [None]:
class QueryReader:
  
  def process(self,data_dir):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    with open(data_dir,'r') as f:
      content = f.read()
    
    soup = bs(content, "lxml")

    # extracting query num
    qNum = [int(num.text) for num in soup.find_all('num')]
    
    # using desc field of query, removing everything except alphanumeric chars and spaces
    queries = [re.sub(r'[^a-zA-Z\s]', '', desc.text.strip().lower()) for desc in soup.find_all('desc')]
    
    # tokenization 
    q_tokens = [nltk.word_tokenize(query) for query in queries]
    
    # removing stopwords 
    q_tokens = [[token for token in tokens if token not in stopwords] for tokens in q_tokens]
    
    # lemmatizing
    q_tokens = [list(map(lemmatizer.lemmatize,tokens)) for tokens in q_tokens]
    
    # list of query nums, list of tokens
    return qNum, q_tokens

In [None]:
qr = QueryReader()
qNum,queries_tokens = qr.process('./data/FIRE_Dataset_EN_2010/en.topics.76-125.2010.txt')

queries_tokens = [[token for token in tokens if token in unique_words] for tokens in queries_tokens]

print(qNum)
for q in queries_tokens:
  print(q)

[76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125]
['reason', 'behind', 'protest', 'meena', 'leader', 'inclusion', 'gurjars', 'scheduled', 'tribe']
['attack', 'hezbollah', 'guerrilla', 'indian', 'israeli', 'force']
['conflict', 'ashok', 'singhal', 'president', 'vishwa', 'hindu', 'parishad', 'lk', 'advani', 'bjp', 'leader', 'ram', 'mandir', 'issue']
['plan', 'build', 'road', 'china', 'mount', 'everest']
['initiation', 'legal', 'proceeding', 'advani', 'involvement', 'demolition', 'babri', 'masjid']
['health', 'ministry', 'india', 'made', 'certain', 'plan', 'protect', 'indian', 'child', 'outbreak', 'japanese', 'encephalitis', 'problem', 'arisen', 'course', 'implementing', 'plan']
['proposed', 'bus', 'service', 'srinagar', 'muzaffarabad', 'role', 'solving', 'indopak', 'dispute']
['attempt', 'made', 'laloo', 'prasad',

# calculating TF-IDF for queries

In [None]:
# calculating TF
query_token_freq = []
for query in queries_tokens:
  words, counts = np.unique(query, return_counts=True)
  query_token_freq.append(dict(zip(words,[count/len(query) for count in counts])))
  
print(len(query_token_freq))

50


In [None]:
# calculating TF-IDF, using IDF of docs.
query_tf_idf = []
for query in query_token_freq:
  temp = {}
  for token in query:
    temp[token] = query[token]*idf[token]
  query_tf_idf.append(temp)

In [None]:
print(query_tf_idf[0])

{'behind': 0.12483812356689106, 'gurjars': 0.5135086484210051, 'inclusion': 0.2581614256409414, 'leader': 0.10145558665848826, 'meena': 0.28877911198781164, 'protest': 0.1533644942976726, 'reason': 0.12761974922192618, 'scheduled': 0.16503112727490396, 'tribe': 0.24611529503945456}


In [None]:
# for each query it'll contain top 10 pair(cosine similarity, doc_idx)
top_doc_list = []

for query in query_tf_idf:
  temp_query_cosine = []
  i = 0
  for doc in tf_idf:
    query_dot_doc = 0
    for token in query:
      if token in doc:
        query_dot_doc += query[token]*doc[token]
    
    norm_doc = np.linalg.norm(list(doc.values()))
    norm_query = np.linalg.norm(list(query.values()))
    
    if norm_doc == 0 or norm_query == 0:
      cosine = 0
    else:
      cosine = query_dot_doc/(norm_doc*norm_query)
    
    temp_query_cosine.append((cosine,i))
    i += 1
  
  # reverse sorting
  temp_query_cosine.sort(reverse=True)

  # appending qeury results
  top_doc_list.append(temp_query_cosine[:10])

In [None]:
# tuples of (cosine similarity, doc_idx)
for i in top_doc_list:
  print(i)

[(0.1868150113140881, 109144), (0.17487854057524668, 20501), (0.17290328995369808, 53189), (0.15788529973028106, 123791), (0.15748256169491306, 22604), (0.15182242997496195, 24722), (0.15176447169192087, 29753), (0.15158375696527965, 87991), (0.14721660050168323, 78619), (0.14423302340795086, 106629)]
[(0.2840124582140067, 39550), (0.26432888634976065, 52196), (0.263651517866559, 39194), (0.25785192453741707, 39402), (0.23528006867668416, 38775), (0.23438499116803432, 39766), (0.2233617837070861, 20992), (0.21945099748664784, 40421), (0.21122142491510865, 53406), (0.20935553709657506, 39237)]
[(0.40804386421460864, 78775), (0.3971170835653605, 111224), (0.3517265615398822, 79528), (0.3179893495702512, 79892), (0.31099642953768103, 57697), (0.30683730684140315, 112607), (0.30435740630096303, 81140), (0.3024005299695303, 114942), (0.2996188215764273, 20750), (0.29827768066593874, 79700)]
[(0.5045436832369591, 18718), (0.501175754249521, 93431), (0.45705559337381174, 115878), (0.406141255

In [None]:
with open('top_doc_list', 'wb') as fileobj:
  pickle.dump(top_doc_list, fileobj)

In [None]:
file_names[57478]

'1060123_nation_story_5754114.utf8'

In [None]:
class QrelsReader:
  def process(self,file_path):
    qrels = [set() for _ in range(50)]
    with open(file_path) as f:
      lines = f.readlines()
    for line in lines:
      line_split = line.strip().split()
      if line_split[-1]=='1':
        qrels[int(line_split[0])-76].add(line_split[2].strip())
    return qrels

# reading 
qrels_reader = QrelsReader()
relevance = qrels_reader.process("./data/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.txt")

print(relevance[0])

{'1070603_nation_story_7869357.utf8', '1070602_nation_story_7865940.utf8', '1070530_nation_story_7849973.utf8', '1070611_nation_story_7906812.utf8'}


# calculating mean avg precision

In [None]:
# 0 if not releavant
query_doc_releavance_mat = np.zeros((50,10))

# iterating over all 50 queries
for i in range(50):

  # iterating over top 10 results
  for j in range(10):
    if file_names[top_doc_list[i][j][1]] in relevance[i]:
      query_doc_releavance_mat[i,j] = 1

In [None]:
Pr = np.zeros((50,10))
Re = np.zeros((50,10))

AP = np.zeros(50)

for i in range(50):
  n = 0
  for j in range(10):
    if query_doc_releavance_mat[i,j]==1:
      n += 1
    
    Pr[i,j] = n/(j+1)
    if query_doc_releavance_mat[i,j]==1:
      AP[i] +=  Pr[i,j]
  
  if n == 0:
    continue;
  
  AP[i] /= n
  count = 0
  
  for j in range(10):
    if query_doc_releavance_mat[i,j]==1:
      count += 1
    Re[i,j] = count/n

In [None]:
mAP = sum(AP)/len(AP)
print('mAP: ', mAP)

mAP:  0.49809102103804476
