## Submitted by Tarang Ranpara (202011057)

In [None]:
! rm -rf pd_index 
! rm -rf data

In [None]:
!pip install python-terrier
#!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git#egg=python-terrier



### unzipping data

In [None]:
! mkdir data
! cp 'drive/MyDrive/IRLAB/A3/FIRE_Dataset_EN_2010.rar' './data/FIRE_Dataset_EN_2010.rar'
! unrar x data/FIRE_Dataset_EN_2010.rar data
! tar -xvf  './data/FIRE_Dataset_EN_2010/English-Data.tgz' -C './data/FIRE_Dataset_EN_2010/'
! gzip -d ./data/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.txt.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
TELEGRAPH_UTF8/2007_utf8/sports/1070225_sports_story_7438352.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070621_sports_story_7952331.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070207_sports_story_7360730.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070919_sports_story_8334184.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070218_sports_story_7407969.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070317_sports_story_7529504.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070411_sports_story_7632626.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070507_sports_story_7743729.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070318_sports_story_7533511.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070625_sports_story_7969700.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070521_sports_story_7807303.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070517_sports_story_7787900.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070814_sports_story_8191386.utf8
TELEGRAPH_UTF8/2007_utf8/sports/1070908_sports_story_8291527.utf8
TELEGRAPH_U

### importing libs

In [None]:
import pyterrier as pt
# Initialise PyTerrier once per kernel (the recorded output shows it loading Terrier 5.5);
# the started() guard makes re-running this cell a no-op.
if not pt.started():
    pt.init()

PyTerrier 0.6.0 has loaded Terrier 5.5 (built by craigmacdonald on 2021-05-20 13:12)


In [None]:
import os
import re
import pickle
import numpy as np
from tqdm import tqdm
import pandas as pd
import nltk
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import TfidfVectorizer

# fetch the NLTK resources used by this notebook
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)
# kept as the final bare expression so the cell still displays the download status
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### performing pre-processing at string level

In [None]:
class PreProcessString:
    """Normalise a raw string into a single space-joined string of tokens.

    The pipeline is: optional lower-casing, optional removal of punctuation
    and digits, NLTK word tokenisation, optional English stop-word removal,
    then lemmatisation (WordNet) or, when ``lemmatization`` is False,
    Porter stemming.

    Parameters
    ----------
    lower : bool
        Lower-case the text before any other step.
    remove_nums_special_chars : bool
        Strip every character that is not a word character or whitespace,
        then strip runs of digits.
    lemmatization : bool
        True -> WordNet lemmatiser, False -> Porter stemmer.
    remove_stop_words : bool
        Drop tokens found in NLTK's English stop-word list.
    """

    def __init__(self,
                 lower=True,
                 remove_nums_special_chars=True,
                 lemmatization=True,
                 remove_stop_words=True):

        self.text = None
        self.lower = lower
        self.remove_nums_special_chars = remove_nums_special_chars
        self.lemmatization = lemmatization
        self.remove_stop_words = remove_stop_words

        # NLTK helpers are created lazily on first use and then reused, so
        # that processing a whole corpus does not rebuild them per document.
        self._stopwords = None
        self._lemmatizer = None
        self._stemmer = None

    def setText(self, text):
        """Store the string that the next ``preProcess()`` call will work on."""
        self.text = text

    def preProcess(self):
        """Run the configured pipeline on the stored text.

        Returns
        -------
        str
            The processed tokens joined by single spaces; ``''`` when no
            token survives.

        Raises
        ------
        ValueError
            If no text has been set (``self.text`` is None).
        """
        if self.text is None:
            raise ValueError("text can not be None")

        temp_text_data = self.text

        if self.lower:
            temp_text_data = temp_text_data.lower()

        if self.remove_nums_special_chars:
            # punctuation/symbols first, then digits (\w alone would keep digits)
            temp_text_data = re.sub(r'[^\w\s]', '', temp_text_data)
            temp_text_data = re.sub(r'\d+', '', temp_text_data)

        tokens = nltk.word_tokenize(temp_text_data)

        if self.remove_stop_words:
            if self._stopwords is None:
                self._stopwords = frozenset(nltk.corpus.stopwords.words('english'))
            stopwords = self._stopwords
            tokens = [token for token in tokens if token not in stopwords]

        if self.lemmatization:
            if self._lemmatizer is None:
                self._lemmatizer = nltk.stem.WordNetLemmatizer()
            tokens = [self._lemmatizer.lemmatize(token) for token in tokens]
        else:
            if self._stemmer is None:
                self._stemmer = nltk.stem.PorterStemmer()
            tokens = [self._stemmer.stem(token) for token in tokens]

        # joining an empty list already yields ''
        return ' '.join(tokens)

### reading the corpus

In [None]:
class CorporaReader:
    """Read a corpus laid out as ``rootDir/<dir>/<dir>/<file>`` into a DataFrame.

    Each file is parsed with BeautifulSoup, the content of its ``<text>``
    element is run through ``PreProcessString`` and the result is returned
    in the ``docno`` / ``text`` layout used for indexing.
    """

    def __init__(self, rootDir):
        self.rootDir = rootDir
        self.preProcessor = PreProcessString()
        self.files, self.fileNames = self.listFiles()

    def listFiles(self):
        """Collect every file exactly two directory levels below ``rootDir``.

        Entries that are not directories at the first two levels, and entries
        that are not regular files at the third, are skipped instead of
        raising an error.

        Returns
        -------
        (list[str], list[str])
            Full paths and, in the same order, the bare file names.
        """
        print('Indexing all the data files')
        listOfFiles = []
        listOfFileNames = []
        for topName in os.listdir(self.rootDir):
            topDir = os.path.join(self.rootDir, topName)
            if not os.path.isdir(topDir):
                continue
            for internalName in os.listdir(topDir):
                internalDir = os.path.join(topDir, internalName)
                if not os.path.isdir(internalDir):
                    continue
                for fileName in os.listdir(internalDir):
                    filePath = os.path.join(internalDir, fileName)
                    if not os.path.isfile(filePath):
                        continue
                    listOfFiles.append(filePath)
                    listOfFileNames.append(fileName)

        return listOfFiles, listOfFileNames

    def read(self):
        """Parse and pre-process every listed file.

        Documents without a ``<text>`` element, or whose text is empty after
        pre-processing, are left out.

        Returns
        -------
        pandas.DataFrame
            Columns ``docno`` (bare file name) and ``text`` (processed text).
        """
        fileNames, fileContents = [], []
        pairs = zip(self.files, self.fileNames)
        for filePath, fileName in tqdm(pairs, total=len(self.files)):
            # explicit encoding: do not depend on the platform's default locale
            with open(filePath, 'r', encoding='utf-8') as fobj:
                content = fobj.read()

            textTag = bs(content, "lxml").find('text')
            if textTag is None:
                # nothing to index for this file
                continue

            self.preProcessor.setText(textTag.text)
            text = self.preProcessor.preProcess()

            if text != '':
                fileNames.append(fileName)
                fileContents.append(text)

        return pd.DataFrame({'docno': fileNames, 'text': fileContents})

### reading the qrels

In [None]:
class QrelsReader:
    """Loads a relevance-judgement (qrels) file through PyTerrier's reader."""

    def __init__(self, filePath):
        # path of the qrels file to load
        self.filePath = filePath

    def read(self):
        """Return whatever ``pt.io.read_qrels`` produces for ``self.filePath``."""
        qrels = pt.io.read_qrels(self.filePath)
        return qrels

### reading the queries 

In [None]:
class QueryReader:
    """Read a topic file and build a ``qid`` / ``query`` DataFrame.

    Query ids come from the ``<num>`` elements and query text from the
    ``<desc>`` elements; each description is run through ``PreProcessString``.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self.preProcessor = PreProcessString()

    def _clean(self, raw_query):
        # run one raw description through the shared pre-processor
        self.preProcessor.setText(raw_query)
        return self.preProcessor.preProcess()

    def read(self):
        """Parse the topic file.

        Returns
        -------
        pandas.DataFrame
            Columns ``qid`` (int) and ``query`` (pre-processed description).
        """
        # explicit encoding: do not depend on the platform's default locale
        with open(self.file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        soup = bs(content, "lxml")
        qNum = [int(num.text) for num in soup.find_all('num')]
        rawQueries = [desc.text for desc in soup.find_all('desc')]

        queries = [self._clean(raw) for raw in tqdm(rawQueries)]

        return pd.DataFrame({'qid': qNum, 'query': queries})

### building the index

In [None]:
class IndexUtil:
    """Build an index from a DataFrame and evaluate weighting models on it.

    Parameters
    ----------
    pt : module
        The PyTerrier module (or a compatible object); every PyTerrier call
        made by this class goes through it.
    index_loc : str
        Location handed to ``pt.DFIndexer``.
    """

    def __init__(self,
                 pt,
                 index_loc):
        self.pt = pt
        self.index_loc = index_loc
        # set by buildIndex(); None until an index has been built
        self.index_ref = None

    def buildIndex(self, data, index_column, meta_columns):
        """Index ``data[index_column]``, passing each of ``meta_columns`` as metadata.

        The value returned by the indexer is stored in ``self.index_ref``.
        """
        # use the injected module, consistently with evaluate()
        pd_indexer = self.pt.DFIndexer(self.index_loc)
        meta_data = {meta_column: data[meta_column] for meta_column in meta_columns}

        self.index_ref = pd_indexer.index(data[index_column], **meta_data)

    def evaluate(self, wv_models, topics, qrels, metrics):
        """Run ``pt.Experiment`` with one ``BatchRetrieve`` per weighting model.

        Parameters
        ----------
        wv_models : iterable of str
            Weighting model names, each passed as ``wmodel``.
        topics, qrels, metrics
            Forwarded unchanged to ``pt.Experiment``.

        Raises
        ------
        RuntimeError
            If no index has been built yet.
        """
        if self.index_ref is None:
            raise RuntimeError("no index available: call buildIndex() before evaluate()")

        batchRetrieval = [
            self.pt.BatchRetrieve(self.index_ref, wmodel=wv_model)
            for wv_model in wv_models
        ]
        return self.pt.Experiment(
            batchRetrieval,
            topics,
            qrels,
            metrics
        )

### driver code

In [None]:
# defining dataset global vars
# All three paths point inside ./data/FIRE_Dataset_EN_2010, the directory the
# extraction cell unpacks into.
# topic (query) file read by QueryReader
DATASET_TOPICS = './data/FIRE_Dataset_EN_2010/en.topics.76-125.2010.txt'
# relevance judgements read by QrelsReader (the extraction cell gunzips this file)
DATASET_QRELS = './data/FIRE_Dataset_EN_2010/en.qrels.76-125.2010.txt'
# root of the document collection walked by CorporaReader
DATASET_ROOT  = './data/FIRE_Dataset_EN_2010/TELEGRAPH_UTF8/'

In [None]:
# List every file under DATASET_ROOT, then parse and pre-process each one into a
# DataFrame with 'docno' and 'text' columns. This is the slow step: the recorded
# run took 08:45 for 125,586 files.
cr = CorporaReader(DATASET_ROOT)
corpora = cr.read()
corpora.head()

Indexing all the data files


100%|██████████| 125586/125586 [08:45<00:00, 238.79it/s]


Unnamed: 0,docno,text
0,1070314_opinion_story_7505795.utf8,telegraph calcutta opinion carnival begin raju...
1,1070410_opinion_story_7625033.utf8,telegraph calcutta opinion nothing board never...
2,1070818_opinion_story_8202960.utf8,telegraph calcutta opinion straitjacket ananda...
3,1070319_opinion_story_7530650.utf8,telegraph calcutta opinion hand dirty pie suma...
4,1070530_opinion_story_7851017.utf8,telegraph calcutta opinion keep flag flying na...


In [None]:
# Load the relevance judgements; QrelsReader.read() delegates to pt.io.read_qrels.
qr = QrelsReader(DATASET_QRELS)
qrels = qr.read()
qrels.head()

Unnamed: 0,qid,docno,label
0,76,1040901_nation_story_3702283.utf8,0
1,76,1040901_opinion_story_3675790.utf8,0
2,76,1040902_nation_story_3707291.utf8,0
3,76,1040904_opinion_story_3713095.utf8,0
4,76,1040908_calcutta_story_3729202.utf8,0


In [None]:
# Parse the topic file: <num> gives the qid, <desc> gives the query text, which is
# run through a default-configured PreProcessString, the same setup CorporaReader uses.
q = QueryReader(DATASET_TOPICS)
topics = q.read()
topics.head()

100%|██████████| 50/50 [00:00<00:00, 1928.95it/s]


Unnamed: 0,qid,query
0,76,reason behind protest meena leader inclusion g...
1,77,attack hezbollah guerrilla indian israeli force
2,78,conflict ashok singhal president vishwa hindu ...
3,79,plan build road china mount everest
4,80,initiation legal proceeding advani involvement...


In [None]:
# Only stores the PyTerrier module and the index location; the indexer itself is
# created later, inside buildIndex().
index = IndexUtil(pt, "./pd_index")

In [None]:
# Index the 'text' column and pass 'docno' along as document metadata.
index.buildIndex(
    corpora, 
    'text', 
    ['docno']
)

In [None]:
# Compare the TF_IDF and BM25 weighting models on the loaded topics/qrels,
# reporting MAP and nDCG for each.
index.evaluate(
    ['TF_IDF', 'BM25'], 
    topics, 
    qrels, 
    ['map','ndcg']
)

Unnamed: 0,name,map,ndcg
0,BR(TF_IDF),0.446071,0.705144
1,BR(BM25),0.445547,0.703292
