### **IR LAB Assignment 2**
submitted by Tarang Ranpara (202011057)

In [1]:
import os
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
import pandas as pd 
import math
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

os.chdir('./drive/MyDrive/IRLAB/A2')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


1. Perform preprocessing steps (stopword removal, stemming, text cleaning etc) on the dataset

In [2]:
class DataReader:
  def __init__(self):
    self.content = []

  def get_data(self):

    '''
      returns: list of texts from each doc
    '''
    files = os.listdir()
    for file in files:
      print(file)
      with open(file, mode='r', encoding='utf8') as data:
        text = data.read()

      soup = BeautifulSoup(text, 'lxml')
      text = soup.find('text').text
      self.content.append(text)

    return self.content

In [3]:
class PreProcessor:
  def __init__(self, content):
    self.content = content
    self.tokens = []
    self.processed_docs = []

  def get_tokens(self):
    return self.tokens

  def get_processed_docs(self):
    return self.processed_docs

  def remove_punctuations(self, text):
    return re.sub(r'[^\w\s]', '', text)

  def remove_numbers(self, text):
    return re.sub(r'\d+', '', text)

  def remove_stopwords(self, tokens):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stopwords]
    return tokens

  def tokenize(self, text):
    return nltk.word_tokenize(text)

  def lemmatize(self, tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, tokens))

  def stemming(self, tokens):
    ps = PorterStemmer()
    return list(map(ps.stem, tokens))

  def process(self, 
              remove_punctuations = True,
              remove_numbers = True, 
              remove_stopwords = True,
              lemmatize = True
              ):

    for text in self.content:
      #converting text to lower
      text = text.lower()

      # removing punctuations
      if remove_punctuations:
        text = self.remove_punctuations(text)
      
      # removing numbers
      if remove_numbers:
        text = self.remove_numbers(text)

      # tokenizing
      tokens = self.tokenize(text)

      # removing stop words
      if remove_stopwords:
        tokens = self.remove_stopwords(tokens)

      # performing lemmatization or stemming
      if lemmatize:
        tokens = self.lemmatize(tokens)
      else:
        tokens = self.stemming(tokens)

      # to check if doc is not empty
      if len(tokens) > 0:
        self.tokens.append(tokens)
        self.processed_docs.append(' '.join(tokens))

In [4]:
datareader = DataReader()
content = datareader.get_data()

1041113_business_story_3998966.utf8
1041123_business_story_4037653.utf8
1040930_business_story_3821784.utf8
1041211_business_story_4114567.utf8
1040901_business_story_3702421.utf8
1040929_business_story_3817146.utf8
1041206_business_story_4091717.utf8
1040925_business_story_3801611.utf8
1040911_business_story_3744317.utf8
1040928_business_story_3811977.utf8
1041006_business_story_3846696.utf8
1040919_business_story_3776028.utf8
1041029_business_story_3937760.utf8
1041209_business_story_4104959.utf8
1040929_business_story_3817124.utf8
1041225_business_story_4172137.utf8
1041012_business_story_3871731.utf8
1040929_business_story_3817129.utf8
1040928_business_story_3812742.utf8
1041005_business_story_3840358.utf8
1041010_business_story_3864297.utf8
1041125_business_story_4045942.utf8
1041104_business_story_3963244.utf8
1041001_business_story_3824749.utf8
1041211_business_story_4114562.utf8
1041115_business_story_4005402.utf8
1040916_business_story_3763553.utf8
1041117_business_story_40126

In [5]:
# preprocessing the docs - options can be chosen according to your need
preprocessor = PreProcessor(content)
preprocessor.process()

In [6]:
processed_docs = preprocessor.get_processed_docs()
processed_docs[0]

'telegraph calcutta business oil fire nigeria prepares strike london nov afp oil price rebounded today recovering ground following heavy loss nigeria geared general strike next week threatens disrupt country crude export analyst said new york main contract light sweet crude delivery december climbed cent barrel electronic trading gmt ist plunging thursday brent north sea crude december cent higher london closed day biggest reason rebound general strike planned nigeria next week said graham sharp director energy trading brokerage firm trafigura actually shut crude production think well see spike price added nigeria main oil union thursday said member would join next week planned nationwide strike warned disruption crude export meanwhile nigerian court declared stoppage due begin next tuesday illegal export million barrel per day nigeria africa biggest oil producer sixth largest globally sweet light crude ideal refining petrol supply per cent u oil need'

In [7]:
# generating tokens
tokens = preprocessor.get_tokens()
tokens[0][:10]

['telegraph',
 'calcutta',
 'business',
 'oil',
 'fire',
 'nigeria',
 'prepares',
 'strike',
 'london',
 'nov']

2. Implement TF-IDF approach for the dataset. Show the size of the term-document matrix.


In [8]:
class TFIDF: 

  def __init__(self):
    self.corpus = None
    self.wordset = set()
    self.tf = []
    self.idf = []
    self.tfidf = []
    self.wordOcc = []

  def calcTF(self, wordDict, N):
    tfDict = {}
    for word, count in wordDict.items():
      tfDict[word] = count/float(N)
    return tfDict

  def calcIDF(self, docList):
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
      for word, val in doc.items():
        if val > 0:
          idfDict[word] += 1
    
    for word, val in idfDict.items():
      idfDict[word] = math.log10(N / float(val))
        
    return idfDict

  def calcTFIDF(self, tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
      tfidf[word] = val*idfs[word]
    return tfidf

  def fit_transform(self, corpus):
    self.corpus = corpus

    all_tokens = []
    for tokens in corpus:
      all_tokens.extend(tokens)
    self.wordset = set(all_tokens)

    empty_dict = dict.fromkeys(self.wordset, 0)
    
    for doc in self.corpus:
      wordDict = empty_dict.copy() 
      for token in doc:
        wordDict[token] += 1

      self.tf.append(self.calcTF(wordDict, len(doc)))
      self.wordOcc.append(wordDict)

    self.idf = self.calcIDF(self.wordOcc)

    for tf_row in self.tf:
      self.tfidf.append(self.calcTFIDF(tf_row, self.idf))

    return self.tfidf

  def transform(self, corpus):
    res = []
    empty_dict = dict.fromkeys(self.wordset, 0)
    for sentence_tokens in corpus:
      word_dict = empty_dict.copy()
      for token in sentence_tokens:
        if token in empty_dict:
          word_dict[token] += 1

      tf = self.calcTF(word_dict, len(word_dict))
      tfidf = self.calcTFIDF(tf, self.idf)
      res.append(tfidf)

    return res

In [9]:
# fitting our data in our TFIDF Model
tfidf = TFIDF()
res = tfidf.fit_transform(tokens)

In [10]:
res = pd.DataFrame(res)
res.head()

Unnamed: 0,bubble,criminal,derisk,seventysix,blizzard,diminutive,deaf,provided,appealed,decelerating,connected,umeha,proprietorship,robustness,ocean,offload,franchiseeowned,bios,fourweek,describes,bachchan,catching,stability,snag,seventhlargest,nifty,newsprint,diplomat,illogical,disciplinary,pension,ppf,winching,he,longdistance,girding,anirban,story,breath,reasonably,...,symphony,lingerie,catalyse,benchmark,ignites,benjamin,questionable,valuing,unwittingly,cosy,sibal,deploy,tate,defined,swift,mahendra,chatkal,albany,db,metro,inoculates,linjeflyg,gmg,declined,homogenous,waved,discounted,pertussis,ramanadurg,vashishta,manchala,pfrda,ninety,sixmember,udyog,problemsolving,systemlevel,apple,intensely,midseason
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.020145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# checking transform method
transformed_custom = tfidf.transform(tokens[:5])
transformed_custom = pd.DataFrame(transformed_custom)
transformed_custom

Unnamed: 0,bubble,criminal,derisk,seventysix,blizzard,diminutive,deaf,provided,appealed,decelerating,connected,umeha,proprietorship,robustness,ocean,offload,franchiseeowned,bios,fourweek,describes,bachchan,catching,stability,snag,seventhlargest,nifty,newsprint,diplomat,illogical,disciplinary,pension,ppf,winching,he,longdistance,girding,anirban,story,breath,reasonably,...,symphony,lingerie,catalyse,benchmark,ignites,benjamin,questionable,valuing,unwittingly,cosy,sibal,deploy,tate,defined,swift,mahendra,chatkal,albany,db,metro,inoculates,linjeflyg,gmg,declined,homogenous,waved,discounted,pertussis,ramanadurg,vashishta,manchala,pfrda,ninety,sixmember,udyog,problemsolving,systemlevel,apple,intensely,midseason
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# shape of document term matrix
print(f'Shape of document term matrix: {res.shape}')

Shape of document term: (2114, 21470)


3. Use TF-IDF vectorizer from sklearn library to generate TF-IDF for your documents. Show the size of the
term-document matrix

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_sklearn = TfidfVectorizer()
doc_term_matrix = tfidf_sklearn.fit_transform(processed_docs)
print(f'Features: {tfidf_sklearn.get_feature_names()[:15]}..\n')

print('shape of matrix:', doc_term_matrix.shape)

Features: ['aa', 'aaa', 'aaaind', 'aaarated', 'aaastable', 'aad', 'aai', 'aaifr', 'aalayance', 'aamir', 'aarohi', 'aaron', 'aarvee', 'aastable', 'aazmao']..

shape of matrix: (2114, 21449)


4. Pick the first five documents from the list and show the top five words representing each document along
with their TF-IDF scores

In [14]:
transformed = tfidf_sklearn.transform(processed_docs[:5])
transformed = pd.DataFrame(transformed.toarray(), columns=tfidf_sklearn.get_feature_names())

print('Sklearn TFIDF')
for i in range(5):
  row = transformed.loc[i].sort_values()[-5:]
  print(f'Doc {i}')
  print(row)
  print('-'*45)

Sklearn TFIDF
Doc 0
sweet      0.170530
oil        0.208035
strike     0.294208
crude      0.318931
nigeria    0.394305
Name: 0, dtype: float64
---------------------------------------------
Doc 1
excess     0.195870
farm       0.201883
deposit    0.202464
rate       0.347997
lending    0.474590
Name: 1, dtype: float64
---------------------------------------------
Doc 2
electricity    0.237761
parikh         0.242412
sayeed         0.278628
free           0.379615
power          0.420856
Name: 2, dtype: float64
---------------------------------------------
Doc 3
civil            0.167553
modernisation    0.191388
patel            0.191388
aviation         0.207680
airport          0.569435
Name: 3, dtype: float64
---------------------------------------------
Doc 4
scientist    0.156097
research     0.195599
biocon       0.267847
novartis     0.463008
syngene      0.537103
Name: 4, dtype: float64
---------------------------------------------


5. Show these words for your approach as well as using TF-IDF vectorizer.


In [15]:
# printing top 5 words from top 5 docs
# using our custom model
print('custom TFIDF')
for i in range(5):
  row = transformed_custom.loc[i].sort_values()[-5:]
  print(f'Doc {i}')
  print(row)
  print('-'*45)

custom TFIDF
Doc 0
oil        0.000196
sweet      0.000206
crude      0.000335
strike     0.000342
nigeria    0.000466
Name: 0, dtype: float64
---------------------------------------------
Doc 1
cushioning    0.000155
excess        0.000159
farm          0.000165
rate          0.000211
lending       0.000381
Name: 1, dtype: float64
---------------------------------------------
Doc 2
electricity    0.000375
parikh         0.000465
sayeed         0.000489
power          0.000536
free           0.000577
Name: 2, dtype: float64
---------------------------------------------
Doc 3
civil            0.000236
modernisation    0.000280
patel            0.000280
aviation         0.000287
airport          0.000808
Name: 3, dtype: float64
---------------------------------------------
Doc 4
scientist    0.000319
research     0.000328
biocon       0.000552
novartis     0.000979
syngene      0.001239
Name: 4, dtype: float64
---------------------------------------------


**Observation**: values in our custom TFIDF table look comparatively very small compared to sklearn TFIDF values. we can bring them between [0,1] by following method. 



In [16]:
columns = res.columns
arr = np.array(res.values)
arr = (arr - arr.min()) / (arr.max() - arr.min())

res = pd.DataFrame(arr, columns=columns)

# printing top 5 values from first 5 docs, as we did in our custom model
# as we can see, values got larger but the order remained same. 
for i in range(5):
  row = res.loc[i].sort_values()[-5:]
  print(f'Doc {i}')
  print(row)
  print('-'*45)

Doc 0
oil        0.113887
sweet      0.119597
crude      0.194392
strike     0.198367
nigeria    0.270830
Name: 0, dtype: float64
---------------------------------------------
Doc 1
cushioning    0.150226
excess        0.153778
farm          0.159827
rate          0.204199
lending       0.369321
Name: 1, dtype: float64
---------------------------------------------
Doc 2
electricity    0.143041
parikh         0.177350
sayeed         0.186759
power          0.204706
free           0.220129
Name: 2, dtype: float64
---------------------------------------------
Doc 3
civil            0.092365
modernisation    0.109359
patel            0.109359
aviation         0.112097
airport          0.315599
Name: 3, dtype: float64
---------------------------------------------
Doc 4
scientist    0.124104
research     0.127694
biocon       0.214734
novartis     0.380566
syngene      0.481858
Name: 4, dtype: float64
---------------------------------------------
