Downloading Dataset using !gdown

In [77]:
# Download the dataset archive from Google Drive using its file id.
# NOTE(review): newer gdown releases reportedly deprecate `--id` and take the
# id as a positional argument - confirm against the installed version.
!gdown --id 1VR1bO9D3f3606BBYGWaS5Dt2cXfnzASU

Downloading...
From: https://drive.google.com/uc?id=1VR1bO9D3f3606BBYGWaS5Dt2cXfnzASU
To: /content/business.zip
0.00B [00:00, ?B/s]2.98MB [00:00, 92.7MB/s]


Unzipping the zipped file

In [86]:
!unzip /content/business.zip

Archive:  /content/business.zip
replace business/1040901_business_index.utf8? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: business/1040901_business_index.utf8  
replace business/1040901_business_story_3700171.utf8? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: business/1040901_business_story_3700171.utf8  
  inflating: business/1040901_business_story_3700827.utf8  
  inflating: business/1040901_business_story_3701515.utf8  
  inflating: business/1040901_business_story_3701518.utf8  
  inflating: business/1040901_business_story_3701887.utf8  
  inflating: business/1040901_business_story_3701938.utf8  
  inflating: business/1040901_business_story_3701940.utf8  
  inflating: business/1040901_business_story_3702060.utf8  
  inflating: business/1040901_business_story_3702108.utf8  
  inflating: business/1040901_business_story_3702110.utf8  
  inflating: business/1040901_business_story_3702411.utf8  
  inflating: business/1040901_business_story_3702412.utf8  
  inflating: bus

Importing necessary libraries and packages

In [87]:
import os
import glob
from bs4 import BeautifulSoup
import lxml.html
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import bisect
import math
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

Downloading NLTK packages

In [88]:
# punkt: tokenizer models needed by nltk's word_tokenize
nltk.download('punkt')
# stopwords: the word lists behind stopwords.words(...)
nltk.download('stopwords')
# wordnet: lexical database used by WordNetLemmatizer
nltk.download('wordnet')
# averaged_perceptron_tagger: model used by nltk.pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Extracting data from corpus using BeautifulSoup and performing pre-processing steps

In [89]:
# One entry per document: [file name, list of pre-processed words].
files = []
# Characters stripped from the raw text before tokenising. The backslash is
# written as an explicit "\\" escape: the previous "\," was an invalid escape
# sequence that only worked by accident and triggers a warning on newer
# Python versions. The resulting set of characters is unchanged.
punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Alias for the WordNet corpus reader; its ADJ/NOUN/VERB/ADV constants are the
# part-of-speech codes passed to the lemmatizer.
postag = nltk.corpus.wordnet

# Maps a word to the WordNet part-of-speech constant used for lemmatization
def wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with nltk.pos_tag, and the lower-cased first
    letter of the resulting tag selects the constant: "j" -> ADJ, "n" -> NOUN,
    "v" -> VERB, "r" -> ADV. Any other tag falls back to NOUN.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].lower()
    if initial == "j":
        return postag.ADJ
    if initial == "v":
        return postag.VERB
    if initial == "r":
        return postag.ADV
    # "n" and every unrecognised tag both resolve to NOUN
    return postag.NOUN

# Directory holding the extracted *.utf8 articles. `path` was used here without
# ever being assigned in the notebook (it only worked through leftover kernel
# state); "business" is the folder the archive unpacks to and matches the file
# names shown in the outputs.
path = "business"

# Loop-invariant helpers, built once instead of once per file.
stop_words = set(stopwords.words('english'))
# Translation table that deletes every character listed in `punc`.
strip_punctuation = str.maketrans('', '', punc)

for file_obj in glob.glob(os.path.join(path, "*.utf8")):
    # Read the markup; `with` closes the handle and the encoding is explicit
    # instead of depending on the platform default.
    with open(file_obj, "r", encoding="utf-8") as handle:
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (lxml is already imported above).
        soup = BeautifulSoup(handle.read(), "lxml")

    # Extracting the article body; a file without a <text> element is kept as
    # an empty document instead of failing with an AttributeError on None.
    text_node = soup.find('text')
    raw_text = text_node.get_text() if text_node is not None else ""

    # Removing punctuation in a single pass
    raw_text = raw_text.translate(strip_punctuation)

    # Keeping purely alphabetic words, converted to lower-case
    words = [word.lower() for word in raw_text.split() if word.isalpha()]
    # Removing stopwords
    words = [w for w in words if w not in stop_words]
    # Performing lemmatization
    lemmatized = [lemmatizer.lemmatize(word, wordnet_pos(word)) for word in words]

    # Storing [file name, pre-processed words] in the 'files' list
    files.append([str(file_obj), lemmatized])

Extracting terms and calculating IDF for these terms.

In [90]:
# Storing unique words in a set. update() grows the set in place; the previous
# `terms = terms.union(...)` copied the whole vocabulary once per document.
terms = set()
for curr_file in files:
    terms.update(curr_file[1])

# Converting set to sorted list: position in this list is the term's column
term_index = sorted(terms)
# Term -> column lookup, so each term is resolved once in constant time
term_column = {term: column for column, term in enumerate(term_index)}

# Document frequency (df): number of documents containing each term
df = [0] * len(term_index)
for curr_file in files:
    # set() so a term is counted at most once per document
    for term in set(curr_file[1]):
        df[term_column[term]] += 1

# Calculating IDF as the natural log of (number of documents / df).
# Every term comes from at least one document, so df is never zero here.
total_docs = len(files)
idf = [math.log(total_docs / doc_freq) for doc_freq in df]

Calculating TF-IDF matrix

In [91]:
# Dense matrix: one row of per-term TF-IDF weights for every document
tf_idf = []
vocab_size = len(term_index)

for doc in files:
    # Raw term counts, one slot per vocabulary entry
    counts = [0] * vocab_size
    for term in doc[1]:
        # term_index is sorted, so binary search gives the term's column
        column = bisect.bisect_left(term_index, term)
        counts[column] += 1
    # Weighting every count by the IDF of its term
    tf_idf.append([counts[col] * idf[col] for col in range(vocab_size)])
print("Size of TF-IDF matrix using own approach is:", len(tf_idf)*len(tf_idf[0]))

# Sparse form: for each document, the [column, score] pairs with non-zero score
sparse_tf_idf = [
    [[col, score] for col, score in enumerate(dense_row) if score != 0]
    for dense_row in tf_idf
]

# Size of the sparse matrix = number of stored entries across all rows
sparse_size = sum(len(sparse_row) for sparse_row in sparse_tf_idf)
print("Size of Sparse TF-IDF matrix using own approach is:", sparse_size)

Size of TF-IDF matrix using own approach is: 40500135
Size of Sparse TF-IDF matrix using own approach is: 275428


Generating TF-IDF matrix using sklearn vectorizer

In [92]:
# Corpus for the sklearn vectorizer: every document becomes a single string in
# which each pre-processed word is followed by one space
corpus = [
    ''.join(word + ' ' for word in curr_file[1])
    for curr_file in files
]

# Fitting the vectorizer and producing the TF-IDF matrix in one step
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Size of the sklearn matrix = rows * columns
row_count, column_count = X.shape
print("Size of TF-IDF matrix using sklearn is:", row_count*column_count)

Size of TF-IDF matrix using sklearn is: 40455720


Printing the top five words and their TF-IDF scores for the first five documents in the list

In [93]:
def _print_top_terms(label, doc_name, scored_columns, vocabulary, top_n=5):
    """Print the `top_n` highest-scoring terms of one document.

    label          -- text printed in front of the document name
    doc_name       -- document identifier shown in the heading
    scored_columns -- iterable of (column index, score) pairs for the document
    vocabulary     -- sequence mapping a column index to its term; it must be
                      the vocabulary the scores were computed against
    top_n          -- maximum number of terms to print
    """
    print(label, doc_name)
    print("{:<15}{:^10}{:>15}".format('Word', ':', 'Score'))
    # Highest score first; equal scores keep ascending column order
    ranked = sorted(scored_columns, key=lambda pair: (-pair[1], pair[0]))
    for column, score in ranked[:top_n]:
        print("{:<15}{:^10}{:>15}".format(vocabulary[column], ':', score))


# Never index past the end when the corpus has fewer than five documents
preview_count = min(5, len(files))

for i in range(preview_count):
    # sparse_tf_idf[i] holds [column index, non-zero TF-IDF score] pairs
    _print_top_terms("\nTop five words of document in own approach",
                     files[i][0], sparse_tf_idf[i], term_index)

print('\n\n\n')

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    sklearn_terms = vectorizer.get_feature_names_out()
else:
    sklearn_terms = vectorizer.get_feature_names()

# Read each row's stored entries straight from the CSR arrays instead of
# probing every X[i, j], which is extremely slow on a sparse matrix.
X_csr = X.tocsr()
for i in range(preview_count):
    start, stop = X_csr.indptr[i], X_csr.indptr[i + 1]
    scored = [
        (column, score)
        for column, score in zip(X_csr.indices[start:stop], X_csr.data[start:stop])
        if score != 0
    ]
    # sklearn's column indices refer to the vectorizer's own vocabulary, not to
    # term_index, so the words must be looked up in sklearn_terms.
    _print_top_terms("\nTop five words of document in sklearn",
                     files[i][0], scored, sklearn_terms)


Top five words of document in own approach business/1041023_business_story_3916768.utf8
Word               :               Score
car                :     24.493521930766306
rate               :     16.359192948878146
loan               :     14.952144705341187
segment            :     14.364719415452956
pattanaik          :     13.927325821840865

Top five words of document in own approach business/1041209_business_story_4105116.utf8
Word               :               Score
uti                :     63.28531219996482
mip                :     58.283450706308045
hdfc               :     49.52763563475508
plus               :     39.9390466544597
equity             :     38.63367756186066

Top five words of document in own approach business/1041101_business_index.utf8
Word               :               Score
business           :     0.00047292505019538747
calcutta           :     0.00047292505019538747
telegraph          :     0.00047292505019538747

Top five words of document in own appr