In [1]:
#imports
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
import nltk
from datascience import *
import warnings
warnings.filterwarnings('ignore')

  matplotlib.use('agg', warn=False)
  matplotlib.use('agg', warn=False)


In [2]:
!gdown "https://drive.google.com/uc?id=1VR1bO9D3f3606BBYGWaS5Dt2cXfnzASU"

Downloading...
From: https://drive.google.com/uc?id=1VR1bO9D3f3606BBYGWaS5Dt2cXfnzASU
To: /content/business.zip
0.00B [00:00, ?B/s]2.98MB [00:00, 47.2MB/s]


In [3]:
#using python to unzip the file
import zipfile
with zipfile.ZipFile('business.zip', 'r') as zip_ref:
  zip_ref.extractall()

In [4]:
def import_files(path, mode='r'):
  """
  The function returns a list of iotext objects for each text file in the path passed as parameter
  """
  files = []
  for file_name in os.listdir(path):
    file = open(path + file_name, mode)
    files.append(file)
  return files

In [5]:
files = import_files("business/")

In [6]:
%%capture
#Reading all files 
documents = [BeautifulSoup(file.read()).get_text() for file in files]
documents


In [7]:

def tokenize_words(text):
  """
  The function tokenizes the words removes special characters, numbers and returns list of lowercase tokens
  """
  text = text.lower()
  tokens = nltk.regexp_tokenize(text, '[a-zA-Z]+', gaps=False)
  
  return [token.strip() for token in tokens ]

In [8]:

def remove_stopwords(words):
  """
  Removes the stop words from a list of words
  """
  nltk.download('stopwords')
  from nltk.corpus import stopwords
  engstopwords = stopwords.words('english')
  nostopwords = list(filter(lambda x: x not in engstopwords and len(x) != 1, words))
  return nostopwords

In [9]:

def displaywords(words, func,  limit,  label='New', display=True):
  """
  displays the words changed in table and returns list of words after applying the function
  """
  newwords = [func(word) for word in words]
  
  if display:
    tbl = Table()
    tbl = tbl.with_columns("Word", words, label + ' word', newwords)
    
    tbl.show(limit)
  return newwords


In [10]:

def lemmatize(words, display=True, limit=20 ):
  """
  prints the word and lemmatized word and also returns the list of lemmatized words
  """
  from nltk.stem import 	WordNetLemmatizer
  wnl = WordNetLemmatizer()
  nltk.download('wordnet')
  lemmatizedwords = displaywords(words, wnl.lemmatize, limit, "Lemmatized", display)
  return lemmatizedwords

In [11]:
def preprocess(text):
  return lemmatize(remove_stopwords(tokenize_words(text)),display=False)



In [12]:

processDocs = lambda x: {f"doc{i}":preprocess(x[i-1]) for i in range(1,len(x)+1) if len(preprocess(x[i-1])) != 0}




In [13]:
from sklearn import feature_extraction as fe
import math

In [14]:
#Using sklearn's TfidfVectorizer
x = fe.text.TfidfVectorizer()
y  = x.fit_transform(documents)
print(y.shape)
print(x.get_feature_names())

(2115, 25053)


In [15]:
x.get_feature_names()[3903]

'a5'

As we can see there are arround 4000 words which start with numbers. Out of 21000 words left, I got around 19000 words after removing stopwords and performing lemmatization.  

In [16]:
import collections
class tfidf:
  """

  """
  def __init__(self, docs, processDocs= lambda x:{f"Doc{i+1}":x[i].split() for i in range(len(x))}):
    """
    Term- document matrix

    Parameters: docs -> list of document texts 
                processDocs  -> function that performs preprocessing on list of text strings and returns dictonary containing all documents and words


    """
    assert type(docs) == list, "Docs should be list of documents"
    self.docDict = processDocs(docs)
    for x in self.docDict:
      self.docDict[x] = dict(collections.Counter(self.docDict[x]))  
    words = []
    for wrds in self.docDict.values():
      words.extend(wrds.keys())
    self.wordSet = set(words)
    self.tdm = pd.DataFrame(columns=self.wordSet)
  
  @property
  def num_docs(self):
    """
    Returns count of documents 
    """
    return len(self.docDict)

  @property
  def num_tokens(self):
    """
    Returns total number of unique words in the whole corpus 
    """
    return len(self.wordSet)
  
  @property
  def shape(self):
    """
    Returns shape of the term document matrix
    """
    return self.tdm.shape
  
  def lenDoc(self, doc):
    """
    Returns the length of doc passed as argument
    """
    return sum(self.docDict[doc].values())
  
  def get_tf(self, term, doc):
    """
    Returns the term frequency for the given term and doc 
    """
    if term in self.docDict[doc]:
      return self.docDict[doc][term]/self.lenDoc(doc)
    else:
      return 0

  def get_idf(self, term):
    """
    Returns the inverse document frequency for the given term 
    """
    idf = sum([self.docDict[doc][term] for doc in self.docDict if term in self.docDict[doc]])
    return math.log(self.num_docs/idf)

  def vectorizer(self):
    """
    Creates the tfidf matrix for words and documents
    """
    for word in self.wordSet:
      idf = self.get_idf(word)
      tfidfScore = np.array([self.get_tf(word, doc)*idf for doc in self.docDict])
      self.tdm[word] = tfidfScore
    return self.tdm
      
  def display(self, numdocs=100, numwords=20):
    """
    """
    return self.tdm.head()



In [17]:
%%capture
#Creating tfidf object
mytfidf = tfidf(documents, processDocs)



In [18]:
mytfidf.vectorizer()

Unnamed: 0,present,steepest,wohl,cited,smarajit,providence,clock,hobe,jaiprakash,brandy,voluminous,foam,contains,bike,motley,apac,sujan,stink,nusli,publicly,hpl,fx,casing,pani,chakravarty,optimism,closed,khandelwal,kannan,transmit,inquisitiveness,evaders,fortnightly,ngalenmmi,placard,lawyer,oem,purchasing,raced,spykar,...,aranda,madison,khimji,forget,ajita,docket,dsas,pinned,saddled,sleepless,whittled,telelink,plea,dweller,psf,preselected,bhagyanagar,gdc,euphoria,slogan,scored,shunting,tinker,impregnated,irctc,fortune,herbal,turbine,co,bargain,matri,prepares,zentz,unconquered,bidding,gwalior,sq,pd,scraping,describe
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.023697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.006192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2110,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2111,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2112,0.010705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011167,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2113,0.014519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#shape of the matrix
mytfidf.shape

(2115, 18708)

In [20]:
mytfidf.tdm

Unnamed: 0,present,steepest,wohl,cited,smarajit,providence,clock,hobe,jaiprakash,brandy,voluminous,foam,contains,bike,motley,apac,sujan,stink,nusli,publicly,hpl,fx,casing,pani,chakravarty,optimism,closed,khandelwal,kannan,transmit,inquisitiveness,evaders,fortnightly,ngalenmmi,placard,lawyer,oem,purchasing,raced,spykar,...,aranda,madison,khimji,forget,ajita,docket,dsas,pinned,saddled,sleepless,whittled,telelink,plea,dweller,psf,preselected,bhagyanagar,gdc,euphoria,slogan,scored,shunting,tinker,impregnated,irctc,fortune,herbal,turbine,co,bargain,matri,prepares,zentz,unconquered,bidding,gwalior,sq,pd,scraping,describe
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.023697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.006192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2110,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2111,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2112,0.010705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011167,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2113,0.014519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
#Top 5 words for first 5 docs 
{f'doc{i+1}':sorted(dict(mytfidf.tdm.iloc[i,]).items(), key=lambda x: x[1], reverse=True)[:5] for i in range(5)}

{'doc1': [('car', 0.07343591591040191),
  ('pattanaik', 0.052037474940750936),
  ('extremely', 0.04970848687158587),
  ('ramakrishnan', 0.047393360518050334),
  ('segment', 0.04565388039827451)],
 'doc2': [('magnum', 0.01607123180658699),
  ('floating', 0.015048580236193074),
  ('st', 0.014588319886195732),
  ('principal', 0.014500372519653785),
  ('hsbc', 0.014206074143420082)],
 'doc3': [('index', 0.2834954536692579),
  ('present', 0.0),
  ('steepest', 0.0),
  ('wohl', 0.0),
  ('cited', 0.0)],
 'doc4': [('adc', 0.12746829575286842),
  ('bsnl', 0.10099630299983982),
  ('cellular', 0.08433969392515202),
  ('operator', 0.06493232576796924),
  ('tariff', 0.04280812890631944)],
 'doc5': [('ambani', 0.07444244744517746),
  ('mukesh', 0.07004770711311355),
  ('friday', 0.06637753734286098),
  ('dipped', 0.05673139813655315),
  ('close', 0.05353848961848214)]}

The values are a bit higher for the sklearn library because of normalization. 

In [22]:
[np.sort(i)[::-1] for i in y.toarray()[[0,1,2,3,4]]]

[array([0.37296105, 0.35297538, 0.24360235, ..., 0.        , 0.        ,
        0.        ]),
 array([0.35608512, 0.21290653, 0.20722268, ..., 0.        , 0.        ,
        0.        ]),
 array([0.96274539, 0.12094225, 0.12094225, ..., 0.        , 0.        ,
        0.        ]),
 array([0.4408676 , 0.38247971, 0.26597334, ..., 0.        , 0.        ,
        0.        ]),
 array([0.28284765, 0.25931518, 0.25008466, ..., 0.        , 0.        ,
        0.        ])]