# Importing Dependencies

In [1]:
import requests
import bs4
import nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.stem.cistem import Cistem



# Tokenizers

In [5]:
def lemmatize_stemming(text):
    stemmer = Cistem()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

# Extracting TOI Articles

In [29]:
class times_of_india():
    def __init__(self):
        self.root = 'https://timesofindia.indiatimes.com/'
        self.page = requests.get(self.root)
        self.soup = bs4.BeautifulSoup(self.page.text, 'lxml')
        self.hreflinks = {}
        self.redirects = []
        self.database = []
        self.imp_sections = ['div.top-story', 'div.tabcontent.mostshared', 'div.tabcontent.mostcommented','div.tabcontent.latest',  'div.tabcontent.mosttrending']
        
    def extract_href(self, imp_sections, soup, output):
        for sections in imp_sections:
            for section in soup.select(sections):
                all_links = section.find_all("a")
                for link in all_links:
                    if link.text[0:3] != 'See':
                        self.hreflinks[link.text] = link['href']
                    else:    
                        output.append(link['href'])
    
    def extract_text(self):
        self.extract_href(self.imp_sections, self.soup, self.redirects)
        for redirect_link in self.redirects:
            output_links = []
            page_red = requests.get(self.root +"/" + redirect_link)
            soup_red = bs4.BeautifulSoup(page_red.text, "lxml")
            self.extract_href(['div.widget.box13'], soup_red, output_links)
        for i, article_text in enumerate(self.hreflinks.values()):
            page = requests.get(self.root + "/" + article_text)
            soup = bs4.BeautifulSoup(page.text, 'lxml')
            article = soup.select('div._3WlLe.clearfix')
            if len(article):
                self.database.append(preprocess(article[0].text))
                with open(r"./output/output_" + str(i) + ".txt" , "w",  encoding="utf-8") as txt_file:
                    txt_file.write(str("") + article[0].text + str(""))
        

In [30]:
toi = times_of_india()

In [31]:
toi.extract_text()

# Making a Dictionary & Bow Corpus

In [34]:
dictionary = gensim.corpora.Dictionary(toi.database)

In [36]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 thing
1 temporary
2 barricad
3 sta
4 bord
5 stop
6 peopl
7 cross
8 amid
9 coronaviru
10 pandemic


In [38]:
bow_corpus = [dictionary.doc2bow(doc) for doc in toi.database]

In [39]:
document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 3 ("sta") appears 1 time.
Word 6 ("peopl") appears 2 time.
Word 28 ("distric") appears 2 time.
Word 37 ("stand") appears 1 time.
Word 61 ("repor") appears 2 time.
Word 62 ("covid") appears 4 time.
Word 63 ("cas") appears 5 time.
Word 67 ("monday") appears 1 time.
Word 96 ("delhi") appears 1 time.
Word 97 ("sunday") appears 1 time.
Word 99 ("say") appears 5 time.
Word 115 ("governm") appears 1 time.
Word 117 ("work") appears 1 time.
Word 123 ("hour") appears 1 time.
Word 141 ("crisi") appears 1 time.
Word 148 ("downloadth") appears 1 time.
Word 149 ("tim") appears 2 time.
Word 150 ("india") appears 4 time.
Word 151 ("new") appears 1 time.
Word 152 ("lat") appears 1 time.
Word 153 ("daily") appears 2 time.
Word 154 ("morning") appears 2 time.
Word 155 ("newspap") appears 1 time.
Word 156 ("order") appears 1 time.
Word 167 ("tak") appears 2 time.
Word 169 ("total") appears 4 time.
Word 171 ("death") appears 1 time.
Word 173 ("viru") appears 1 time.
Word 179 ("numb") appears 2 time.
W

In [40]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 8, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [41]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.021*"india" + 0.015*"military" + 0.013*"tim" + 0.013*"say" + 0.013*"governm" + 0.013*"billio" + 0.012*"sta" + 0.012*"offic" + 0.010*"expenditur" + 0.008*"repor"


Topic: 1 
Words: 0.020*"india" + 0.020*"say" + 0.019*"typ" + 0.015*"icmr" + 0.012*"kit" + 0.011*"research" + 0.010*"covid" + 0.010*"tes" + 0.010*"coronaviru" + 0.010*"procur"


Topic: 2 
Words: 0.019*"tobacco" + 0.016*"produc" + 0.016*"adverti" + 0.013*"crick" + 0.010*"india" + 0.010*"health" + 0.010*"bcci" + 0.007*"tim" + 0.006*"director" + 0.006*"ask"


Topic: 3 
Words: 0.018*"india" + 0.015*"tes" + 0.012*"crea" + 0.011*"tim" + 0.010*"kit" + 0.010*"say" + 0.009*"sta" + 0.009*"repor" + 0.008*"governm" + 0.008*"fak"


Topic: 4 
Words: 0.038*"cas" + 0.024*"covid" + 0.019*"say" + 0.018*"sta" + 0.016*"repor" + 0.016*"total" + 0.016*"coronaviru" + 0.015*"india" + 0.014*"mini" + 0.012*"positiv"


Topic: 5 
Words: 0.029*"china" + 0.020*"india" + 0.017*"chi" + 0.015*"south" + 0.011*"ship" + 0.009*"isla" + 0.009*"o