In [1]:
#importing libraries
import gensim

from nltk.stem import WordNetLemmatizer, SnowballStemmer
#creating instance for snowballstemmer
stemmer = SnowballStemmer("english")

In [2]:
#creating functions for processing
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>3:
            result.append(lemmatize_stemming(token).lower())
            
    return result

In [3]:
#extracting the saved text file from the directory
import glob
docs_text=[]  #store text of all documents
for files in glob.glob('*'): #all files in the present directory
    if(files.startswith('news')):
        handle = open(files,'r',encoding="utf-8",errors='ignore')
        docs_text.append(handle.read())
        handle.close()

In [4]:
#displaying the content of file
docs_text

['Three farm bills  the Essential Commodities (Amendment) Bill, the Farmers Produce Trade and Commerce (Promotion and Facilitation) Bill (commonly referred to as the APMC Bypass Bill), and the Farmers (Empowerment and Protection) Agreement on Price Assurance and Farm Services Bill  were cleared by Parliament amid protests from the Opposition. Sudha Narayanan (Associate Professor at the Indira Gandhi Institute of Development Research) and Arindam Banerjee (Associate Professor at the School of Liberal Studies at Ambedkar University) discuss the implications of the controversial Bills in a conversation moderated by Vikas Dhoot. Edited excerpts:\nSudha Narayanan: Sometimes we say, Be careful what you wish for, you may actually get it. These Bills represent fairly important changes in marketing regulation and are what many were asking for because of the flaws in the APMC [Agricultural Produce Market Committee] system. But at the same time, they are worrying for two reasons. The first is in 

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Pranavi's
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [6]:
#processing the data for tokenization and lemmatization

processed_docs = []
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos='v'))

for doc in docs_text:
    processed_docs.append(preprocess(doc))
processed_docs

[['farm',
  'bill',
  'essenti',
  'commod',
  'amend',
  'farmer',
  'produc',
  'trade',
  'commerc',
  'promot',
  'facilit',
  'common',
  'refer',
  'apmc',
  'bypass',
  'farmer',
  'empower',
  'protect',
  'agreement',
  'price',
  'assur',
  'farm',
  'servic',
  'clear',
  'parliament',
  'amid',
  'protest',
  'opposit',
  'sudha',
  'narayanan',
  'associ',
  'professor',
  'indira',
  'gandhi',
  'institut',
  'develop',
  'research',
  'arindam',
  'banerje',
  'associ',
  'professor',
  'school',
  'liber',
  'studi',
  'ambedkar',
  'univers',
  'discuss',
  'implic',
  'controversi',
  'bill',
  'convers',
  'moder',
  'vika',
  'dhoot',
  'edit',
  'excerpt',
  'sudha',
  'narayanan',
  'care',
  'wish',
  'actual',
  'bill',
  'repres',
  'fair',
  'import',
  'chang',
  'market',
  'regul',
  'ask',
  'flaw',
  'apmc',
  'agricultur',
  'produc',
  'market',
  'committe',
  'time',
  'worri',
  'reason',
  'bill',
  'lacuna',
  'lack',
  'regul',
  'regulatori',
  '

In [7]:
#bag of words on the data set
dictionary = gensim.corpora.Dictionary(processed_docs)

In [8]:
print (dictionary)

Dictionary(640 unique tokens: ['abl', 'actual', 'address', 'advantag', 'aggreg']...)


In [9]:
#Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
#words and how many times those words appear. Save this to 'bow_corpus'
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [10]:
#displaying the corpus saved
bow_corpus

[[(0, 1),
  (1, 9),
  (2, 1),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 7),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 15),
  (14, 2),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 2),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 2),
  (29, 1),
  (30, 2),
  (31, 1),
  (32, 12),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 2),
  (38, 4),
  (39, 1),
  (40, 2),
  (41, 1),
  (42, 2),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 3),
  (52, 2),
  (53, 2),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 5),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 7),
  (65, 1),
  (66, 2),
  (67, 1),
  (68, 1),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 2),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 3),
  (79, 3),
  (80, 2),
  (81, 1),
  (82, 1),
  (83, 2),
  (84, 2),
  (85, 1),
  (86, 2),
  (87, 2),
  (88, 1),
  (89, 1),
  (90, 2),
  (91, 

In [11]:
#creating the model for topic modelling
lda_model =  gensim.models.ldamodel.LdaModel(bow_corpus, num_topics =20,id2word = dictionary,passes = 4,random_state=100,
                                           update_every=1,)

In [12]:
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.010*"farmer" + 0.007*"price" + 0.006*"apmc" + 0.006*"player" + 0.005*"market" + 0.005*"bill" + 0.004*"procur" + 0.004*"actual" + 0.004*"go" + 0.004*"govern"


Topic: 1 
Words: 0.008*"python" + 0.006*"metadata" + 0.006*"output" + 0.006*"cell_typ" + 0.006*"text" + 0.006*"html" + 0.006*"site" + 0.006*"datafram" + 0.005*"request" + 0.005*"roam"


Topic: 2 
Words: 0.004*"python" + 0.004*"metadata" + 0.004*"html" + 0.004*"text" + 0.003*"request" + 0.003*"execution_count" + 0.003*"datafram" + 0.003*"farmer" + 0.003*"roam" + 0.003*"cell_typ"


Topic: 3 
Words: 0.005*"farmer" + 0.004*"bill" + 0.004*"price" + 0.004*"player" + 0.004*"apmc" + 0.003*"go" + 0.003*"privat" + 0.003*"actual" + 0.003*"python" + 0.003*"govern"


Topic: 4 
Words: 0.031*"python" + 0.020*"metadata" + 0.019*"request" + 0.018*"html" + 0.017*"execution_count" + 0.016*"text" + 0.015*"datafram" + 0.015*"site" + 0.015*"packag" + 0.014*"output"


Topic: 5 
Words: 0.006*"python" + 0.005*"html" + 0.005*"execution_

In [13]:
#extracting test set frpm the file folder
test_set=[]  #store text of all documents
for files in glob.glob('*'): #all files in the present directory
    if(files.startswith('Test')):
        handle = open(files,'r',encoding="utf-8",errors='ignore')
        test_set.append(handle.read())  
        handle.close()

In [14]:
test_set

["There has been an uproar after the Parliament passed three agriculture related bills.\n\nThe three bills that were passed are the Farmers' Produce Trade and Commerce (Promotion and Facilitation) Bill, Farmers (Empowerment and Protection) Agreement on Price Assurance and Farm Services Bill and Essential Commodities (Amendment) Bill.\n\nThe government said that the bills would transform the agriculture sector. It would also raise the farmers' income, the Centre said. Further the government had also promised double farmers' income by 2022 and the Centre said that the Bills will make the farmer independent of government controlled markets and fetch them a better price for their produce.\n\nThe Bills propose to create a system in which the farmers and traders can sell their purchase outside the Mandis. Further it also encourage intra-state trade and this proposes to reduce the cost of transportation.\n\nFurther the Bill formulates a framework on the agreements that enable farmers to engag

In [15]:
#processing for the test_set
processed_test=[]
for doc in test_set:
    processed_test.append(preprocess(doc))
processed_test

[['uproar',
  'parliament',
  'pass',
  'agricultur',
  'relat',
  'bill',
  'bill',
  'pass',
  'farmer',
  'produc',
  'trade',
  'commerc',
  'promot',
  'facilit',
  'farmer',
  'empower',
  'protect',
  'agreement',
  'price',
  'assur',
  'farm',
  'servic',
  'essenti',
  'commod',
  'amend',
  'govern',
  'say',
  'bill',
  'transform',
  'agricultur',
  'sector',
  'rais',
  'farmer',
  'incom',
  'centr',
  'say',
  'govern',
  'promis',
  'doubl',
  'farmer',
  'incom',
  'centr',
  'say',
  'bill',
  'farmer',
  'independ',
  'govern',
  'control',
  'market',
  'fetch',
  'better',
  'price',
  'produc',
  'bill',
  'propos',
  'creat',
  'farmer',
  'trader',
  'sell',
  'purchas',
  'outsid',
  'mandi',
  'encourag',
  'intra',
  'state',
  'trade',
  'propos',
  'reduc',
  'cost',
  'transport',
  'formul',
  'framework',
  'agreement',
  'enabl',
  'farmer',
  'engag',
  'agri',
  'busi',
  'compani',
  'retail',
  'export',
  'servic',
  'sale',
  'produc',
  'give',


In [16]:
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.010*"farmer" + 0.007*"price" + 0.006*"apmc" + 0.006*"player" + 0.005*"market" + 0.005*"bill" + 0.004*"procur" + 0.004*"actual" + 0.004*"go" + 0.004*"govern"


Topic: 1 
Words: 0.008*"python" + 0.006*"metadata" + 0.006*"output" + 0.006*"cell_typ" + 0.006*"text" + 0.006*"html" + 0.006*"site" + 0.006*"datafram" + 0.005*"request" + 0.005*"roam"


Topic: 2 
Words: 0.004*"python" + 0.004*"metadata" + 0.004*"html" + 0.004*"text" + 0.003*"request" + 0.003*"execution_count" + 0.003*"datafram" + 0.003*"farmer" + 0.003*"roam" + 0.003*"cell_typ"


Topic: 3 
Words: 0.005*"farmer" + 0.004*"bill" + 0.004*"price" + 0.004*"player" + 0.004*"apmc" + 0.003*"go" + 0.003*"privat" + 0.003*"actual" + 0.003*"python" + 0.003*"govern"


Topic: 4 
Words: 0.031*"python" + 0.020*"metadata" + 0.019*"request" + 0.018*"html" + 0.017*"execution_count" + 0.016*"text" + 0.015*"datafram" + 0.015*"site" + 0.015*"packag" + 0.014*"output"


Topic: 5 
Words: 0.006*"python" + 0.005*"html" + 0.005*"execution_