# NLP 
In this assignment we will scrap different websites to extract qoutes and speech excerpts of an eminent personality on different topics and perform topic modelling.

### Web scraping

In [None]:
import requests
from bs4 import BeautifulSoup as bs

def scrape(url):
    html = requests.get(url).content
    soup = bs(html,'html.parser')
    return soup

url = "https://www.barandbench.com/columns/dr-ambedkar-1949-constituent-assembly-speech"
soup = scrape(url)
text = soup.find_all('em')
text = [t.string.replace('\xa0','') for t in text if len(t.text.split())>4]

url = "https://swarajyamag.com/books/five-excerpts-from-ambedkars-historic-grammar-of-anarchy-speech"
soup = scrape(url)
quotes = soup.find_all('blockquote')
text.extend([quote.text for quote in quotes])

url = "https://quotes.thefamouspeople.com/b-r-ambedkar-3657.php"
soup = scrape(url)
quotes= soup.find_all('p',{'class':'text_content'})
text.extend([quote.text.replace('\n','') for quote in quotes])

url = "https://inspire99.com/5-brilliant-quotes-and-thoughts-by-ambedkar-ambedkar-quotes-on-ambedkar-jayanti/"
soup = scrape(url)
quotes = soup.find_all('strong')
quotes = [quote.text for quote in quotes if len(quote.text.split())>3]
del quotes[0]
text.extend(quotes)

In [8]:
print(len(text))

text

73


['The task of the Drafting Committee would have been a very difficult one if this Constituent Assembly has been merely a motleycrowd, a tessellated pavement without cement, a black stone here and a white stone there is which each member or each group was a law unto itself. There would have been nothing but chaos…',
 'The proceedings of this Constituent Assembly would have been very dull if all members had yielded to the rule of party discipline. Party discipline, in all its rigidity, would have converted this Assembly into a gathering of yes men.',
 'They were Mr. Kamath, Dr. PS. Deshmukh, Mr. Sidhva, Prof. Saxena & Pandit Thakur, Das Bhargava alongwith they I must mention Prof. K.T Shah and Pandit Hirday Nath Kunzru. The points they raised were mostly ideological.” ',
 'That I was not prepared to accept their suggestions does not diminish the value of their suggestions nor lessen the service they have rendered to the Assembly in enlivening its proceedings. I am grateful to them. But f

In [10]:
import sys
sys.setrecursionlimit(10**6)
import pickle

with open('text.txt','wb') as f:
    pickle.dump(text,f)

In [11]:
with open('text.txt','rb') as f:
    text = pickle.load(f)
text[5] 

'…The Constitution can provide only the organs of State such as the Legislature, the Executive and the Judiciary. The factors on which the working of those organs of the State depends are the people and the political parties they will set up as their instruments to carry out their wishes and their politics.'

### Topic modelling
Its unsupervised learning clustering model

**Latent Direchlet Allocation (LDA)**

In [13]:
import pandas as pd
df = pd.DataFrame(text, columns=['text'])
df.head()

Unnamed: 0,text
0,The task of the Drafting Committee would have ...
1,The proceedings of this Constituent Assembly w...
2,"They were Mr. Kamath, Dr. PS. Deshmukh, Mr. Si..."
3,That I was not prepared to accept their sugges...
4,"…however good a Constitution may be, it is sur..."


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english') # max_df discard words that show in 90% of docs, min_df the word must show up at least in 2 docs, stopwords automatically removes stopword
# No train test split in unsupervised learning

In [17]:
dtm = cv.fit_transform(df['text'])
dtm

<73x182 sparse matrix of type '<class 'numpy.int64'>'
	with 550 stored elements in Compressed Sparse Row format>

In [69]:
# Perform LDA
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=2,random_state=42) # No of components is number of topics you expect

In [70]:
lda.fit(dtm)

LatentDirichletAllocation(n_components=2, random_state=42)

In [71]:
# Grab vocabulary of words
len(cv.get_feature_names()) # list of all words in doc
cv.get_feature_names()[5:10]
import random
cv.get_feature_names()[random.randint(0,182)] # get random words from document

'die'

In [72]:
# Grab topics
len(lda.components_) # Same as number of topics specified
type(lda.components_) # Numpy array of probabilities
lda.components_.shape # columns same as feature_names

(2, 182)

In [73]:
# Grab highest probability words per topic
for i in range(len(lda.components_)):
    print(f'Top 10 words for Topic #{i} :') 
    single_topic = lda.components_[i]
    single_topic.argsort() # sorts the values from lowest to highest returning a list of indices, gives the index we must look for highest probability
    # top 10 words with highest probability values is given by 
    print([cv.get_feature_names()[index] for index in single_topic.argsort()[-10:]])
    print('\n\n')

Top 10 words for Topic #0 :
['creed', 'political', 'place', 'parties', 'bad', 'good', 'society', 'country', 'people', 'constitution']



Top 10 words for Topic #1 :
['man', 'equality', 'fraternity', 'religion', 'great', 'india', 'nation', 'social', 'liberty', 'methods']





In [None]:
# Topic 0 is constitution and political science, topic 1 is Society

In [74]:
topic_results = lda.transform(dtm)
topic_results[5].round() # probabilities of document belonging to particular topic

array([1., 0.])

In [75]:
# Let's check if 1st doc is about constitution and political science
df['text'][5]

'…The Constitution can provide only the organs of State such as the Legislature, the Executive and the Judiciary. The factors on which the working of those organs of the State depends are the people and the political parties they will set up as their instruments to carry out their wishes and their politics.'

In [76]:
topic_results[5].argmax()

0

In [77]:
df['Topic']=topic_results.argmax(axis=1)
df

Unnamed: 0,text,Topic
0,The task of the Drafting Committee would have ...,0
1,The proceedings of this Constituent Assembly w...,0
2,"They were Mr. Kamath, Dr. PS. Deshmukh, Mr. Si...",1
3,That I was not prepared to accept their sugges...,0
4,"…however good a Constitution may be, it is sur...",0
...,...,...
68,"“I like the religion that teaches liberty, equ...",1
69,“Life should be great rather than long”,1
70,“Men are mortal. So are ideas. An idea needs p...,1
71,“Religion must mainly be a matter of principle...,1


In [87]:
df['Topic'] = df['Topic'].map({0:'Political Science',1:'Society'})

**Non Negative matrix factorisation**
Dimension reduction + clustering

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.90,min_df=2,stop_words='english')
dtm = tfidf.fit_transform(df['text'])
dtm # docs v/s number of words matrix

<73x182 sparse matrix of type '<class 'numpy.float64'>'
	with 550 stored elements in Compressed Sparse Row format>

In [81]:
from sklearn.decomposition import NMF
model = NMF(n_components=2,random_state=42)
model.fit(dtm)

NMF(n_components=2, random_state=42)

In [84]:
for index,topic in enumerate(model.components_):
    print(f'The top 10 words for topic #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n\n')

The top 10 words for topic #0
['society', 'produce', 'supremacy', 'things', 'teaches', 'like', 'religion', 'equality', 'fraternity', 'liberty']



The top 10 words for topic #1
['assembly', 'long', 'country', 'independence', 'lost', 'nation', 'great', 'life', 'people', 'constitution']





In [None]:
# Topic 0 is society and topic 1 is constitution and political science

In [86]:
topic_results = model.transform(dtm)
topic_results[0] #coefficient values (not probabilities like lda)
topic_results[0].argmax()

1

In [88]:
df['Topic_nmf'] = topic_results.argmax(axis=1)

In [89]:
df['Topic_nmf'] = df['Topic_nmf'].map({1:'Political Science',0:'Society'})

In [90]:
df

Unnamed: 0,text,Topic,Topic_nmf
0,The task of the Drafting Committee would have ...,Political Science,Political Science
1,The proceedings of this Constituent Assembly w...,Political Science,Political Science
2,"They were Mr. Kamath, Dr. PS. Deshmukh, Mr. Si...",Society,Political Science
3,That I was not prepared to accept their sugges...,Political Science,Political Science
4,"…however good a Constitution may be, it is sur...",Political Science,Political Science
...,...,...,...
68,"“I like the religion that teaches liberty, equ...",Society,Society
69,“Life should be great rather than long”,Society,Political Science
70,“Men are mortal. So are ideas. An idea needs p...,Society,Political Science
71,“Religion must mainly be a matter of principle...,Society,Society


### Text Cleaning

In [91]:
text[0:3]

['The task of the Drafting Committee would have been a very difficult one if this Constituent Assembly has been merely a motleycrowd, a tessellated pavement without cement, a black stone here and a white stone there is which each member or each group was a law unto itself. There would have been nothing but chaos…',
 'The proceedings of this Constituent Assembly would have been very dull if all members had yielded to the rule of party discipline. Party discipline, in all its rigidity, would have converted this Assembly into a gathering of yes men.',
 'They were Mr. Kamath, Dr. PS. Deshmukh, Mr. Sidhva, Prof. Saxena & Pandit Thakur, Das Bhargava alongwith they I must mention Prof. K.T Shah and Pandit Hirday Nath Kunzru. The points they raised were mostly ideological.” ']

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
len(nlp.Defaults.stop_words)

nlp.vocab['letter'].is_stop # reports whether 'letter' is stopword or not
# to add word 'letter' as stopword 
nlp.Defaults.stop_words.add('letter')
nlp.vocab['letter'].is_stop = True
# to remove a stopword
nlp.Defaults.stop_words.remove('letter')
nlp.vocab['letter'].is_stop = False

In [117]:
t = text
# Removing punctuations
import re
t = re.sub('\W+',' ',''.join(t))
t
# Normalisation
t = [word.lower() for word in t.split()]
t
len(t)
# Stopwords removal
t = [word for word in t if word not in nlp.Defaults.stop_words]
t
# lemetisation
doc = nlp(str(' '.join(t)))
lemmas = [token.lemma_ for token in doc]
t = list(set(lemmas))
t

['3',
 'conduct',
 'kamath',
 'relentless',
 'worship',
 'place',
 'watch',
 'andfull',
 'avail',
 'realise',
 'reference',
 'critic',
 'enliven',
 'prophet',
 'majority',
 'mr',
 'people',
 'jaichand',
 'ascend',
 'reform',
 'mobile',
 'path',
 'constitution',
 'seek',
 'political',
 '1857',
 'believe',
 'day',
 'states',
 'service',
 'maker',
 'castesbut',
 'australia',
 'refrain',
 'resolutely',
 'administer',
 'seal',
 'try',
 'revolutionary',
 'method',
 'assure',
 'standard',
 'respect',
 'share',
 'fight',
 'independent',
 'solemnly',
 'degenerate',
 'associated',
 'king',
 'importance',
 'anxiety',
 'thorough',
 'discontent',
 'position',
 'think',
 'large',
 'proceeding',
 'ready',
 'daniel',
 'principles',
 'kind',
 'tessellate',
 'krishnamachari',
 'master',
 'commander',
 'miracle',
 'psychological',
 'coat',
 'bhakti',
 'reformer',
 'throw',
 'product',
 'peoplewill',
 'invasion',
 'lose',
 'agents',
 'liberating',
 'propagation',
 'creed',
 'creation',
 'cash',
 'seriousl

In [122]:
t = text
doc = nlp(str(''.join(t)))
# POS
# Filtering adjectives
adj = [token.text for token in doc if token.pos_=='ADJ']
set(adj)

{'British',
 'HinduA',
 'Indian',
 'Political',
 'absolute',
 'anxious',
 'bad',
 'basic',
 'better',
 'bitter',
 'black',
 'bloody',
 'certain',
 'civil',
 'closest',
 'compassionate',
 'constable',
 'constitutional',
 'courageous',
 'deeper',
 'different',
 'difficult',
 'dissatisfied',
 'diverse',
 'dull',
 'economic',
 'eminent',
 'enough',
 'eventual',
 'extraordinary',
 'facile',
 'fellow',
 'few',
 'first',
 'fit',
 'full',
 'fundamental',
 'futile',
 'general',
 'good',
 'grateful',
 'great',
 'human',
 'ideal',
 'ideological',
 'important',
 'independent',
 'just',
 'large',
 'last',
 'likely',
 'logical',
 'long',
 'many',
 'mechanical',
 'mere',
 'military',
 'minded',
 'mobile',
 'mortal',
 'much',
 'natural',
 'necessary',
 'non',
 'old',
 'only',
 'open',
 'opposing',
 'other',
 'own',
 'parliamentary',
 'politic',
 'political',
 'poor',
 'prepared',
 'principal',
 'private',
 'profound',
 'psychological',
 'ready',
 'relentless',
 'religious',
 'revolutionary',
 'rich',


# Thank you!