In [18]:
import _pickle as cPickle
import os
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import csv
import sys
from nltk.tokenize import word_tokenize
import string
import re
from collections import Counter

In [19]:
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

csv.field_size_limit(sys.maxsize)

9223372036854775807

In [20]:
DATA_DIR = "../speeches"
RESULTS_DIR = "../speeches/results"
CSV_DIR = RESULTS_DIR + "/csvs"
PICKLES_DIR = RESULTS_DIR + "/pickles"
SPEECHES_DIR = DATA_DIR + "/transcripts_english"
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(CSV_DIR, exist_ok=True)
os.makedirs(PICKLES_DIR, exist_ok=True)
os.makedirs(SPEECHES_DIR, exist_ok=True)

## Declare Stopwords

In [21]:
with open("remove_words.txt", "r") as f:
    MORE_STOPWORDS = f.read().split("\n")

STOPWORDS = stopwords.words('english')
WC_STOP = set(list(STOPWORDS) + MORE_STOPWORDS)

## Read

In [5]:
urls = []
dates = []
speeches = []

with open(DATA_DIR + "/mann_ki_baat.csv", "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file, delimiter="|")
    for row in reader:
        urls.append(row[0])
        dates.append(row[1])
        speeches.append(row[2])

## Write each speech into txt file

In [12]:
i = 1

month_to_num = {}
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

for month in months:
    if i < 10:
        month_to_num[month] = "0" + str(i)
    else:
        month_to_num[month] = str(i)
    i += 1

In [15]:
with open(DATA_DIR + "/mann_ki_baat.csv", "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file, delimiter="|")
    for row in reader:
        date = row[1].split(',')[1].strip() + "_" + month_to_num[row[1].split(' ')[1].strip(',')] + "_" + row[1].split(' ')[0]
        with open(SPEECHES_DIR+f"/{date}.txt", "w") as f:
            f.write(row[2])

## Processing

In [22]:
def remove_nums(text):
    return re.sub("\d", "", text)

In [23]:
def lemmatize_row(row):
    content = remove_nums(row[2])
    lemmatized_content = ""
    for word in content:
        lemmatized_content += lemmatizer.lemmatize(word)
    row[2] =  lemmatized_content
    return row

In [24]:
def clean(data):
    # Convert to lower case
    data = data.lower()
    # Make a look-up for punctuations
    punctuation_table = str.maketrans("", "", string.punctuation)
    # remove punctuations, multiple consecutive spaces
    data = re.sub(" +", " ", data.translate(punctuation_table))
    # split into words
    words = word_tokenize(data)
    # remove stopwords
    cleaned_words = [word for word in words if word not in STOPWORDS]
    return " ".join(cleaned_words)

In [9]:
%%time
cleaned_speeches = [clean(speech) for speech in speeches]

CPU times: user 1.81 s, sys: 11 ms, total: 1.82 s
Wall time: 1.83 s


In [10]:
with open(CSV_DIR+"/cleaned_mann_ki_baat.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter="|", quoting=csv.QUOTE_MINIMAL)
    for i in range(len(urls)):
        print(urls[i])
        writer.writerow([urls[i], dates[i], cleaned_speeches[i]])

https://www.pmindia.gov.in/en/news_updates/english-rendering-of-transcript-of-the-special-episode-of-mann-ki-baat-pm-shri-narendra-modi-and-us-president-shri-barack-obama-share-their-thoughts-on-radio/
https://www.pmindia.gov.in/en/news_updates/english-rendering-of-the-text-of-pms-first-address-to-the-nation-on-radio/
https://www.pmindia.gov.in/en/news_updates/english-rendering-of-text-of-prime-ministers-mann-ki-baat-on-all-india-radio-on-26th-april-2015/
https://www.pmindia.gov.in/en/news_updates/english-rendering-of-text-of-prime-ministers-mann-ki-baat-on-all-india-radio-in-february-2015/
https://www.pmindia.gov.in/en/news_updates/english-rendering-of-pms-mann-ki-baat-address-on-all-india-radio/
https://www.pmindia.gov.in/en/news_updates/english-rendering-of-text-of-prime-ministers-mann-ki-baat-on-all-india-radio/
https://www.pmindia.gov.in/en/news_updates/english-rendering-text-of-pms-mann-ki-baat-on-all-india-radio-on-27-12-2015/
https://www.pmindia.gov.in/en/news_updates/english-r

## Make Pickle File for easier loading

In [26]:
new_csv = []
mann_ki_baat = {}

In [27]:
with open(CSV_DIR + "/cleaned_mann_ki_baat.csv", "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file, delimiter="|")
    for row in reader:
        lemmatized_row = lemmatize_row(row)
        new_csv.append(lemmatized_row)
        month = str(row[1].split(',')[0].split(' ')[1] + row[1].split(',')[1])
        if month not in mann_ki_baat:
            mann_ki_baat[month] = []
            mann_ki_baat[month].append({"url": row[0], "date": row[1], "speech": row[2]})
        else:
            mann_ki_baat[month].append({"url": row[0], "date": row[1], "speech": row[2]})


In [28]:
print(len(list(mann_ki_baat.keys())))
print(mann_ki_baat.keys())

84
dict_keys(['Jan 2015', 'Oct 2014', 'Apr 2015', 'Feb 2015', 'Mar 2015', 'Nov 2014', 'Dec 2015', 'Sep 2015', 'Nov 2015', 'Feb 2016', 'Jan 2016', 'Mar 2016', 'Oct 2015', 'May 2016', 'Jun 2016', 'Jul 2016', 'Aug 2016', 'Sep 2016', 'Oct 2016', 'Dec 2016', 'Jan 2017', 'Mar 2017', 'Apr 2017', 'Jun 2017', 'Jul 2017', 'Aug 2017', 'Sep 2017', 'Oct 2017', 'Dec 2017', 'Jan 2018', 'Feb 2018', 'Mar 2018', 'Apr 2018', 'May 2015', 'Dec 2014', 'Jul 2015', 'Jun 2018', 'Aug 2015', 'Oct 2018', 'Sep 2018', 'Dec 2018', 'Aug 2018', 'Feb 2019', 'Nov 2016', 'Jul 2019', 'Aug 2019', 'Sep 2019', 'Feb 2017', 'Oct 2019', 'Nov 2019', 'Dec 2019', 'Jan 2020', 'Feb 2020', 'Apr 2020', 'Mar 2020', 'May 2020', 'Jun 2020', 'Jul 2020', 'Aug 2020', 'Oct 2020', 'Sep 2020', 'May 2017', 'Nov 2020', 'Dec 2020', 'Jan 2021', 'Feb 2021', 'Mar 2021', 'Apr 2021', 'Jul 2021', 'May 2021', 'Jun 2021', 'Aug 2021', 'Oct 2021', 'Sep 2021', 'Dec 2021', 'Nov 2021', 'Jan 2022', 'Mar 2022', 'Feb 2022', 'May 2018', 'Jul 2018', 'Jan 2019', 'N

In [33]:
with open(CSV_DIR+"/lemmatized_mann_ki_baat.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter="|", quoting=csv.QUOTE_MINIMAL)
    for i in range(len(new_csv)):
        writer.writerow(new_csv[i])

In [34]:
with open(PICKLES_DIR+"/mann_ki_baat.pkl", "wb") as f:
    cPickle.dump(mann_ki_baat, f)

### Dividing data by Events

In [35]:
# Demonetisation ; starting date: 8 November 2016

demonetisation_collection = {}
demonetisation_months = ['Nov 2016', 'Dec 2016', 'Jan 2017', 'Feb 2017', 'Mar 2017']

for month in demonetisation_months:
    if month not in demonetisation_collection:
            demonetisation_collections = ''
            demonetisation_collections += mann_ki_baat[month][0]['speech']

demonetisation_collection['demonetisation'] = demonetisation_collections

with open(PICKLES_DIR+"/mann_ki_baat_demonetisation.pkl", "wb") as f:
    cPickle.dump(demonetisation_collection, f)

In [36]:
# caa_nrc ; starting date:

caa_nrc_collection = {}
caa_nrc_months = ['Dec 2019', 'Jan 2020', 'Feb 2020', 'Mar 2020', 'Apr 2020']

for month in caa_nrc_months:
    if month not in caa_nrc_collection:
            caa_nrc_collections = ''
            caa_nrc_collections += mann_ki_baat[month][0]['speech']

caa_nrc_collection['caa_nrc'] = caa_nrc_collections

with open(PICKLES_DIR+"/mann_ki_baat_caa_nrc.pkl", "wb") as f:
    cPickle.dump(caa_nrc_collection, f)

In [29]:
# farmers ; starting date: 

farmers_collection = {}
farmers_months = ['Aug 2020', 'Sep 2020', 'Oct 2020', 'Nov 2020', 'Dec 2020', 'Jan 2021', 'Feb 2021', 'Mar 2021', 'Apr 2021', 'May 2021', 'Jun 2021', 'Jul 2021', 'Aug 2021', 'Sep 2021', 'Oct 2021', 'Nov 2021']

for month in farmers_months:
    if month not in farmers_collection:
            farmers_collections = ''
            farmers_collections += mann_ki_baat[month][0]['speech']

farmers_collection['farmers'] = farmers_collections

with open(PICKLES_DIR+"/mann_ki_baat_farmers.pkl", "wb") as f:
    cPickle.dump(farmers_collection, f)

## Check frequency of words

In [30]:
RESULTS_DIR_2 = RESULTS_DIR + "/frequencies"
RESULTS_DIR_3 = RESULTS_DIR_2 + "/demonetisation"
RESULTS_DIR_4 = RESULTS_DIR_2 + "/caa_nrc"
RESULTS_DIR_5 = RESULTS_DIR_2 + "/farmers"

In [38]:
%%time
os.makedirs(RESULTS_DIR_2, exist_ok=True)
NUM_COUNTS = 500

with open(PICKLES_DIR + "/mann_ki_baat.pkl", "rb") as f:
    content = cPickle.load(f)

whole_word_freq = Counter()
for k in content.keys():
    word_freq = Counter()
    for word in content[k][0]['speech'].split(" "):
        if word not in WC_STOP:
            word_freq[word] += 1
            whole_word_freq[word] += 1
    common_words = word_freq.most_common(NUM_COUNTS)
    with open(RESULTS_DIR_2 + f"/{k}.txt", "w") as f:
        for word_count in common_words:
            f.write(f"{word_count[0]} : {word_count[1]}\n")

common_words = whole_word_freq.most_common(NUM_COUNTS)
with open(RESULTS_DIR_2 + "/All.txt", "w") as f:
    for word_count in common_words:
        f.write(f"{word_count[0]} : {word_count[1]}\n")

CPU times: user 158 ms, sys: 0 ns, total: 158 ms
Wall time: 157 ms


In [39]:
%%time
os.makedirs(RESULTS_DIR_3, exist_ok=True)
NUM_COUNTS = 500

with open(PICKLES_DIR + "/mann_ki_baat_demonetisation.pkl", "rb") as f:
    content = cPickle.load(f)

whole_word_freq = Counter()
for k in content.keys():
    word_freq = Counter()
    for word in content[k].split(" "):
        if word not in WC_STOP:
            word_freq[word] += 1
            whole_word_freq[word] += 1
    common_words = word_freq.most_common(NUM_COUNTS)
    with open(RESULTS_DIR_3 + f"/{k}.txt", "w") as f:
        for word_count in common_words:
            f.write(f"{word_count[0]} : {word_count[1]}\n")

common_words = whole_word_freq.most_common(NUM_COUNTS)

CPU times: user 5.57 ms, sys: 0 ns, total: 5.57 ms
Wall time: 5.07 ms


In [40]:
%%time
os.makedirs(RESULTS_DIR_4, exist_ok=True)
NUM_COUNTS = 500

with open(PICKLES_DIR + "/mann_ki_baat_caa_nrc.pkl", "rb") as f:
    content = cPickle.load(f)

whole_word_freq = Counter()
for k in content.keys():
    word_freq = Counter()
    for word in content[k].split(" "):
        if word not in WC_STOP:
            word_freq[word] += 1
            whole_word_freq[word] += 1
    common_words = word_freq.most_common(NUM_COUNTS)
    with open(RESULTS_DIR_4 + f"/{k}.txt", "w") as f:
        for word_count in common_words:
            f.write(f"{word_count[0]} : {word_count[1]}\n")

common_words = whole_word_freq.most_common(NUM_COUNTS)

CPU times: user 4.89 ms, sys: 0 ns, total: 4.89 ms
Wall time: 3.93 ms


In [41]:
%%time
os.makedirs(RESULTS_DIR_5, exist_ok=True)
NUM_COUNTS = 500

with open(PICKLES_DIR + "/mann_ki_baat_farmers.pkl", "rb") as f:
    content = cPickle.load(f)

whole_word_freq = Counter()
for k in content.keys():
    word_freq = Counter()
    for word in content[k].split(" "):
        if word not in WC_STOP:
            word_freq[word] += 1
            whole_word_freq[word] += 1
    common_words = word_freq.most_common(NUM_COUNTS)
    with open(RESULTS_DIR_5 + f"/{k}.txt", "w") as f:
        for word_count in common_words:
            f.write(f"{word_count[0]} : {word_count[1]}\n")

common_words = whole_word_freq.most_common(NUM_COUNTS)

CPU times: user 5.39 ms, sys: 0 ns, total: 5.39 ms
Wall time: 4.3 ms


## Wordclouds

In [None]:
with open(PICKLES_DIR + "/mann_ki_baat.pkl", "rb") as f:
    content = cPickle.load(f)

In [27]:
total_content = []
content_string = ''
for month in content.keys():
    total_content.append(content[k][0]['speech'])
    content_string += content[k][0]['speech']

In [28]:
for i,speech in enumerate(total_content):
    with open(RESULTS_DIR + "/full_speeches.txt", "a") as f:
        f.write("**********\n" + total_content[i] + "**********\n")

In [34]:
freq_words = WordCloud(stopwords=WC_STOP).process_text(content_string)
wordcloud = WordCloud(stopwords=WC_STOP, max_words=100).generate(content_string)
wdcldsvg = wordcloud.to_svg()
with open(RESULTS_DIR +"/wordclouds/ALL.svg", "w") as f:
    f.write(wdcldsvg)

In [43]:
for month, text in mann_ki_baat.items():
    wordcloud = WordCloud(stopwords=WC_STOP).generate(text[0]['speech'])
    wdcldsvg = wordcloud.to_svg()
    with open(RESULTS_DIR + f"/wordclouds/each_speech/{month}.svg", "w") as f:
        f.write(wdcldsvg)

In [32]:
stopwords_extra = ['people', 'india', 'world', 'dear', 'country']
WC_STOP = WC_STOP | set(stopwords_extra)

In [33]:
with open(PICKLES_DIR + "/mann_ki_baat_demonetisation.pkl", "rb") as f:
    content = cPickle.load(f)

content_string = ''
for month in content.keys():
    content_string += content[month]

freq_words = WordCloud(stopwords=WC_STOP).process_text(content_string)
wordcloud = WordCloud(stopwords=WC_STOP, max_words=100).generate(content_string)
wdcldsvg = wordcloud.to_svg()
with open(RESULTS_DIR +"/wordclouds/demonetisation.svg", "w") as f:
    f.write(wdcldsvg)

In [34]:
with open(PICKLES_DIR + "/mann_ki_baat_caa_nrc.pkl", "rb") as f:
    content = cPickle.load(f)

content_string = ''
for month in content.keys():
    content_string += content[month]

freq_words = WordCloud(stopwords=WC_STOP).process_text(content_string)
wordcloud = WordCloud(stopwords=WC_STOP, max_words=100).generate(content_string)
wdcldsvg = wordcloud.to_svg()
with open(RESULTS_DIR +"/wordclouds/caa_nrc.svg", "w") as f:
    f.write(wdcldsvg)

In [42]:
with open(PICKLES_DIR + "/mann_ki_baat_farmers.pkl", "rb") as f:
    content = cPickle.load(f)

content_string = ''
for month in content.keys():
    content_string += content[month]

freq_words = WordCloud(stopwords=WC_STOP).process_text(content_string)
wordcloud = WordCloud(stopwords=WC_STOP, max_words=100).generate(content_string)
wdcldsvg = wordcloud.to_svg()
with open(RESULTS_DIR +"/wordclouds/farmers.svg", "w") as f:
    f.write(wdcldsvg)