In [1]:
import csv
import string
import sys
from collections import Counter

import nltk
from nltk.corpus import stopwords

In [2]:
# Fetch every NLTK resource this notebook uses. Packages that are already
# present are reported as up-to-date rather than downloaded again.
nltk.download('punkt')                       # models for sent_tokenize / word_tokenize
nltk.download('averaged_perceptron_tagger')  # model behind nltk.pos_tag
nltk.download('maxent_ne_chunker')           # model behind nltk.ne_chunk
nltk.download('words')                       # word-list corpus used alongside the NE chunker
# The stopword corpus is read via stopwords.words('english') further down;
# without this download that call raises LookupError on a fresh machine.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/raychithra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/raychithra/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/raychithra/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/raychithra/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [7]:
# Paths are relative to the directory the notebook is executed from.
DATA_DIR = "../speeches"                 # folder holding the input CSV
RESULTS_DIR = DATA_DIR + "/results/NER"  # folder the NER result files are written to

### Declare Stopwords ###

In [8]:
# Extra stopwords, one entry per line in remove_words.txt.
# Blank lines are skipped so that the empty string (which a trailing newline
# would otherwise produce) never ends up in the stopword collections.
with open("remove_words.txt", "r") as f:
    MORE_STOPWORDS = [line.strip() for line in f if line.strip()]

# English stopword list shipped with NLTK.
STOPWORDS = stopwords.words('english')

# Union of the NLTK list and the extra entries, as a set for fast lookups.
WC_STOP = set(STOPWORDS) | set(MORE_STOPWORDS)

In [9]:
# Parallel lists: element i of each list comes from the same CSV row.
urls = []
dates = []
speeches = []

# The file is pipe-delimited; the first three fields of each row are stored
# as url, date and speech text respectively.
with open(DATA_DIR + "/mann_ki_baat.csv", "r", encoding="utf-8", newline="") as file:
    reader = csv.reader(file, delimiter="|")
    for row in reader:
        # csv.reader yields an empty list for a blank line (for example a
        # stray newline at the end of the file); indexing it would raise
        # IndexError, so such rows are skipped.
        if not row:
            continue
        urls.append(row[0])
        dates.append(row[1])
        speeches.append(row[2])

### Implement Named Entity Recognition ###

##### Names

In [9]:
# Write every PERSON entity found in the speeches to names.txt, one per line.
# Repeated mentions are kept: each occurrence produces its own line.
with open(RESULTS_DIR + "/names.txt", "w") as out_file:
    for speech in speeches:
        for sentence in nltk.sent_tokenize(speech):
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
            for node in nltk.ne_chunk(tagged_tokens):
                # Only labelled chunks are entities; items without a
                # .label attribute are skipped.
                if not hasattr(node, 'label') or node.label() != "PERSON":
                    continue
                entity = ' '.join(leaf[0] for leaf in node)
                out_file.write(entity + "\n")


##### Geopolitical entities (GPE)

In [14]:
# Write every GPE entity found in the speeches to GPE.txt, one per line.
# Repeated mentions are kept: each occurrence produces its own line.
with open(RESULTS_DIR + "/GPE.txt", "w") as out_file:
    for speech in speeches:
        for sentence in nltk.sent_tokenize(speech):
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
            for node in nltk.ne_chunk(tagged_tokens):
                # Only labelled chunks are entities; items without a
                # .label attribute are skipped.
                if not hasattr(node, 'label') or node.label() != "GPE":
                    continue
                entity = ' '.join(leaf[0] for leaf in node)
                out_file.write(entity + "\n")

##### Organizations

In [17]:
# Write every ORGANIZATION entity found in the speeches to org.txt, one per
# line. Repeated mentions are kept: each occurrence produces its own line.
with open(RESULTS_DIR + "/org.txt", "w") as out_file:
    for speech in speeches:
        for sentence in nltk.sent_tokenize(speech):
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
            for node in nltk.ne_chunk(tagged_tokens):
                # Only labelled chunks are entities; items without a
                # .label attribute are skipped.
                if not hasattr(node, 'label') or node.label() != "ORGANIZATION":
                    continue
                entity = ' '.join(leaf[0] for leaf in node)
                out_file.write(entity + "\n")

### Frequencies for nouns ###

##### Names

In [5]:
# Count how often each entity appears in names.txt (one entity per line).
with open(RESULTS_DIR + "/names.txt", "r") as f:
    # Every line in the file ends with "\n", so splitting on "\n" leaves a
    # trailing empty string; drop it so "" is not counted as an entity.
    names = [name for name in f.read().split("\n") if name]

# Counter is a dict subclass mapping entity -> number of occurrences.
name_freq = Counter(names)

# most_common() orders by descending count; entities with equal counts keep
# the order in which they were first seen.
ordered_name_freq = dict(name_freq.most_common())

In [42]:
# Persist the counts in their existing order as "<entity> : <count>" lines.
with open(RESULTS_DIR + "/name_freq.txt", "w") as out_file:
    out_file.writelines(
        f"{entity} : {count}\n" for entity, count in ordered_name_freq.items()
    )

##### Geopolitical entities (GPE)

In [15]:
# Count how often each entity appears in GPE.txt (one entity per line).
with open(RESULTS_DIR + "/GPE.txt", "r") as f:
    # Every line in the file ends with "\n", so splitting on "\n" leaves a
    # trailing empty string; drop it so "" is not counted as an entity.
    gpes = [gpe for gpe in f.read().split("\n") if gpe]

# Counter is a dict subclass mapping entity -> number of occurrences.
gpe_freq = Counter(gpes)

# most_common() orders by descending count; entities with equal counts keep
# the order in which they were first seen.
ordered_gpe_freq = dict(gpe_freq.most_common())

In [16]:
# Persist the counts in their existing order as "<entity> : <count>" lines.
with open(RESULTS_DIR + "/GPE_freq.txt", "w") as out_file:
    out_file.writelines(
        f"{entity} : {count}\n" for entity, count in ordered_gpe_freq.items()
    )

##### Organizations

In [18]:
# Count how often each entity appears in org.txt (one entity per line).
with open(RESULTS_DIR + "/org.txt", "r") as f:
    # Every line in the file ends with "\n", so splitting on "\n" leaves a
    # trailing empty string; drop it so "" is not counted as an entity.
    orgs = [org for org in f.read().split("\n") if org]

# Counter is a dict subclass mapping entity -> number of occurrences.
org_freq = Counter(orgs)

# most_common() orders by descending count; entities with equal counts keep
# the order in which they were first seen.
ordered_org_freq = dict(org_freq.most_common())

In [19]:
# Persist the counts in their existing order as "<entity> : <count>" lines.
with open(RESULTS_DIR + "/org_freq.txt", "w") as out_file:
    out_file.writelines(
        f"{entity} : {count}\n" for entity, count in ordered_org_freq.items()
    )

##### Modi index and India index

In [10]:
# Running totals of entity mentions; both start at zero.
modi_index = india_index = 0

In [11]:
# Reset the counters here so this cell is idempotent: without the reset,
# running it a second time would add to the totals from the previous run.
modi_index = 0
india_index = 0

for text in speeches:
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Only labelled chunks are entities; other items are skipped.
            if not hasattr(chunk, 'label'):
                continue
            noun = ' '.join(c[0] for c in chunk)
            # The whole entity text must match exactly; the entity label
            # itself (PERSON, GPE, ...) is not checked.
            if noun in ("Modi", "Narendra Modi"):
                modi_index += 1
            if noun in ("India", "Indian", "Bharat", "Hindustan"):
                india_index += 1

print("Modi_Index : " + str(modi_index))
print("India_Index : " + str(india_index))

Modi_Index : 196

India_Index : 1462


##### Overall frequencies

In [10]:
# Tally every labelled chunk in the speeches, whatever its entity label.
noun_freq = {}

for speech in speeches:
    for sentence in nltk.sent_tokenize(speech):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged_tokens):
            # Items without a .label attribute are not entities.
            if not hasattr(node, 'label'):
                continue
            noun = ' '.join(leaf[0] for leaf in node)
            noun_freq[noun] = noun_freq.get(noun, 0) + 1

# Highest count first; the sort is stable, so ties keep first-seen order.
ordered_noun_freq = dict(
    sorted(noun_freq.items(), key=lambda pair: pair[1], reverse=True)
)

# Persist as "<entity> : <count>" lines.
with open(RESULTS_DIR + "/noun_freq.txt", "w") as out_file:
    for entity, count in ordered_noun_freq.items():
        out_file.write(entity + " : " + str(count) + "\n")