In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [3]:
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
df = pd.read_csv("database.csv")

In [6]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Speaker,Speech,Date,Type,Gender,Party,Category
0,0,SHRI GAURAV GOGOI,This is a country characterised by devotion. T...,"Saturday, February 10, 2024",Union Budget,M,Congress,Issue
1,1,SHRI MADDILA GURUMOORTHY,I would like to speak about Lord Rama and the...,"Saturday, February 10, 2024",Union Budget,M,YSR Congress,Call For Action
2,2,SHRI RAMPRIT MANDAL,I would like to express my gratitude to you f...,"Saturday, February 10, 2024",Union Budget,M,JDU,Call For Action
3,3,SHRI PRATAP CHANDRA SARANGI,Lord Ram is a symbol of dignity. Lord Shri Ra...,"Saturday, February 10, 2024",Union Budget,M,BJP,Appreciate
4,4,SHRI MALOOK NAGAR,Unprecedented work has been done during the 1...,"Saturday, February 10, 2024",Union Budget,M,BSP,Appreciate


In [7]:
df = df.drop('Unnamed: 0', axis=1)

In [8]:
df.head(5)

Unnamed: 0,Speaker,Speech,Date,Type,Gender,Party,Category
0,SHRI GAURAV GOGOI,This is a country characterised by devotion. T...,"Saturday, February 10, 2024",Union Budget,M,Congress,Issue
1,SHRI MADDILA GURUMOORTHY,I would like to speak about Lord Rama and the...,"Saturday, February 10, 2024",Union Budget,M,YSR Congress,Call For Action
2,SHRI RAMPRIT MANDAL,I would like to express my gratitude to you f...,"Saturday, February 10, 2024",Union Budget,M,JDU,Call For Action
3,SHRI PRATAP CHANDRA SARANGI,Lord Ram is a symbol of dignity. Lord Shri Ra...,"Saturday, February 10, 2024",Union Budget,M,BJP,Appreciate
4,SHRI MALOOK NAGAR,Unprecedented work has been done during the 1...,"Saturday, February 10, 2024",Union Budget,M,BSP,Appreciate


In [9]:
df.shape

(1188, 7)

In [10]:
def removeNumbers(data):
	return ''.join([i for i in data if not i.isdigit()])

def toLowerCase(data):
	return data.lower()

def removeWhiteSpace(data):
	return data.replace(" ","")

def removeNewLine(data):
	return data.replace("\n","")

def getUpperCharacters(data):
	return ''.join(c for c in data if c.isupper())

def removeSpecialCharacters(data):
	data = re.sub('[^a-zA-Z0-9 \n\.]', '', data)
	return data

In [11]:
df['Preprocessed_Speech'] = df['Speech'].apply(toLowerCase)
df['Preprocessed_Speech'] = df['Preprocessed_Speech'].apply(removeSpecialCharacters)
df['Preprocessed_Speech'] = df['Preprocessed_Speech'].apply(removeNewLine)
df['Preprocessed_Speech'] = df['Preprocessed_Speech'].apply(removeNumbers)

In [12]:
df.head(5)

Unnamed: 0,Speaker,Speech,Date,Type,Gender,Party,Category,Preprocessed_Speech
0,SHRI GAURAV GOGOI,This is a country characterised by devotion. T...,"Saturday, February 10, 2024",Union Budget,M,Congress,Issue,this is a country characterised by devotion. t...
1,SHRI MADDILA GURUMOORTHY,I would like to speak about Lord Rama and the...,"Saturday, February 10, 2024",Union Budget,M,YSR Congress,Call For Action,i would like to speak about lord rama and the...
2,SHRI RAMPRIT MANDAL,I would like to express my gratitude to you f...,"Saturday, February 10, 2024",Union Budget,M,JDU,Call For Action,i would like to express my gratitude to you f...
3,SHRI PRATAP CHANDRA SARANGI,Lord Ram is a symbol of dignity. Lord Shri Ra...,"Saturday, February 10, 2024",Union Budget,M,BJP,Appreciate,lord ram is a symbol of dignity. lord shri ra...
4,SHRI MALOOK NAGAR,Unprecedented work has been done during the 1...,"Saturday, February 10, 2024",Union Budget,M,BSP,Appreciate,unprecedented work has been done during the t...


In [13]:
df['Preprocessed_Speech'][1]

'i would like to speak about lord  rama and the importance of the eternal teachings of lord rama. as depicted in the  ramayana his reign termed as ram rajya symbolized good governance and  epitomised justice welfare and happiness for all. the leadership of lord rama was  dedicated to fairness compassion and the welfare of his people above all else. the  lord venkateshwara temple in my parliamentary constituency at tirupati is one of  the most famous temples in our country. anjanadri hill one of the seven hills is  supposed to be the birthplace of hanuman. another location lepakshi is referenced  in the ramaana which stands as a testament to the widespread reverence of lord  ramas legacy.  by developing these sites we will enhance indias cultural heritage  tourism as well as reinforce the bonds of unity and spirituality inspired by lord  rama. we request the government for comprehensive development and better  connectivity to all religious sites nationwide to further enrich our cultural

## NLP Pre-Processing for Model Training

In [14]:
from nltk import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

In [15]:
df['tokenized_text'] = df['Preprocessed_Speech'].apply(word_tokenize)

In [16]:
df.head()

Unnamed: 0,Speaker,Speech,Date,Type,Gender,Party,Category,Preprocessed_Speech,tokenized_text
0,SHRI GAURAV GOGOI,This is a country characterised by devotion. T...,"Saturday, February 10, 2024",Union Budget,M,Congress,Issue,this is a country characterised by devotion. t...,"[this, is, a, country, characterised, by, devo..."
1,SHRI MADDILA GURUMOORTHY,I would like to speak about Lord Rama and the...,"Saturday, February 10, 2024",Union Budget,M,YSR Congress,Call For Action,i would like to speak about lord rama and the...,"[i, would, like, to, speak, about, lord, rama,..."
2,SHRI RAMPRIT MANDAL,I would like to express my gratitude to you f...,"Saturday, February 10, 2024",Union Budget,M,JDU,Call For Action,i would like to express my gratitude to you f...,"[i, would, like, to, express, my, gratitude, t..."
3,SHRI PRATAP CHANDRA SARANGI,Lord Ram is a symbol of dignity. Lord Shri Ra...,"Saturday, February 10, 2024",Union Budget,M,BJP,Appreciate,lord ram is a symbol of dignity. lord shri ra...,"[lord, ram, is, a, symbol, of, dignity, ., lor..."
4,SHRI MALOOK NAGAR,Unprecedented work has been done during the 1...,"Saturday, February 10, 2024",Union Budget,M,BSP,Appreciate,unprecedented work has been done during the t...,"[unprecedented, work, has, been, done, during,..."


In [17]:
lemmatiser = WordNetLemmatizer()
df["lemmatised_speech"] = df["tokenized_text"].apply(lambda tokens: [lemmatiser.lemmatize(token, pos='v') for token in tokens])

In [18]:
df[['tokenized_text', 'lemmatised_speech']]

Unnamed: 0,tokenized_text,lemmatised_speech
0,"[this, is, a, country, characterised, by, devo...","[this, be, a, country, characterise, by, devot..."
1,"[i, would, like, to, speak, about, lord, rama,...","[i, would, like, to, speak, about, lord, rama,..."
2,"[i, would, like, to, express, my, gratitude, t...","[i, would, like, to, express, my, gratitude, t..."
3,"[lord, ram, is, a, symbol, of, dignity, ., lor...","[lord, ram, be, a, symbol, of, dignity, ., lor..."
4,"[unprecedented, work, has, been, done, during,...","[unprecedented, work, have, be, do, during, th..."
...,...,...
1183,"[the, present, bill, seeks, to, remove, discre...","[the, present, bill, seek, to, remove, discrep..."
1184,"[the, supreme, court, of, india, in, its, land...","[the, supreme, court, of, india, in, its, land..."
1185,"[the, issue, is, being, discussed, very, serio...","[the, issue, be, be, discuss, very, seriously,..."
1186,"[the, government, has, the, bounden, duty, to,...","[the, government, have, the, bounden, duty, to..."


In [19]:
stop_words = set(stopwords.words('english'))
stop_words.add('.')

In [20]:
# Pre-processed field contains the tokens upon which tokenisation, lemmatisation and removal of punctuation and stop-words is applied
df['Preprocessed_Speech'] = df['lemmatised_speech']

In [21]:
for i in range(len(df)):
    df['Preprocessed_Speech'][i] = [ele for ele in df['Preprocessed_Speech'][i] if ele not in (stop_words)]

In [22]:
df.loc[0, 'Preprocessed_Speech']

['country',
 'characterise',
 'devotion',
 'country',
 'country',
 'faith',
 'country',
 'bind',
 'diversity',
 'goodwill',
 'cherish',
 'society',
 'sense',
 'service',
 'lord',
 'rama',
 'belong',
 'everyone',
 'us',
 'time',
 'meet',
 'exchange',
 'greet',
 'say',
 'ramram',
 'north',
 'india',
 'western',
 'india',
 'mahatma',
 'gandhis',
 'definition',
 'ram',
 'rajya',
 'everyone',
 'happy',
 'one',
 'sad',
 'definition',
 'gandhi',
 'ji',
 'also',
 'say',
 'hindu',
 'dharma',
 'teach',
 'respect',
 'religions',
 'secret',
 'ram',
 'rajya',
 'inherent',
 'definition',
 'ponder',
 'whether',
 'backward',
 'deprive',
 'minorities',
 'happy',
 'definition',
 'ram',
 'rajya',
 'mahatma',
 'gandhi',
 'today',
 'see',
 'crimes',
 'schedule',
 'cast',
 'schedule',
 'tribes',
 'rise',
 'compare',
 'year',
 'go',
 'crime',
 'go',
 'around',
 'percent',
 'year',
 'ram',
 'rajya',
 'today',
 'backward',
 'class',
 'demand',
 'caste',
 'census',
 'see',
 'injustice',
 'meted',
 'caste',
 'di

In [23]:
df[['Preprocessed_Speech']]

Unnamed: 0,Preprocessed_Speech
0,"[country, characterise, devotion, country, cou..."
1,"[would, like, speak, lord, rama, importance, e..."
2,"[would, like, express, gratitude, give, opport..."
3,"[lord, ram, symbol, dignity, lord, shri, ram, ..."
4,"[unprecedented, work, th, lok, sabha, leadersh..."
...,...
1183,"[present, bill, seek, remove, discrepancies, a..."
1184,"[supreme, court, india, landmark, judgment, kn..."
1185,"[issue, discuss, seriously, politics, country,..."
1186,"[government, bounden, duty, explain, house, re..."


## Model Training Based on only Speech

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

In [25]:
X = df.Preprocessed_Speech.astype(str)
y = df.Category.astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [26]:
categories = ['Call For Action', 'Blame', 'Appreciate', 'Issue', 'Neutral']

In [27]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

In [110]:
%%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=categories))

accuracy 0.42857142857142855
                 precision    recall  f1-score   support

Call For Action       0.39      0.88      0.54        72
          Blame       0.00      0.00      0.00        27
     Appreciate       0.51      0.56      0.53        66
          Issue       0.67      0.04      0.07        56
        Neutral       0.00      0.00      0.00        17

       accuracy                           0.43       238
      macro avg       0.31      0.29      0.23       238
   weighted avg       0.42      0.43      0.33       238

CPU times: user 61.2 ms, sys: 760 µs, total: 62 ms
Wall time: 72.1 ms


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [111]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

In [112]:
%%time

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=categories))

accuracy 0.5462184873949579
                 precision    recall  f1-score   support

Call For Action       0.69      0.78      0.73        72
          Blame       0.31      0.30      0.30        27
     Appreciate       0.53      0.59      0.56        66
          Issue       0.42      0.34      0.38        56
        Neutral       0.67      0.47      0.55        17

       accuracy                           0.55       238
      macro avg       0.52      0.49      0.50       238
   weighted avg       0.54      0.55      0.54       238

CPU times: user 39.2 ms, sys: 6 µs, total: 39.2 ms
Wall time: 41.2 ms


In [113]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [114]:
%%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=categories))

accuracy 0.542016806722689
                 precision    recall  f1-score   support

Call For Action       0.72      0.72      0.72        72
          Blame       0.21      0.19      0.20        27
     Appreciate       0.54      0.62      0.58        66
          Issue       0.43      0.41      0.42        56
        Neutral       0.62      0.47      0.53        17

       accuracy                           0.54       238
      macro avg       0.50      0.48      0.49       238
   weighted avg       0.54      0.54      0.54       238

CPU times: user 45.8 ms, sys: 960 µs, total: 46.8 ms
Wall time: 46.9 ms


In [28]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

In [29]:
train_size = int(len(df) * .7)
train_speech = df['Preprocessed_Speech'][:train_size]
train_cat = df['Category'][:train_size]

test_speech = df['Preprocessed_Speech'][train_size:]
test_cat = df['Category'][train_size:]

In [30]:
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_speech) # only fit on train

In [31]:
x_train = tokenize.texts_to_matrix(train_speech)
x_test = tokenize.texts_to_matrix(test_speech)

In [32]:
encoder = LabelEncoder()
encoder.fit(train_cat)
y_train = encoder.transform(train_cat)
y_test = encoder.transform(test_cat)

In [33]:
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [36]:
batch_size = 32
epochs = 10

In [46]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [47]:
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [40]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score)

Test accuracy: [1.4489682912826538, 0.4761904776096344]


In [49]:
loss, accuracy, f1_score, precision, recall = model.evaluate(x_test, y_test, verbose=0)

In [50]:
print(accuracy)
print(precision)
print(recall)
print(f1_score)

0.46498599648475647
0.5393968224525452
0.3838541507720947
0.44541704654693604


In [43]:
# predict probabilities for test set
y_pred = model.predict(x_test, verbose=0)
confusion_matrix(y_test,y_pred)
# print(classification_report(y_test, y_pred))

ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

## Model Training with Speech & Party Affiliation Both

In [92]:
cvec = CountVectorizer()

In [124]:
X = df.Preprocessed_Speech.astype(str)
y = df.Category.astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [125]:
cvec = CountVectorizer(stop_words = 'english').fit(X_train)

In [126]:
df_train = pd.DataFrame(cvec.transform(X_train).todense(),
                           columns = cvec.get_feature_names_out())
df_test = pd.DataFrame(cvec.transform(X_test).todense(),
                           columns = cvec.get_feature_names_out())

In [127]:
print(df_train.shape)
print(y_train.shape)
print(df_test.shape)
print(y_test.shape)

(831, 8089)
(831,)
(357, 8089)
(357,)


In [128]:
lr = LogisticRegression()
lr.fit(df_train, y_train)
lr.score(df_test, y_test)

0.5182072829131653

In [129]:
X = df.Party.astype(str)
y = df.Category.astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [130]:
party_train = pd.DataFrame(cvec.transform(X_train).todense(),
                           columns = cvec.get_feature_names_out())
party_test = pd.DataFrame(cvec.transform(X_test).todense(),
                           columns = cvec.get_feature_names_out())

In [131]:
train = pd.concat([df_train, party_train], axis = 1)
test = pd.concat([df_test, party_test], axis = 1)

print(train.shape)
print(y_train.shape)
print(test.shape)
print(y_test.shape)

(831, 16178)
(831,)
(357, 16178)
(357,)


In [132]:
lr = LogisticRegression()
lr.fit(train, y_train)
lr.score(test, y_test)

0.5042016806722689

In [135]:
from sklearn import model_selection, naive_bayes, svm
Naive = naive_bayes.MultinomialNB()
Naive.fit(train,y_train)

predictions_NB = Naive.predict(test)
# Use accuracy_score function to get the accuracy

print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

Naive Bayes Accuracy Score ->  44.81792717086835
