In [None]:
import nltk

In [None]:
nltk.download('genesis')

[nltk_data] Downloading package genesis to /root/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.


True

In [None]:
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [None]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

# Collocations

Overview

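Collocations are expressions of multiple words which commonly co-occur. The NLTK how-to this section is adapted from illustrates them with the top ten bigram collocations in Genesis, as measured using Pointwise Mutual Information; in this notebook the same finders are applied to the parliamentary speeches loaded below and ranked by raw frequency.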

In [None]:
from nltk.collocations import *

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [None]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [None]:
fourgram_measures = nltk.collocations.QuadgramAssocMeasures()

In [None]:
# Downloads the data.
import nltk
nltk.download('stopwords')


# Using the stopwords.
from nltk.corpus import stopwords

# Initialize the stopwords
stoplist = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Finders

The collocations package provides collocation finders which by default consider all ngrams in a text as candidate collocations. The bigram and trigram finders are applied to the speech data in the next section.



### DATA

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("database.csv")

In [None]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,Speaker,Speech,Date,Type,Gender,Party,Category
0,0,SHRI GAURAV GOGOI,This is a country characterised by devotion. T...,"Saturday, February 10, 2024",Union Budget,M,Congress,Issue
1,1,SHRI MADDILA GURUMOORTHY,I would like to speak about Lord Rama and the...,"Saturday, February 10, 2024",Union Budget,M,YSR Congress,Call For Action
2,2,SHRI RAMPRIT MANDAL,I would like to express my gratitude to you f...,"Saturday, February 10, 2024",Union Budget,M,JDU,Call For Action
3,3,SHRI PRATAP CHANDRA SARANGI,Lord Ram is a symbol of dignity. Lord Shri Ra...,"Saturday, February 10, 2024",Union Budget,M,BJP,Appreciate
4,4,SHRI MALOOK NAGAR,Unprecedented work has been done during the 1...,"Saturday, February 10, 2024",Union Budget,M,BSP,Appreciate
5,5,SHRI HANS RAJ HANS,We are fortunate that we were born in India an...,"Saturday, February 10, 2024",Union Budget,M,BJP,Neutral
6,6,SHRI ARVIND SAWANT,I am very happy that Lord Rama is being discu...,"Saturday, February 10, 2024",Union Budget,M,Shiv Sena,Issue
7,7,SHRI SUNIL KUMAR PINTU,We were elected to the 17th Lok Sabha under t...,"Saturday, February 10, 2024",Union Budget,M,JDU,Call For Action
8,8,SHRI RAHUL RAMESH SHEWALE,"Before being elected to this Lok Sabha, we co...","Saturday, February 10, 2024",Union Budget,M,Shiv Sena,Appreciate
9,9,SHRI RAM MOHAN NAIDU KINJARAPU,I would like to congratulate our 140 crore In...,"Saturday, February 10, 2024",Union Budget,M,TDP,Neutral


In [None]:
df.Category.unique()

array(['Issue', 'Call For Action', 'Appreciate', 'Neutral', 'Blame'],
      dtype=object)

In [None]:
df['Category'].value_counts()

Appreciate         343
Call For Action    326
Issue              243
Blame              155
Neutral            121
Name: Category, dtype: int64

In [None]:
# Join every speech of a category into one space-separated string; after
# reset_index() the result has one row per category with columns
# 'Category' and 'Speech'.
combined_speeches = df.groupby('Category')['Speech'].apply(' '.join).reset_index()

In [None]:
combined_speeches

Unnamed: 0,Category,Speech
0,Appreciate,Lord Ram is a symbol of dignity. Lord Shri Ra...
1,Blame,I stand against this Motion. I wonder how Lok...
2,Call For Action,I would like to speak about Lord Rama and the...
3,Issue,This is a country characterised by devotion. T...
4,Neutral,We are fortunate that we were born in India an...


In [None]:
combined_speeches['Speech'][0]

'Lord Ram is a symbol of dignity.  Lord Shri Ram is the ideal of supreme character all over the world. He is also a  symbol and ideal of social harmony and national unity. Lord Rama incarnated to  organize society. He gave respect to the forest dwellers belonging to the neglected  section of the society who joined his army and he displayed social harmony through  his conduct and awakened the consciousness of love for the people. Some anti-social  elements are trying to destroy the Sanatan values and create confusion. The king  worships the people in the rule of the Hon. Prime Minister. The poor, mothers, youth  and common people are worshipped here. Lord Shri Ramchandra is the epitome of  our faith. He is the identity, pride and self-respect of this nation. On the basis of all  the evidence, the Shri Ram temple has been constructed as per the decision of the  Hon’ble Court. This temple is our National temple and Lord Shri Ram is a symbol  of virtues. Matri Bhakti, Pitri Bhakti, Guru Bh

In [None]:
# For each category, list the bigram and trigram collocations of its combined
# speeches, scored by raw frequency. Iterating over the frame itself (instead
# of range(0, 5)) keeps the loop correct if the number of categories changes.
for category, text in zip(combined_speeches['Category'], combined_speeches['Speech']):
  print(category)

  tokens = nltk.wordpunct_tokenize(text)

  # Bigrams: score every candidate pair, then print the pairs alphabetically.
  bigram_finder = BigramCollocationFinder.from_words(tokens)
  scored = bigram_finder.score_ngrams(bigram_measures.raw_freq)
  print("Bigram")
  print(sorted(bigram for bigram, score in scored))

  # Trigrams: built once, with window_size=4 so the three words may be drawn
  # from any span of four consecutive tokens. (The original also built a
  # default-window finder first and immediately discarded it.)
  trigram_finder = TrigramCollocationFinder.from_words(tokens, window_size=4)
  print("Trigram")
  print(sorted(trigram_finder.nbest(trigram_measures.raw_freq, 4)))
  print()

Output hidden; open in https://colab.research.google.com to view.

### Using contingency table values

While frequency counts make marginals readily available for collocation finding, it is common to find published contingency table values. The collocations package therefore provides a wrapper, ContingencyMeasures, which wraps an association measures class and provides association measures that take contingency values as arguments: (n_ii, n_io, n_oi, n_oo) in the bigram case.

In [None]:
from nltk.metrics import ContingencyMeasures
# Wrap the bigram measures so they can be called with contingency-table
# values instead of frequency counts, then evaluate two of them.
cont_bigram_measures = ContingencyMeasures(bigram_measures)
likelihood_ratio_score = cont_bigram_measures.likelihood_ratio(8, 5, 24, 31740)
chi_squared_score = cont_bigram_measures.chi_sq(8, 15820, 4667, 14287173)
for measure_value in (likelihood_ratio_score, chi_squared_score):
    print(f"{measure_value:0.2f}")

95.29
1.55


# Category Wise Bigrams & Trigrams using TFIDF

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Importing libraries
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

# Input the file
# txt1 = ["Non-basmati rice is one of our important agricultural exports. India had exported 3.7 million tonnes of non-basmati rice in the year 2006-07 and 5.28 million tonnes in 2007-08. The Government does review the availability of food grains within the country on a continuing basis. There was a general rise in food grain prices in the latter half of the year 2007. On 15th October, 2007, the Government imposed a ban on the export of non-basmati rice. This decision was occasioned by the general rise in food grain prices and an overall assessment of the domestic supply position. The global prices of all types of rice continued to rise steeply over the next few months peaking in April-May 2008. Prices of rice nearly doubled in the international market in a space of less than six months. Escalating food prices created situations of extreme distress in many parts of the world especially in the Least Developed Countries (LDCs) and some poor African countries. The Governments of these African countries approached the Government of India with a request to sell them limited quantities of non-basmati rice. And that was, I may add here, a hundred per cent broken rice. Keeping in view our close international ties with these countries and the delicate supply position they were facing, it was decided on diplomatic considerations to authorize the export of limited quantities of non-basmati rice in relaxation of the ban. This was a Government decision. In pursuance of this decision, the Director-General of Foreign Trade (DGFT) had issued a notification on 24th January 2008 authorising the export of non-basmati rice of 50,000 Metric Tonnes (MT) to Madagascar through the State Trading Corporation of India (STC) and 25,000 MT to Comoros and 9,000 Metric tonnes to Mauritius through the MMTC Limited."]
# txt1 = [text]

# Preprocessing
def remove_string_special_characters(s):
    """Reduce a string to lowercase ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace is deleted
    (not replaced by a space), runs of whitespace are collapsed to one
    space, and leading/trailing whitespace is trimmed.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str or None
        The cleaned, lowercased text, or None when nothing remains after
        cleaning.
    """
    # The character class must be A-Z, not A-z: the range A-z also spans the
    # punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would survive.
    # With the correct range the underscore is covered too, so no separate
    # pass is needed for it.
    stripped = re.sub(r'[^a-zA-Z\s]', '', s)

    # Collapse any run of whitespace to one space, then trim both ends.
    stripped = re.sub(r'\s+', ' ', stripped).strip()

    if stripped == '':
        return None
    return stripped.lower()

# Rank the trigrams of each category's combined speeches.

# Loop-invariant setup: words to drop before vectorizing.
stop_words = set(stopwords.words('english'))
your_list = ['skills', 'ability', 'job', 'description']
excluded_words = stop_words.union(your_list)

for category, text in zip(combined_speeches['Category'], combined_speeches['Speech']):
  print(category.upper())

  # Stopword removal. Tokens are compared in lowercase: the stopword list is
  # lowercase, so a case-sensitive test lets capitalised stopwords such as
  # "The" through, and the vectorizers then lowercase them into the n-grams.
  kept_tokens = [token for token in nltk.word_tokenize(text)
                 if token.lower() not in excluded_words]
  txt1 = [' '.join(kept_tokens)]

  # Getting trigrams
  vectorizer = CountVectorizer(ngram_range = (3,3))
  X1 = vectorizer.fit_transform(txt1)
  features = (vectorizer.get_feature_names_out())
  print("\n\nFeatures : \n", features)
  print("\nX1 : \n", X1.toarray())

  # Applying TFIDF
  # NOTE: the vectorizer is fitted on a single document (this category only),
  # so the IDF factor is the same for every term and the score is just the
  # L2-normalised trigram count. Fit on all categories together if genuinely
  # category-distinctive terms are wanted.
  tfidf_vectorizer = TfidfVectorizer(ngram_range = (3,3))
  X2 = tfidf_vectorizer.fit_transform(txt1)
  scores = (X2.toarray())
  print("\nScores : \n", scores)

  # Getting top ranking features. Term names are taken from the same
  # vectorizer that produced the scores so columns and labels cannot drift.
  tfidf_terms = tfidf_vectorizer.get_feature_names_out()
  sums = X2.sum(axis = 0)
  data1 = [(term, sums[0, col]) for col, term in enumerate(tfidf_terms)]
  ranking = pd.DataFrame(data1, columns = ['term','rank'])
  words = (ranking.sort_values('rank', ascending = False))
  print ("\nWords head : \n", words.head(10))
  print()
  print()

APPRECIATE


Features : 
 ['00 00 00' '00 00 000' '00 000 crore' ... 'ziya panganak chebji'
 'zone set purvanchal' 'zone tax free']

X1 : 
 [[2 1 2 ... 1 1 1]]

Scores : 
 [[0.00718273 0.00359136 0.00718273 ... 0.00359136 0.00359136 0.00359136]]

Words head : 
                          term      rank
17754      hon prime minister  0.204708
5324       ble prime minister  0.122106
17690           hon ble prime  0.122106
20907      kisan samman nidhi  0.107741
9047   country the government  0.096967
31754           rs lakh crore  0.093375
40927        would like thank  0.082601
32036       sabka saath sabka  0.082601
32006       saath sabka vikas  0.079010
36231     the government also  0.075419


BLAME


Features : 
 ['000 atrocities schedule' '000 but english' '000 crore approved' ...
 'zone kannur calicut' 'zone uttar andhra' 'zones the capacity']

X1 : 
 [[1 1 1 ... 1 1 1]]

Scores : 
 [[0.00687793 0.00687793 0.00687793 ... 0.00687793 0.00687793 0.00687793]]

Words head : 
           

In [None]:
# Rank the bigrams of each category's combined speeches.

# Loop-invariant setup: words to drop before vectorizing.
stop_words = set(stopwords.words('english'))
your_list = ['skills', 'ability', 'job', 'description']
excluded_words = stop_words.union(your_list)

for category, text in zip(combined_speeches['Category'], combined_speeches['Speech']):
  print(category)

  # Stopword removal. Tokens are compared in lowercase: the stopword list is
  # lowercase, so a case-sensitive test lets capitalised stopwords such as
  # "The" through, and the vectorizers then lowercase them into the n-grams.
  kept_tokens = [token for token in nltk.word_tokenize(text)
                 if token.lower() not in excluded_words]
  txt1 = [' '.join(kept_tokens)]

  # Getting bigrams
  vectorizer = CountVectorizer(ngram_range =(2, 2))
  X1 = vectorizer.fit_transform(txt1)
  print("\n\nX1 : \n", X1.toarray())

  # Applying TFIDF
  # NOTE: the vectorizer is fitted on a single document (this category only),
  # so the IDF factor is the same for every term and the score is just the
  # L2-normalised bigram count. Fit on all categories together if genuinely
  # category-distinctive terms are wanted.
  tfidf_vectorizer = TfidfVectorizer(ngram_range = (2, 2))
  X2 = tfidf_vectorizer.fit_transform(txt1)
  scores = (X2.toarray())
  print("\n\nScores : \n", scores)

  # Getting top ranking features. Term names are taken from the same
  # vectorizer that produced the scores so columns and labels cannot drift.
  tfidf_terms = tfidf_vectorizer.get_feature_names_out()
  sums = X2.sum(axis = 0)
  data1 = [(term, sums[0, col]) for col, term in enumerate(tfidf_terms)]
  ranking = pd.DataFrame(data1, columns = ['term', 'rank'])
  words = (ranking.sort_values('rank', ascending = False))
  print ("\n\nWords : \n", words.head(10))

Appreciate


X1 : 
 [[3 6 1 ... 1 1 1]]


Scores : 
 [[0.00503444 0.01006889 0.00167815 ... 0.00167815 0.00167815 0.00167815]]


Words : 
                  term      rank
28055  the government  0.505122
21932  pradhan mantri  0.265147
22169  prime minister  0.253400
21102        per cent  0.159424
31541      would like  0.145999
20301  our government  0.122505
16348      lakh crore  0.119148
13658         hon ble  0.104045
7610     crore rupees  0.099011
7146      country the  0.097333
Blame


X1 : 
 [[ 1  1 10 ...  1  1  1]]


Scores : 
 [[0.00426945 0.00426945 0.04269452 ... 0.00426945 0.00426945 0.00426945]]


Words : 
                      term      rank
13867      the government  0.426945
10373            per cent  0.345826
15548          would like  0.239089
14175           this bill  0.111006
2544   central government  0.106736
1179       andhra pradesh  0.102467
12349    scheduled tribes  0.102467
14193     this government  0.093928
7706        jammu kashmir  0.089658
13461    

### Year-Wise Bigrams & Trigrams (raw-frequency collocations)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Speaker,Speech,Date,Type,Gender,Party,Category
0,0,SHRI GAURAV GOGOI,This is a country characterised by devotion. T...,"Saturday, February 10, 2024",Union Budget,M,Congress,Issue
1,1,SHRI MADDILA GURUMOORTHY,I would like to speak about Lord Rama and the...,"Saturday, February 10, 2024",Union Budget,M,YSR Congress,Call For Action
2,2,SHRI RAMPRIT MANDAL,I would like to express my gratitude to you f...,"Saturday, February 10, 2024",Union Budget,M,JDU,Call For Action
3,3,SHRI PRATAP CHANDRA SARANGI,Lord Ram is a symbol of dignity. Lord Shri Ra...,"Saturday, February 10, 2024",Union Budget,M,BJP,Appreciate
4,4,SHRI MALOOK NAGAR,Unprecedented work has been done during the 1...,"Saturday, February 10, 2024",Union Budget,M,BSP,Appreciate


In [None]:
# Parse the textual 'Date' column once and store the datetimes back on df.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

# Derive the calendar year of every speech from the parsed dates.
df['Year'] = parsed_dates.dt.year

# For every (Year, Category) pair, join its speeches into one space-separated
# string; reset_index() turns the group keys back into ordinary columns.
speeches_by_year_and_category = df.groupby(by=['Year', 'Category'])['Speech']
combined_speeches_yearly = speeches_by_year_and_category.apply(' '.join).reset_index()

In [None]:
combined_speeches_yearly

Unnamed: 0,Year,Category,Speech
0,2019,Appreciate,Chit fund was envisaged for the help and supp...
1,2019,Blame,"At the outset, I would like to say that the e..."
2,2019,Call For Action,It has been observed that there was no specif...
3,2019,Issue,It has been a long standing problem to provid...
4,2019,Neutral,"We have more than 30,000 registered chit fund..."
5,2020,Call For Action,We are discussing the Major Port Autorities B...
6,2021,Appreciate,Our government has brought a serious reform t...
7,2021,Blame,This is a retrogressive amendment. This is ag...
8,2021,Call For Action,I oppose the introduction of this Bill. Variou...
9,2021,Issue,We had come after preparing for the Bill whic...


In [None]:
import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

# Example function to generate bigrams and trigrams for a given text
def generate_ngrams(text):
    """Extract the bigrams and trigrams of `text`, scored by raw frequency.

    The text is tokenized with nltk.wordpunct_tokenize. Bigrams come from a
    default BigramCollocationFinder; trigrams come from a
    TrigramCollocationFinder built with window_size=4.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    tuple[list, list]
        (bigrams, trigrams): lists of token tuples in the order produced by
        score_ngrams, with the scores themselves dropped.
    """
    tokens = nltk.wordpunct_tokenize(text)
    bigram_measures = BigramAssocMeasures()
    trigram_measures = TrigramAssocMeasures()

    # Bigrams
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigram_scored = bigram_finder.score_ngrams(bigram_measures.raw_freq)
    bigrams = [bigram for bigram, _score in bigram_scored]

    # Trigrams: build the finder once. The original first built a
    # default-window finder and immediately overwrote it with this one.
    trigram_finder = TrigramCollocationFinder.from_words(tokens, window_size=4)
    trigram_scored = trigram_finder.score_ngrams(trigram_measures.raw_freq)
    trigrams = [trigram for trigram, _score in trigram_scored]

    return bigrams, trigrams

# How many n-grams to print per group. Printing every n-gram of every group
# produces more output than the notebook can display.
TOP_N = 20

# Loop through each (category, year) group in combined_speeches_yearly
for (category_name, year), year_group in combined_speeches_yearly.groupby(['Category', 'Year']):
    speeches_for_category_and_year = year_group['Speech'].str.cat(sep=' ')

    # Generate bigrams and trigrams for speeches in this category and year
    bigrams, trigrams = generate_ngrams(speeches_for_category_and_year)

    # The lists keep the order returned by score_ngrams, so slicing the head
    # shows the highest-scoring (most frequent) n-grams.
    print("Category:", category_name)
    print("Year:", year)
    print(f"Bigrams (top {TOP_N} of {len(bigrams)}):", bigrams[:TOP_N])
    print(f"Trigrams (top {TOP_N} of {len(trigrams)}):", trigrams[:TOP_N])
    print()

Output hidden; open in https://colab.research.google.com to view.

## Visualization

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Function to generate word cloud from a list of phrases
def generate_word_cloud(phrases, title=None):
    """Draw a word cloud for a collection of phrases.

    Each phrase is joined with spaces and all phrases are concatenated into
    one text that is handed to WordCloud.generate.

    NOTE(review): because phrases are flattened into one text, WordCloud
    appears to size individual words rather than whole n-grams - confirm this
    is the intended visualisation.

    Parameters
    ----------
    phrases : iterable of sequences of str
        Phrases to plot, e.g. the bigram or trigram tuples returned by
        generate_ngrams.
    title : str, optional
        Title drawn above the cloud. Omitted when None (the default).

    Returns
    -------
    None
        The figure is shown as a side effect. Nothing is drawn when the
        phrases contain no text.
    """
    text = ' '.join(' '.join(phrase) for phrase in phrases)

    # WordCloud raises on input without any words; skip instead of aborting
    # the loop that calls this function.
    if not text.strip():
        print("No phrases to plot; skipping word cloud.")
        return None

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    if title is not None:
        ax.set_title(title)
    plt.show()
    # Release the figure; this function is called many times in a loop.
    plt.close(fig)
    return None

# How many bigrams to print as a preview before each pair of word clouds.
# The clouds themselves are still built from the full lists.
TOP_N = 20

# Loop through each (category, year) group in combined_speeches_yearly
for (category_name, year), year_group in combined_speeches_yearly.groupby(['Category', 'Year']):
    speeches_for_category_and_year = year_group['Speech'].str.cat(sep=' ')

    # Generate bigrams and trigrams for speeches in this category and year
    bigrams, trigrams = generate_ngrams(speeches_for_category_and_year)

    # Visualize bigrams using word cloud
    print("Category:", category_name)
    print("Year:", year)
    print(f"Bigrams (top {TOP_N} of {len(bigrams)}):", bigrams[:TOP_N])
    generate_word_cloud(bigrams)

    # Visualize trigrams using word cloud
    generate_word_cloud(trigrams)
    print()

Output hidden; open in https://colab.research.google.com to view.

## Similarity Analysis

In [None]:
# One row per (Year, Category) holding all of that group's speeches joined by
# spaces. This is the same expression used earlier to build
# combined_speeches_yearly.
combined_speeches_new = df.groupby(['Year', 'Category'])['Speech'].apply(' '.join).reset_index()

In [None]:
combined_speeches_new.head()

Unnamed: 0,Year,Category,Speech
0,2019,Appreciate,Chit fund was envisaged for the help and supp...
1,2019,Blame,"At the outset, I would like to say that the e..."
2,2019,Call For Action,It has been observed that there was no specif...
3,2019,Issue,It has been a long standing problem to provid...
4,2019,Neutral,"We have more than 30,000 registered chit fund..."


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example function to generate N-grams from text
def generate_ngrams(text, n=2):
    """Return the word n-grams of `text` as space-joined strings.

    The text is split with nltk.word_tokenize and every run of `n`
    consecutive tokens becomes one string. Texts with fewer than `n` tokens
    yield an empty list.

    NOTE(review): this redefines `generate_ngrams`. The definition earlier in
    the notebook takes only `text` and returns a (bigrams, trigrams) tuple, so
    cells relying on that version fail if re-run after this cell.
    """
    words = nltk.word_tokenize(text)
    window_starts = range(len(words) - n + 1)
    return [' '.join(words[start:start + n]) for start in window_starts]

# Example function to compute Cosine similarity between speeches
def compute_similarity(speeches):
    """Return the pairwise cosine-similarity matrix of `speeches`.

    Each speech is turned into a vector of n-gram counts by a CountVectorizer
    whose analyzer is generate_ngrams; cosine_similarity is then applied to
    the resulting count matrix.
    """
    ngram_counter = CountVectorizer(analyzer=generate_ngrams)
    ngram_counts = ngram_counter.fit_transform(speeches)
    return cosine_similarity(ngram_counts)

# Example usage:
# Compare the individual speeches of one category and year with each other.
category = 'Issue'
year = 2024

# Select from df, which has one row per speech. combined_speeches_new holds a
# single merged row per (Year, Category), so selecting from it always yields
# the trivial 1x1 matrix [[1.]].
matches_category_and_year = (df['Category'] == category) & (df['Year'] == year)
speeches_for_category_and_year = df.loc[matches_category_and_year, 'Speech']

if speeches_for_category_and_year.empty:
    # Nothing to vectorize; report it rather than letting the vectorizer fail.
    print(f"No speeches found for category {category!r} in {year}.")
else:
    # Compute similarity matrix for speeches in the given category and year
    similarity_matrix = compute_similarity(speeches_for_category_and_year)

    # Print similarity matrix
    print("Cosine Similarity Matrix:")
    print(similarity_matrix)

Cosine Similarity Matrix:
[[1.]]
