### Objective of the Project

* The goal of text summarization is to create a shorter version of a longer text while retaining its key information and main ideas. 
* This can be useful in many scenarios, such as when someone needs to quickly understand the main points of an article, or when large amounts of text need to be processed efficiently. 
* In the case of Hindi articles, summarization can help people who are not fluent in Hindi to quickly grasp the essence of the text. 
* Additionally, summarization can be helpful for people who are short on time and want to get an overview of a longer text without having to read the entire thing.

### Why this Application

* Time-Saving
* Efficient Processing
* Increases productivity

### Model Considered

1. TF-IDF
2. TEXT-RANK

### Importing the libraries

In [46]:
import nltk  
import math
import re
import operator
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import streamlit as st

### Preprocessing

In [2]:
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
Stopwords = set(stopwords.words('hindi'))
wordlemmatizer = WordNetLemmatizer()

In [3]:
def lemmatize_words(words):
    lemmatized_words = []
    for word in words:
        lemmatized_words.append(wordlemmatizer.lemmatize(word))
    return lemmatized_words

In [4]:
def remove_special_characters(text):
    symbol_list = r'[^a-zA-Z0-9\s]'
    text = re.sub(symbol_list,'',text)
    return text

In [5]:
def freq(words):
    words = [word.lower() for word in words]
    dict_freq = {}
    words_unique = []
    for word in words:
        if word not in words_unique:
            words_unique.append(word)
    for word in words_unique:
        dict_freq[word] = words.count(word)
    return dict_freq

In [6]:
from nltk.tag import tnt
from nltk.corpus import indian
train_data = indian.tagged_sents('hindi.pos')
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)

def pos_tagging(article):
    
    words = nltk.word_tokenize(article)
    pos_tagged_words = tnt_pos_tagger.tag(words)
    pos_tagged_noun_verb = []
    
    for word, tag in pos_tagged_words:
        if tag == "NN" or tag == 'NNP' or tag == 'VM' or tag == 'VAUX':
            pos_tagged_noun_verb.append(word)
    return pos_tagged_noun_verb

### TF-IDF

In [7]:
def tf_score(word,sentence):
    freq_sum = 0
    word_frequency_in_sentence = 0
    len_sentence = len(sentence)
    for word_in_sentence in sentence.split():
        if word == word_in_sentence:
            word_frequency_in_sentence = word_frequency_in_sentence + 1
    tf =  word_frequency_in_sentence/ len_sentence
    return tf

In [8]:
def idf_score(no_of_sentences,word,sentences):
    no_of_sentence_containing_word = 0
    for sentence in sentences:
        sentence = remove_special_characters(str(sentence))
        sentence = re.sub(r'\d+', '', sentence)
        sentence = sentence.split()
        sentence = [word for word in sentence if word.lower() not in Stopwords and len(word)>1]
        sentence = [word.lower() for word in sentence]
        sentence = [wordlemmatizer.lemmatize(word) for word in sentence]
        if word in sentence:
            no_of_sentence_containing_word = no_of_sentence_containing_word + 1
    idf = math.log10(no_of_sentences/no_of_sentence_containing_word)
    return idf

In [9]:
def word_tfidf(dict_freq,word,sentences,sentence):
    word_tfidf = []
    tf = tf_score(word,sentence)
    idf = idf_score(len(sentences),word,sentences)
    return tf*idf

In [10]:
def sentence_importance(sentence,dict_freq,sentences):
    sentence_score = 0
    sentence = remove_special_characters(str(sentence)) 
    sentence = re.sub(r'\d+', '', sentence)
    pos_tagged_sentence = [] 
    no_of_sentences = len(sentences)
    pos_tagged_sentence = pos_tagging(sentence)
    for word in pos_tagged_sentence:
        if word.lower() not in Stopwords and word not in Stopwords and len(word)>1: 
            word = word.lower()
            word = wordlemmatizer.lemmatize(word)
            sentence_score = sentence_score + word_tfidf(dict_freq,word,sentences,sentence)
    return sentence_score

### Implementation

In [45]:
uploaded_file = st.file_uploader("Upload a Hindi text file (.txt)", type="txt")

if uploaded_file is not None:
    file = open(uploaded_file, 'r',encoding='utf-8')
    text = file.read()
    tokenized_sentence = sent_tokenize(text)
    # text = remove_special_characters(str(text)
    text = re.sub(r'\d+', '', text) 

In [12]:
tokenized_words_with_stopwords = word_tokenize(text)
tokenized_words = [word for word in tokenized_words_with_stopwords if word not in Stopwords]
tokenized_words = [word for word in tokenized_words if len(word) > 1]
tokenized_words = [word.lower() for word in tokenized_words]
tokenized_words = lemmatize_words(tokenized_words)

In [27]:
word_freq = freq(tokenized_words)
print(word_freq)
input_user = int(input('Percentage of information to retain(in percent):'))
no_of_sentences = int((input_user * len(tokenized_sentence))/100)
print(no_of_sentences)

{'उत्तर': 2, 'प्रदेश': 2, 'उत्तराखंड': 4, 'विधानसभा': 3, 'चुनाव': 5, 'के': 15, 'बीजेपी': 10, 'सोमवार': 1, 'उम्मीदवारों': 7, 'ऐलान': 7, 'रविवार': 1, 'केंद्रीय': 1, 'समिति': 1, 'बैठक': 3, 'नामों': 3, 'मुहर': 1, 'लगाई': 1, 'सूत्रों': 1, 'अनुसार': 1, 'यूपी': 4, 'नाम': 2, 'खबर': 1, 'थोड़ी': 1, 'देरी': 1, 'अमित': 1, 'शाह': 1, 'आवास': 1, 'चल': 2, 'सीनियर': 1, 'नेता': 2, 'मौजूद': 1, 'मीडिया': 1, 'रिपोर्ट्स': 1, 'मुताबिक': 1, 'जनवरी': 2, 'चाह': 1, 'पार्टी': 3, 'प्रत्याशियों': 1, 'पीछे': 1, 'सपा': 1, 'बसपा': 1, 'एक-एक': 1, 'लिस्ट': 4, 'जारी': 3, 'तैयारी': 1, 'पंजाब': 6, 'गोवा': 5, 'मणिपुर': 2, 'सीटों': 7, 'चरणों': 2, 'वोटिंग': 1, 'होगी': 1, 'पहला': 1, 'चरण-': 7, 'सीट': 3, 'फरवरी': 7, 'तीसरा': 1, 'चौथा': 1, 'पांचवा': 1, 'छठा': 1, 'मार्च': 4, 'सातवां': 1, 'उत्तराखंड-': 1, 'वोट': 1, 'डाले': 1, 'जाएंगे': 1, 'अलावा': 1, 'होंगे': 1, 'मतदान': 1, 'नतीजे': 1, 'आएंगे': 1, 'पिछले': 1, 'दिनों': 1, 'चुनावों': 1, 'पहली': 3, 'जेपी': 1, 'नड्डा': 1, 'बता': 1, 'दें': 1, 'अकाली': 2, 'दल': 2, 'गठबंधन': 1, 'लड़ती': 2

In [28]:
c = 1
sentence_with_importance = {} 
for sent in tokenized_sentence:
    sentenceimp = sentence_importance(sent,word_freq,tokenized_sentence)
    sentence_with_importance[c] = sentenceimp
    c = c+1
sentence_with_importance = sorted(sentence_with_importance.items(), key=operator.itemgetter(1),reverse=True)

In [29]:
cnt = 0
sentence_no = []
for word_prob in sentence_with_importance:
    if cnt < no_of_sentences:
        sentence_no.append(word_prob[0])
        cnt = cnt+1
    else:
        break
sentence_no.sort()

In [43]:
count = 1
summary=""
for sentence in tokenized_sentence:
    if count in sentence_no:
        summary+= sentence+" "
    count+=1
# print("Summary:")
print(summary)

if summary:
        st.write("Summary:", summary)
else:
    st.write("not good.")

उत्तर प्रदेश और उत्तराखंड में होने वाले विधानसभा चुनाव के लिए बीजेपी सोमवार को उम्मीदवारों का ऐलान कर सकती है. रविवार को हुई बीजेपी की केंद्रीय चुनाव समिति की बैठक में नामों की मुहर लगाई गई है. सूत्रों के अनुसार यूपी के लिए बीजेपी 140 और उत्तराखंड के लिए 55 उम्मीदवारों के नाम का ऐलान कर सकती है. खबर है कि नामों के ऐलान में थोड़ी देरी हो सकती है. अमित शाह के आवास पर अभी एक और बैठक चल रही है. सभी सीनियर नेता बैठक में मौजूद हैं. मीडिया रिपोर्ट्स के मुताबिक, बीजेपी 20 जनवरी तक यूपी और उत्तराखंड के उम्मीदवारों का ऐलान करना चाह रही है. पार्टी पहले से ही प्रत्याशियों का नाम ऐलान करने में पीछे चल रही है. अभी तक यूपी में सपा और बसपा ने उम्मीदवारों का ऐलान कर दिया है. बीजेपी 17 और 19 जनवरी को भी एक-एक लिस्ट जारी करने की तैयारी में है. उत्तर प्रदेश, पंजाब, उत्तराखंड, गोवा, मणिपुर में
होने जा रहे है. यूपी की 403 सीटों पर 7 चरणों में वोटिंग होगी. 


2023-08-08 17:29:21.465 
  command:

    streamlit run C:\Users\khuts\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


### TEXT RANK ALGORITHM

GloVe word embeddings are vector representation of words. 
These word embeddings will be used to create vectors for our sentences. 
We could have also used the Bag-of-Words or TF-IDF approaches to create features for our sentences, 
but these methods ignore the order of the words (and the number of features is usually pretty large).

In [31]:
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [32]:
sentence_vectors = []
for i in tokenized_sentence:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split()))
    sentence_vectors.append(v)

In [33]:
# similarity matrix
sim_mat = np.zeros([len(tokenized_sentence), len(tokenized_sentence)])

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
for i in range(len(tokenized_sentence)):
    for j in range(len(tokenized_sentence)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))

In [36]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
print(scores)

{0: 0.041036900926599625, 1: 0.04116904367455, 2: 0.045617230260577984, 3: 0.039560860052399095, 4: 0.0402508435710731, 5: 0.038696211050302715, 6: 0.0474254506325869, 7: 0.04181086322499163, 8: 0.0415147123943797, 9: 0.04837242057915085, 10: 0.03869621105030271, 11: 0.04329102319294459, 12: 0.010507212444514618, 13: 0.047043266436092794, 14: 0.04119214464565863, 15: 0.048481279766895896, 16: 0.03405237945946809, 17: 0.04128922712086917, 18: 0.04141951453722617, 19: 0.04956604192272435, 20: 0.0415147123943797, 21: 0.04547521536518254, 22: 0.047242650358291643, 23: 0.04477458493883706}


In [37]:
scoresrank =  sorted(scores.items(), key=operator.itemgetter(1),reverse=True)
print(scoresrank)

[(19, 0.04956604192272435), (15, 0.048481279766895896), (9, 0.04837242057915085), (6, 0.0474254506325869), (22, 0.047242650358291643), (13, 0.047043266436092794), (2, 0.045617230260577984), (21, 0.04547521536518254), (23, 0.04477458493883706), (11, 0.04329102319294459), (7, 0.04181086322499163), (8, 0.0415147123943797), (20, 0.0415147123943797), (18, 0.04141951453722617), (17, 0.04128922712086917), (14, 0.04119214464565863), (1, 0.04116904367455), (0, 0.041036900926599625), (4, 0.0402508435710731), (3, 0.039560860052399095), (5, 0.038696211050302715), (10, 0.03869621105030271), (16, 0.03405237945946809), (12, 0.010507212444514618)]


In [38]:
# Specify number of sentences to form the summary
sn = 3
# Generate summary
for i in range(sn):
    print(tokenized_sentence[scoresrank[i][0]])

पंजाब के लिए पहली लिस्ट में 17 और गोवा के लिए 29 नामों का ऐलान किया गया.
गोवा और पंजाब में मतदान 4 फरवरी को है.
बीजेपी 17 और 19 जनवरी को भी एक-एक लिस्ट जारी करने की तैयारी में है.


### Future Improvement

* Deployment
* Improvement in algorithm