<a href="https://colab.research.google.com/github/ShradhaSood/AI_project/blob/main/ShradhaSood_Hindi_Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [1]:
import pandas as pd
import string
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Load the dataset

In [2]:
df = pd.read_csv('hindi_dataset.csv')

Remove the missing values

In [3]:
df.dropna(subset=['text'], inplace=True)

Convert all text to lower case

In [4]:
df['text'] = df['text'].apply(lambda x: x.lower() if type(x) == str else x)
print('Lowercased text:\n', df['text'].head())

Lowercased text:
 0    hindi news\r\nlocal\r\nharyana\r\npanipat\r\nb...
1    hindi news\r\nlocal\r\nharyana\r\nrohtak\r\nbr...
2    code of ethics for digital news websites\r\nth...
3    और देखें\r\nवीडियो\r\nour divisions\r\ncopyrig...
4    hindi news\r\nlocal\r\nharyana\r\nharyana kaus...
Name: text, dtype: object


Remove all punctuation

In [5]:
def remove_punct(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

df['text'] = df['text'].apply(remove_punct)
print('Text without punctuation:\n', df['text'].head())

Text without punctuation:
 0    hindi news\r\nlocal\r\nharyana\r\npanipat\r\nb...
1    hindi news\r\nlocal\r\nharyana\r\nrohtak\r\nbr...
2    code of ethics for digital news websites\r\nth...
3    और देखें\r\nवीडियो\r\nour divisions\r\ncopyrig...
4    hindi news\r\nlocal\r\nharyana\r\nharyana kaus...
Name: text, dtype: object


Remove all numerical values

In [6]:
def remove_nums(text):
    text = re.sub(r'\d+', '', text)
    return text

df['text'] = df['text'].apply(remove_nums)
print('Text without numerical values:\n', df['text'].head())

Text without numerical values:
 0    hindi news\r\nlocal\r\nharyana\r\npanipat\r\nb...
1    hindi news\r\nlocal\r\nharyana\r\nrohtak\r\nbr...
2    code of ethics for digital news websites\r\nth...
3    और देखें\r\nवीडियो\r\nour divisions\r\ncopyrig...
4    hindi news\r\nlocal\r\nharyana\r\nharyana kaus...
Name: text, dtype: object


Remove common non-sensical text

In [7]:
df['text'] = df['text'].str.replace('\n', '')
print('Text without common non-sensical text:\n', df['text'].head())

Text without common non-sensical text:
 0    hindi news\rlocal\rharyana\rpanipat\rbrij bhus...
1    hindi news\rlocal\rharyana\rrohtak\rbrij bhush...
2    code of ethics for digital news websites\rthe ...
3    और देखें\rवीडियो\rour divisions\rcopyright ©  ...
4    hindi news\rlocal\rharyana\rharyana kaushal vi...
Name: text, dtype: object


Tokenize the text

In [8]:
df['text'] = df['text'].apply(word_tokenize)
print('Tokenized text:\n', df['text'].head())

Tokenized text:
 0    [hindi, news, local, haryana, panipat, brij, b...
1    [hindi, news, local, haryana, rohtak, brij, bh...
2    [code, of, ethics, for, digital, news, website...
3    [और, देखें, वीडियो, our, divisions, copyright,...
4    [hindi, news, local, haryana, haryana, kaushal...
Name: text, dtype: object


Remove stop words

In [9]:
stop_words_file = 'stopwords.txt'

with open(stop_words_file, 'r', encoding='utf-8') as f:
    stop_words = f.read().splitlines()

def remove_stopwords(text):
    text = [word for word in text if word not in stop_words]
    return text

df['text'] = df['text'].apply(remove_stopwords)
print('Text without stop words:\n', df['text'].head())

Text without stop words:
 0    [hindi, news, local, haryana, panipat, brij, b...
1    [hindi, news, local, haryana, rohtak, brij, bh...
2    [code, of, ethics, for, digital, news, website...
3    [देखें, वीडियो, our, divisions, copyright, ©, ...
4    [hindi, news, local, haryana, haryana, kaushal...
Name: text, dtype: object


POS Tagging

In [12]:
import nltk
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
def pos_tagging(text):
    pos = pos_tag(text)
    return pos

df['pos_tagged'] = df['text'].apply(pos_tagging)
print('POS tagged text:\n', df['pos_tagged'].head())


POS tagged text:
 0    [(hindi, JJ), (news, NN), (local, JJ), (haryan...
1    [(hindi, JJ), (news, NN), (local, JJ), (haryan...
2    [(code, NN), (of, IN), (ethics, NNS), (for, IN...
3    [(देखें, NN), (वीडियो, VBZ), (our, PRP$), (div...
4    [(hindi, JJ), (news, NN), (local, JJ), (haryan...
Name: pos_tagged, dtype: object


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert list of strings to single string
text = [" ".join(t) for t in df['text']]

# Create TF-IDF vectorizer and transform text
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text)

print(tfidf_matrix.shape)



(1856, 10162)


Install indic-nlp-library

In [15]:
!pip install indic-nlp-library

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting indic-nlp-library
  Downloading indic_nlp_library-0.91-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl (12 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-1.2.0-py2.py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinxcontrib-jquery!=3.0.0,>=2.0.0 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.

Install summarizer

In [16]:
!pip install summarizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting summarizer
  Downloading summarizer-0.0.7.tar.gz (280 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.1/280.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: summarizer
  Building wheel for summarizer (setup.py) ... [?25l[?25hdone
  Created wheel for summarizer: filename=summarizer-0.0.7-py2.py3-none-any.whl size=284224 sha256=88f1c8a66433bb4ddbe52b4558dca21cc6fc114fd4106dad62b3757991db7b0f
  Stored in directory: /root/.cache/pip/wheels/20/bb/2d/1fe057c2f729818a5f28c312c3667e8b9d5cfd4af4a39895e7
Successfully built summarizer
Installing collected packages: summarizer
Successfully installed summarizer-0.0.7


Import important libraries

In [18]:
import os
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from collections import defaultdict
import csv

Set the language to hindi

In [19]:
LANGUAGE = "hindi"

Set the path to the CSV file

In [21]:
# set the path to the CSV file
DATA_FILE = "/content/hindi_dataset.csv"

# read in the CSV file
with open(DATA_FILE, encoding="utf8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader) # skip the header row
    for row in reader:
        try:
            text = row[0] # assuming the text is in the first column of each row
        except IndexError:
            continue

        # tokenize the text into sentences and words
        sentences = sentence_tokenize.sentence_split(text, lang=LANGUAGE)
        words = [indic_tokenize.trivial_tokenize(sentence) for sentence in sentences]

        # create a frequency distribution of words
        freq = defaultdict(int)
        for sentence in words:
            for word in sentence:
                freq[word] += 1

        # calculate the score for each sentence
        scores = defaultdict(int)
        for i, sentence in enumerate(words):
            for word in sentence:
                scores[i] += freq[word]

        # sort the sentences by score and select the top 3
        top_sentences = sorted(scores, key=scores.get, reverse=True)[:3]
        summary = " ".join([sentences[i] for i in top_sentences])
        print(summary)



FIR न करने पर दिल्ली पुलिस को नोटिस, पहलवान बोले- शिकायत वापस लेने का दबाव, जान को खतरा
बोले- बबीता ने रिपोर्ट पढ़कर साइन किए; उत्पीड़न जैसी बात भी सामने नहीं आई
DNPA Code of Ethics For News Websites
Aaj Ka Rashifal: Daily Rashifal, Today's Rashifal, Aaj Ka Rashifal for EVERY Zodiac Sign
ACB की FIR में IAS दहिया का नाम; ऑस्ट्रेलिया दौरा रद, शिकायतकर्ता सीआईडी ADGP से मिला
Sports - Dainik Bhaskar
इसी ग्राउंड पर बनाए थे 183 रन, अजिंक्य रहाणे और रविंद्र जडेजा भी पहुंचे
घरेलू टीमें 57% मुकाबले हारीं; CSK ने चौंकाया, ऑरेंज कैप पर ओपनर्स का दबदबा
कहा- कुछ समय के लिए क्रिकेट से ब्रेक लें ताकि WTC के लिए खुद को फिट रख सकें
रहाणे-बटलर फाॅर्म में, अश्विन-तुषार दिला सकते हैं पाॅइंट्स
केंद्र ने कहा- बृजभूषण पर FIR से पहले जांच जरूरी, प्रियंका गांधी बोलीं- बहनें रो रहीं, आइए साथ दें
201 के टारगेट से 21 रन पीछे रही; कोहली के विकेट से पलटा मैच
हर्षल-सिराज ने छोड़े आसान कैच, वेंकटेश की डाइव ने पलटा मैच; देखें मोमेंट्स
BCCI कोरोना नियमों में कर सकती है बदलाव, 5 दिन रहना पड़ता है क्वारैंटाइन
हैमस्ट्रिंग इ