**Installation**

In [6]:
!pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Libraries**

In [19]:
import nltk                             
import string
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
import pandas as pd
import os
import math
import numpy as np

**Necessary Downloads**

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Root folder of the preprocessed dataset on the mounted Drive.
# Alternate Drive layout (previously assigned first and immediately overwritten):
#   "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Dataset/preprocessed_data"
INPUT_PATH = "/content/drive/MyDrive/NLP/NLP_Project/Dataset/preprocessed_data"

# **Loading Data**

In [11]:
train_data = pd.read_csv(os.path.join(INPUT_PATH,"divided_dataset/train.csv"))
train_data.head()

Unnamed: 0,Heading,Summary,Article,id
0,"un urges for maximum restraint, invokes simla ...","pakistan termed the indian action as ""unilater...","un chief invokes shimla agreement, calls for '...",1
1,"china, pak to finalise deal to develop sez und...","""the agreement will be finalised between khybe...","china, pak to finalise deal to develop sez und...",2
2,"covaxin effectively neutralises both alpha, de...",the top health research institute said that an...,"covaxin effectively neutralises both alpha, de...",3
3,man gets coronavirus twice with more severe sy...,a 25-year-old man in the us has caught coronav...,man gets coronavirus twice with more severe sy...,5
4,afghanistan president ghani flees to tajikista...,reports say that afghanistan president ashraf ...,ghani's close aides have also left the country...,6


In [12]:
test_data = pd.read_csv(os.path.join(INPUT_PATH,"divided_dataset/test.csv"))
test_data.head()

Unnamed: 0,Heading,Summary,Article,id
0,india opposes china's belt and road initiative...,the name of all member countries except india ...,"at sco, india refuses to back china's belt and...",0
1,"top white house officials buried cdc report, r...",the decision to shelve detailed advice from th...,"in this april 22, 2020, file photo president d...",4
2,us and china clash at un over south china sea ...,as india holds the council presidency this mon...,the united states and china clashed over beiji...,11
3,"us allows extra covid vaccine doses for some, ...",the food and drug administration ruled that tr...,vials for the moderna and pfizer covid-19 vacc...,13
4,pak minister claims threatening email was sent...,pakistan's information minister fawad chaudhry...,pakistan's information minister fawad chaudhry...,30


**Main code**

In [13]:
# Shared scorer configured for the 'rouge1' and 'rouge2' metrics; it is
# created once here and reused by avg_rouge_score for every comparison.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'])

In [14]:
# return average of rouge 1 and rouge 2 f-measure for two sentences
def avg_rouge_score(sent1,sent2):
    """Return the mean of the ROUGE-1 and ROUGE-2 F-measures of two texts.

    Args:
        sent1: First text, forwarded as the first argument of ``scorer.score``.
        sent2: Second text, forwarded as the second argument of ``scorer.score``.

    Returns:
        float: ``(rouge1.fmeasure + rouge2.fmeasure) / 2``.
    """
    # One call returns every configured metric, so score the pair only once.
    scores = scorer.score(sent1, sent2)
    return (scores['rouge1'].fmeasure + scores['rouge2'].fmeasure) / 2

In [52]:
# takes as input a list of sentences of doc and returns the top-scoring
# sentences (10% of the doc by default, at least one) joined into one string
def doc_sent_with_highest_rouge(doc, ratio=0.1, score_fn=None):
    """Build an extractive summary from the sentences that best match the whole doc.

    Every sentence is scored against the full document (all sentences joined
    with single spaces). The highest-scoring sentences are returned in
    descending score order, joined with single spaces.

    Args:
        doc: List of sentence strings.
        ratio: Fraction of the sentences to keep. The count is
            ``floor(ratio * len(doc))`` but never less than one.
        score_fn: Callable ``(sentence, document) -> number`` used for ranking.
            Defaults to ``avg_rouge_score``.

    Returns:
        str: The selected sentences joined by spaces; ``""`` for an empty doc.
    """
    if score_fn is None:
        score_fn = avg_rouge_score

    summary_len = max(math.floor(ratio * len(doc)), 1)
    original_doc = " ".join(doc)

    scores = [score_fn(sent, original_doc) for sent in doc]
    # sorted() is stable, so equally scored sentences keep their document order.
    ranked_ids = sorted(range(len(doc)), key=lambda i: scores[i], reverse=True)
    return " ".join(doc[i] for i in ranked_ids[:summary_len])

In [44]:
# generate one extractive summary per row from its heading + article sentences
def find_summary_sent(data):
    """Generate a summary string for every row of ``data``.

    For each row the heading and the article are split into sentences with
    ``nltk.sent_tokenize``, concatenated (heading sentences first) and passed
    to ``doc_sent_with_highest_rouge``.

    Args:
        data: Table exposing ``'Heading'`` and ``'Article'`` columns
            (the notebook passes pandas DataFrames).

    Returns:
        list: One summary string per row, in row order.
    """
    def _sentences(text):
        # Non-string cells (e.g. NaN for a missing value) contribute no
        # sentences. Real tokenizer failures are no longer swallowed.
        return nltk.sent_tokenize(text) if isinstance(text, str) else []

    generated_summary = []
    # Iterate values positionally so a non-default index cannot break lookups.
    for heading, article in zip(data['Heading'], data['Article']):
        sentences = _sentences(heading) + _sentences(article)
        generated_summary.append(doc_sent_with_highest_rouge(sentences))

    return generated_summary

**Train generated summary**

In [46]:
#train_generated_summary = find_summary_sent(train_data)

In [None]:
# train_generated_summary[:5]

**Test generated summary**

In [39]:
test_generated_summary = find_summary_sent(test_data)

In [40]:
test_generated_summary[:5]

['related stories sco summit: president xi accepts pm modi\'s invitation for informal summit in india in 2019 pm modi at sco summit 2018: \'connectivity with neighbourhood and in sco region indias priority\'pm modi, pak pres hussain shake hands at sco summitsco summit: pm modi calls for respect for sovereignty, economic growth, connectivity, and unity among membersin his address at the summit, modi, in a clear reference to the bri, said any mega connectivity project must respect sovereignty and territorial integrity of the countries and assured that india will support projects which ensure inclusivity. india has been supporting the project"the republic of kazakhstan, the kyrgyz republic, the islamic republic of pakistan, the russian federation, the republic of tajikistan, and the republic of uzbekistan reaffirm their support for the \'belt and road initiative\' proposed by china and affirm that all parties should implement the \'belt and road initiative\' to promote the \'belt and road

**Saving predictions (NO need to save)**

In [41]:
# train_generated_df = pd.DataFrame(zip(train_generated_summary,train_data['id']),columns=['Summary','id'])
test_generated_df = pd.DataFrame(zip(test_generated_summary,test_data['id']),columns=['Summary','id'])

In [None]:
#train_generated_df = pd.read_csv(RESULT_PATH + "/DocSent_highest_rouge_train.csv")
#test_generated_df = pd.read_csv(RESULT_PATH + "/DocSent_highest_rouge_test.csv")

**Evaluation of generated summaries**

In [34]:
# takes as input the predicted and target summaries
# and returns the mean of their pairwise average rouge scores
def final_rouge_score(predicted,target):
    """Average ``avg_rouge_score`` over aligned predicted/target summaries.

    Args:
        predicted: Sequence of generated summaries.
        target: Sequence of reference summaries, aligned by position.

    Returns:
        float: Mean pairwise score, or ``0.0`` when both inputs are empty.
        str: An error message when the two inputs differ in length
        (kept as a return value for backward compatibility).
    """
    if len(predicted) != len(target):
        return "the dimensions of predicted and target must be same"
    if len(target) == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    total = 0
    # zip pairs items by position, which also works for pandas Series whose
    # index is not 0..n-1 (label lookups like predicted[i] would fail there).
    for pred, ref in zip(predicted, target):
        total += avg_rouge_score(pred, ref)
    return total / len(target)

In [42]:
final_rouge_score(test_generated_df['Summary'],test_data['Summary'])

0.11904697441055763

# **Saving Final Prediction**

In [53]:
final_test_data = pd.read_csv(os.path.join(INPUT_PATH,"whole_dataset/test.csv"))
final_test_data.head() 

Unnamed: 0,Heading,Article,id
0,explainer: how worrying is the variant first s...,how worrying is the variant first seen in indi...,0
1,pakistan parliament to elect new prime ministe...,pakistans national assembly will elect a new p...,1
2,indian-origin pathologist accused of botching ...,dr. khalid ahmedan indian-origin pathologist h...,2
3,china begins world's biggest census drive to c...,china begins world's biggest census drive to c...,3
4,"indonesia prison fire kills 41 drug inmates, i...","indonesia prison fire kills 41 drug inmates, i...",4


In [54]:
final_test_generated_summary = find_summary_sent(final_test_data)

In [55]:
final_test_generated_df = pd.DataFrame(zip(final_test_generated_summary,final_test_data['id']),columns=['Summary','id'])

In [56]:
# Output folder on the mounted Drive and base name of the predictions file.
# Alternate Drive layout (previously assigned first and immediately overwritten):
#   "/content/drive/MyDrive/Semester 3 IIITD/NLP/NLP_Project/Results/Smiti"
RESULT_PATH = "/content/drive/MyDrive/NLP/NLP_Project/Results/Smiti"
NAME_OF_FILE = "top_10percent_rouge_sent"

In [57]:
# Make sure the results folder exists, then write the predictions
# (index=False keeps the row index out of the CSV).
os.makedirs(RESULT_PATH, exist_ok=True)
final_test_generated_df.to_csv(os.path.join(RESULT_PATH, NAME_OF_FILE + ".csv"), index=False)