In [63]:
#importing necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import heapq
import numpy as np
import spacy
# Load spaCy's small English pipeline once; later cells reuse `nlp` for
# sentence segmentation and for token vectors.
nlp = spacy.load("en_core_web_sm")

# Read the reviews CSV into a DataFrame and preview its first rows.
df = pd.read_csv("./Data/short_reviews.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [64]:
# Step 1: Sentence Segmentation
# Separate the text into sentences using spaCy (punctuation marks act as delimiters); `sentences` just holds them as separate strings.
def process_text(text):
    """Return the first sentence of ``text`` as segmented by spaCy.

    Args:
        text: Raw input string handed to the module-level ``nlp`` pipeline.

    Returns:
        The text of the first sentence spaCy reports, or an empty string when
        spaCy reports no sentences at all (for example for empty input),
        instead of raising ``IndexError``.
    """
    doc = nlp(text)
    # Only the first sentence is used, so stop after one span rather than
    # materialising every sentence of the document.
    first_sentence = next(iter(doc.sents), None)
    if first_sentence is None:
        return ""
    return first_sentence.text

In [65]:
# step 2: Tokenization
def tokenize_text(text):
    """Turn ``text`` into stop-word-free token lists.

    The text is first reduced by ``process_text``; its result is split into
    sentences with NLTK, every sentence is word-tokenized, and stop words are
    removed from each token list.

    Args:
        text: Raw input string.

    Returns:
        A list with one list of tokens per NLTK sentence.
    """
    reduced_text = process_text(text)
    return [
        remove_stop_words(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(reduced_text)
    ]

In [66]:
# Step 3: removing stop words according to the Buckley-Salton list of stop words

# For this we first need to load the Buckley-Salton stop-word file into a list.
# For each line, only the text before the first comma is kept.
stop_words = []
with open("./Data/Buckley-Salton-stopword-list.txt", "r") as stopword_file:
    for line in stopword_file:
        # strip() removes the trailing newline and any stray whitespace.
        entry = line.split(",")[0].strip()
        if entry:  # skip blank lines so "" never ends up in the list
            stop_words.append(entry)

In [67]:
def remove_stop_words(text, stop_word_list=None):
    """Drop stop words from a sequence of tokens.

    A token is removed when its lower-cased form is found in the stop-word
    collection, so the stop words themselves are expected to be lower-case.

    Args:
        text: Iterable of string tokens.
        stop_word_list: Optional iterable of stop words. When omitted, the
            module-level ``stop_words`` list is used.

    Returns:
        A new list with the surviving tokens, in their original order and
        original casing.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    # Hash the stop words once so each token lookup is O(1) instead of a
    # linear scan of the list.
    lookup = set(stop_word_list)
    return [word for word in text if word.lower() not in lookup]

In [68]:
# Step 4:  Extracting sentences 
def F_first_K_Sents(text, f):
    """Return the leading ``f`` items of ``text``.

    Args:
        text: Any sliceable sequence (the pipeline passes a list of token lists).
        f: Upper bound of the slice; if ``text`` is shorter, all of it is returned.

    Returns:
        ``text`` sliced from its start up to, but not including, position ``f``.
    """
    leading = slice(0, f)
    return text[leading]

# Select the N highest-scoring sentences and flatten their tokens into a single list
def SelectImpSentences(scores, N, sentences):
    """Pick the ``N`` best-scoring sentences and flatten them into one list.

    Args:
        scores: Mapping from sentence index to its score.
        N: Maximum number of sentences to keep.
        sentences: Indexable collection of sentences, each an iterable of
            tokens.

    Returns:
        A flat list holding the items of the selected sentences, highest
        score first.
    """
    best_indices = heapq.nlargest(N, scores, key=scores.get)
    chosen = [sentences[index] for index in best_indices]
    flattened = []
    for token_list in chosen:
        flattened.extend(token_list)
    return flattened

In [69]:
# Step 5: combining all functions and finding the scores of the best sentences
def extractiveApproach(dataset, f, N):
    """Build one extractive summary string per text in ``dataset``.

    For every text the pipeline tokenizes it with ``tokenize_text``, keeps the
    first ``f`` non-empty token lists, embeds each one as the mean of its
    spaCy token vectors, scores each sentence by the sum of its cosine
    similarities to all kept sentences, and joins the tokens of the ``N``
    best-scoring sentences with spaces.

    Args:
        dataset: Iterable of raw text strings.
        f: Number of leading sentences of each text to consider.
        N: Maximum number of sentences to keep in each summary.

    Returns:
        A list of summary strings, one per input text and in the same order.
        A text that yields no tokens produces an empty string.
    """
    output_dataset = []
    for input_text in dataset:
        # Tokenize and remove stop words. Sentences left without any token
        # are dropped because they cannot be embedded (their mean vector
        # would be NaN).
        tokenized_sents = [tokens for tokens in tokenize_text(input_text) if tokens]

        # Restrict the candidates to the first f sentences.
        first_k_sents = F_first_K_Sents(tokenized_sents, f)
        if not first_k_sents:
            # Nothing to score; cosine_similarity would reject an empty input.
            output_dataset.append('')
            continue

        # One embedding per sentence: the mean of its token vectors.
        sentence_embeddings = [
            np.mean([token.vector for token in nlp(' '.join(tokens))], axis=0)
            for tokens in first_k_sents
        ]
        similarity_matrix = cosine_similarity(sentence_embeddings, sentence_embeddings)

        # Score every sentence by its total similarity to all candidates.
        # Previously the dict was rebuilt on each loop iteration, so only the
        # last sentence's similarity row ever reached the selection step.
        scores = {
            i: float(np.sum(similarity_matrix[i]))
            for i in range(len(first_k_sents))
        }

        selected_sentences = SelectImpSentences(scores, N, first_k_sents)
        output_dataset.append(' '.join(selected_sentences))

    return output_dataset

In [70]:
# Example usage: summarise three short, single-sentence texts
dataset = [
    "Hi,my name is Annarhysa Albert",
    "Mumbai is my favourite place",
    "Listen to me carefully"
]

f = 2  # Number of leading sentences of each text to consider
N = 5  # Maximum number of sentences to select per text (not a word count)

# Print one summary per input text, in input order.
summarized_dataset = extractiveApproach(dataset, f, N)
for summary in summarized_dataset:
    print(summary)

, Annarhysa Albert
Mumbai favourite place
Listen carefully


In [71]:
# Summarise every review (reusing f and N from the example cell) and store
# the result in a new "Summary" column, then preview the first rows.
summarized_dataset = extractiveApproach(df['review'], f, N)
df["Summary"] = summarized_dataset
df.head()

Unnamed: 0,review,sentiment,Summary
0,One of the other reviewers has mentioned that ...,positive,reviewers mentioned watching 1 Oz episode 'll ...
1,A wonderful little production. The filming tec...,positive,wonderful production .
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful spend time hot summer weeken...
3,Basically there's a family where a little boy ...,negative,Basically 's family boy ( Jake ) thinks 's zom...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Mattei 's `` Love Time Money '' visuall...


In [66]:
#df.to_csv("Sumarized.csv", index = False)