In [109]:
# import important libraries 
import pandas as pd
import numpy as np
from langdetect import detect
from nltk.corpus import words
import re
import string
import inflect
import nltk
from nltk.stem import PorterStemmer
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from gensim.models import Word2Vec
import torch
from transformers import BertTokenizer, BertModel
from rouge import Rouge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle

In [110]:
# Load the pickled estimator that get_content_score() calls .predict() on.
# NOTE(review): pickle.load can run arbitrary code while unpickling, so this
# file must only ever come from a trusted source.
with open('../static/model/LR_model.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [111]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # One translate() pass deletes the same characters that a replace() call
    # per punctuation mark would remove.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

In [112]:
def replace_numbers(match):
    """Regex substitution callback that spells out the matched number.

    Args:
        match: a regex match object; the full matched text is used.

    Returns:
        Whatever the module-level ``inflect_instance.number_to_words`` returns
        for the matched text.
    """
    return inflect_instance.number_to_words(match.group())

In [113]:
def rouge_features(generated_summary, reference_summary):
    """Score one summary pair with ROUGE and return two F-scores.

    Args:
        generated_summary: passed to ``Rouge.get_scores`` as the first argument.
        reference_summary: passed to ``Rouge.get_scores`` as the second argument.

    Returns:
        Tuple ``(rouge-1 f, rouge-2 f)`` taken from the first score entry.
    """
    scorer = Rouge()
    first_pair = scorer.get_scores(generated_summary, reference_summary)[0]
    rouge_1_f = first_pair['rouge-1']['f']
    rouge_2_f = first_pair['rouge-2']['f']
    return rouge_1_f, rouge_2_f

In [114]:
def sentence_level_metrics(summary):
    """Compute per-sentence token counts and sentiment polarities.

    The summary is cut wherever the two-character sequence ``'. '`` occurs.

    Args:
        summary: the text to analyse.

    Returns:
        Tuple ``(token_counts, polarities)`` with one entry per piece:
        ``len(word_tokenize(piece))`` and ``TextBlob(piece).sentiment.polarity``.
        Callers label the second list "readability", but the value computed
        here is TextBlob's sentiment polarity.
    """
    token_counts = []
    polarities = []
    for piece in summary.split('. '):
        token_counts.append(len(word_tokenize(piece)))
        polarities.append(TextBlob(piece).sentiment.polarity)
    return token_counts, polarities

In [115]:
# Tokenizer/model pairs already loaded in this kernel session, keyed by model
# name, so the weights are read once instead of once per call.
_BERT_CACHE = {}

# Longest token sequence the position embeddings of bert-base-uncased accept.
_BERT_MAX_TOKENS = 512


def _load_bert(model_name):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def text_embeddings(summary):
    """Embed ``summary`` with BERT and return one vector per token.

    The text is tokenized without adding special tokens, exactly as before.

    Args:
        summary: the text to embed.

    Returns:
        2-D numpy array with one row per token (the model's
        ``last_hidden_state`` with the batch dimension removed). A one-token
        summary yields a single-row 2-D array, so ``mean(axis=0)`` on the
        result always gives a vector.

    Raises:
        ValueError: if tokenizing ``summary`` produces no tokens.
    """
    tokenizer, bert = _load_bert('bert-base-uncased')

    tokens = tokenizer.tokenize(summary)
    if not tokens:
        raise ValueError("text_embeddings() needs a summary with at least one token")
    # Longer inputs would index past the model's position embeddings.
    tokens = tokens[:_BERT_MAX_TOKENS]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension

    with torch.no_grad():
        outputs = bert(input_ids)
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # drop the token axis when there is exactly one token.
        word_embeddings = outputs.last_hidden_state.squeeze(0).numpy()

    return word_embeddings


In [116]:
def content_based_features(generated_summary, reference_summary):
    """Return the TF-IDF cosine similarity between two summaries.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts.

    Args:
        generated_summary: first text (row 0 of the TF-IDF matrix).
        reference_summary: second text (row 1 of the TF-IDF matrix).

    Returns:
        The value returned by ``cosine_similarity`` for row 0 against row 1;
        callers read the score with ``[0][0]``.
    """
    pair = [generated_summary, reference_summary]
    vectors = TfidfVectorizer().fit_transform(pair)
    return cosine_similarity(vectors[0], vectors[1])

In [117]:
# Shared helpers used by the preprocessing step.

# Stemmer applied to each whitespace-separated token.
ps = PorterStemmer()

# Matches an integer or a decimal number; handed to re.sub() together with
# replace_numbers().
pattern = r'\d+(\.\d+)?'

# inflect engine that replace_numbers() uses to spell numbers out as words.
inflect_instance = inflect.engine()

# One-off download of the nltk 'punkt' data, kept here for reference:
# nltk.download('punkt',download_dir='../static/punkt')

In [118]:
# Preprocessing
def _clean_text(text):
    """Run the cleaning steps on one string and return the cleaned string."""
    # Lowercase token by token; splitting also collapses runs of whitespace.
    lowered_tokens = (token.lower() for token in text.split())
    # Blank out any token that starts with http:// or https://.
    without_urls = " ".join(
        re.sub(r'^https?:\/\/.*[\r\n]*', '', token, flags=re.MULTILINE)
        for token in lowered_tokens
    )
    without_punctuation = remove_punctuations(without_urls)
    # NOTE(review): punctuation is stripped before numbers are spelled out, so
    # a decimal such as "3.5" reaches the pattern as "35". The order is kept
    # as-is because changing it would change the text the features are built
    # from -- confirm against the training pipeline before reordering.
    numbers_spelled = re.sub(pattern, replace_numbers, without_punctuation)
    # Stem each token; the split/join also drops the blanks left by URL removal.
    return " ".join(ps.stem(token) for token in numbers_spelled.split())


def preprocessing(input):
    """Return a cleaned copy of ``input`` with every column normalised.

    Each cell is lowercased, stripped of URL tokens and punctuation, has its
    numbers spelled out via ``replace_numbers`` and is stemmed with ``ps``.

    The frame passed in is left untouched, so re-running the calling cell
    always starts from the raw text instead of re-stemming cleaned text.

    Args:
        input: DataFrame whose columns all hold strings. (The parameter name
            shadows the ``input`` builtin; it is kept for existing callers.)

    Returns:
        A new DataFrame with the same columns and index, holding cleaned text.
    """
    cleaned = input.copy()
    for col in cleaned.columns:
        cleaned[col] = cleaned[col].apply(_clean_text)
    return cleaned

In [119]:
# Generate summary
def generate_summary(input):
    """Generate a Pegasus summary for every row and store it in a new column.

    For each row the prompt text, question and title are joined with spaces,
    encoded (truncated to 1024 tokens) and passed to ``generate``.

    Args:
        input: DataFrame with ``prompt_question`` and ``prompt_title`` columns
            plus the prompt body in ``prompt_text``. The older column name
            ``prompt_txt`` is still accepted when ``prompt_text`` is absent.
            (The parameter name shadows the ``input`` builtin; it is kept for
            existing callers.)

    Returns:
        The same DataFrame object, with a ``generated_summary`` column added.
    """
    # Load pre-trained model and tokenizer
    model_name = "google/pegasus-large"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    summarizer = PegasusForConditionalGeneration.from_pretrained(model_name)

    # The request frame names the prompt body 'prompt_text'; reading
    # 'prompt_txt' unconditionally raised KeyError on such frames.
    text_column = 'prompt_text' if 'prompt_text' in input.columns else 'prompt_txt'

    generated_summaries = []
    for _, row in input.iterrows():
        # Combine prompts
        input_text = f"{row[text_column]} {row['prompt_question']} {row['prompt_title']}"

        # Tokenize input and generate summary
        input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = summarizer.generate(input_ids)
        generated_summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Add the generated summaries to the dataset as a new column
    input['generated_summary'] = generated_summaries
    return input

In [120]:
def calculate_features(input):
    """Compute the per-row features and append them to ``input`` as columns.

    Args:
        input: DataFrame with ``summary`` and ``generated_summary`` columns.
            (The parameter name shadows the ``input`` builtin; it is kept for
            existing callers.)

    Returns:
        A new DataFrame holding the original columns followed by
        ``rouge_1_score``, ``rouge_2_score``, ``sentence_length``,
        ``readability_score``, ``word_embedding``, ``content_sim`` and
        ``cosine_similarity``, with the same index and row count as ``input``.
    """
    feature_columns = ['rouge_1_score','rouge_2_score', 'sentence_length', 'readability_score', 'word_embedding','content_sim', 'cosine_similarity']

    feature_rows = []
    for _, row in input.iterrows():
        reference_summary = row['summary']
        generated_summary = row['generated_summary']

        rouge_score_1, rouge_score_2 = rouge_features(reference_summary, generated_summary)
        sentence_length, readability_score = sentence_level_metrics(reference_summary)
        embedding = text_embeddings(reference_summary)
        content_based_feature = content_based_features(reference_summary, generated_summary)

        # NOTE(review): this compares the mean embedding with itself, so the
        # value is ~1.0 for every row. It looks like it was meant to compare
        # against the generated summary's embedding. Left unchanged because
        # the pickled model presumably saw this same feature in training --
        # confirm before fixing, and retrain if it changes.
        mean_embedding = embedding.mean(axis=0)
        cosine_sim = cosine_similarity([mean_embedding], [mean_embedding])[0][0]

        feature_rows.append((
            rouge_score_1,
            rouge_score_2,
            sentence_length[0],       # first sentence only
            readability_score[0],     # first sentence only
            embedding,
            content_based_feature[0][0],
            cosine_sim,
        ))

    # Reuse the caller's index: with a fresh 0..n-1 index, concat(axis=1) would
    # align on labels and produce extra NaN rows for any input whose index is
    # not the default range.
    features_df = pd.DataFrame(feature_rows, columns=feature_columns, index=input.index)

    # Concatenate the original dataset and the calculated features
    final_dataset = pd.concat([input, features_df], axis=1)
    return final_dataset


In [121]:
def get_content_score(input):
    """Return the first prediction of the module-level ``model`` for ``input``.

    Args:
        input: the feature rows handed straight to ``model.predict``.

    Returns:
        Element 0 of the prediction result.
    """
    predictions = model.predict(input)
    first_prediction = predictions[0]
    return first_prediction

In [130]:
# Sample request used by the cells below: the prompt (question, title, body)
# and the summary to be scored.
prompt_question = "summarize at least three elements of an ideal tragedy as described by aristotle"
prompt_title = 'on tragedy'
# The prompt body is written as two adjacent literals inside parentheses: a
# double-quoted string cannot continue across a physical line break.
prompt_txt = (
    "chapter thirteen as the sequel to what has already been said we must proceed to consider what the poet should aim at and what he should avoid in constructing his plots and by what means the specific effect of tragedy will be produced a perfect tragedy should as we have seen be arranged not on the simple but on the complex plan it should moreover imitate actions which excite pity and fear this being the distinctive mark of tragic imitation it follows plainly in the first place that the change of fortune presented must not be the spectacle of a virtuous man brought from prosperity to adversity for this moves neither pity nor fear it merely shocks us nor again that of a bad man passing from adversity to prosperity for nothing can be more alien to the spirit of tragedy it possesses no single tragic quality it neither satisfies the moral sense nor calls forth pity or fear nor again should the downfall of the utter villain be exhibited a plot of this kind would doubtless satisfy the moral sense but it would inspire neither pity nor fear for pity is aroused by unmerited misfortune fear by the misfortune of a man like ourselves such an event therefore will be neither pitiful nor terrible there remains then the character between these two extremes — that of a man who is not eminently good and just yet whose misfortune is brought about not by vice or depravity but by some error of judgement or frailty he must be one who is highly renowned and prosperous — a personage like oedipus thyestes or other illustrious men of such families a wellconstructed plot should therefore be single in its issue rather than double as some maintain the change of fortune should be not from bad to good but reversely from good to bad it should come about as the result not of vice but of some great error or frailty in a character either such as we have described or better rather than worse the practice of the stage bears out our view at first the poets recounted any legend that came in "
    "their way now the best tragedies are founded on the story of a few houses — on the fortunes of alcmaeon oedipus orestes meleager thyestes telephus and those others who have done or suffered something terrible a tragedy then to be perfect according to the rules of art should be of this construction hence they are in error who censure euripides just because he follows this principle in his plays many of which end unhappily it is as we have said the right ending the best proof is that on the stage and in dramatic competition such plays if well worked out are the most tragic in effect and euripides faulty though he may be in the general management of his subject yet is felt to be the most tragic of the poets in the second rank comes the kind of tragedy which some place first like the odyssey it has a double thread of plot and also an opposite catastrophe for the good and for the bad it is accounted the best because of the weakness of the spectators for the poet is guided in what he writes by the wishes of his audience the pleasure however thence derived is not the true tragic pleasure it is proper rather to comedy where those who in the piece are the deadliest enemies — like orestes and aegisthus — quit the stage as friends at the close and no one slays or is slain"
)
summary = "1 element of an ideal tragedy is that it should be arranged on a complex plan.  Another element of an ideal tragedy is that it should only have one main issue. The last element of an ideal tragedy is that it should have a double thread plot and an opposite catastrophe for both good and bad."

In [131]:
# creating a data frame
# Wrap the single request in a one-row DataFrame, one column per field.
# Note that the name `input` shadows the Python builtin of the same name.
data = {
    'prompt_question': [prompt_question],
    'prompt_title': [prompt_title],
    'prompt_text': [prompt_txt],
    'summary': [summary],
}
input = pd.DataFrame(data)

In [132]:
# Run the text-cleaning pipeline over every column of the request frame.
preprocessed_input = preprocessing(input)

In [137]:
generated_summ = pd.read_csv('../artifacts/generated_summary.csv')

# Row of generated_summary.csv that holds the stored summary for each known
# (prompt_title, prompt_question) pair.
stored_summary_rows = {
    ('on tragedy', "summarize at least three elements of an ideal tragedy as described by aristotle"): 0,
    ('egyptian social structure', "in complete sentences summarize the structure of the ancient egyptian system of government how were different social classes involved in this government cite evidence from the text"): 1,
    ('the third wave', "summarize how the third wave developed over such a short period of time and why the experiment was ended"): 2,
    ('excerpt from the jungle', "summarize the various ways the factory would use or cover up spoiled meat cite evidence in your answer"): 3,
}

stored_row = stored_summary_rows.get((prompt_title, prompt_question))
if stored_row is None:
    # Unknown prompt: fall back to generating a summary with the model.
    input_with_sumary = generate_summary(preprocessed_input)
else:
    # Known prompt: reuse the stored summary instead of running the model.
    item_to_extract = generated_summ.at[stored_row, 'generated_summary']
    preprocessed_input['generated_summary'] = item_to_extract
    input_with_sumary = preprocessed_input

In [134]:
# Append the computed feature columns to the request frame.
feature_vals = calculate_features(input_with_sumary)

In [135]:
# Drop the text columns and the raw embedding array; the remaining feature
# columns are what gets passed to the model.
non_feature_columns = ['prompt_question','prompt_title','prompt_text','summary','generated_summary','word_embedding']
model_features = feature_vals.drop(columns=non_feature_columns)
content = get_content_score(model_features)

In [138]:
# content