In [17]:
# import important libraries 
import pandas as pd
import numpy as np
from langdetect import detect
from nltk.corpus import words
import re
import string
import inflect
import nltk
from nltk.stem import PorterStemmer
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from gensim.models import Word2Vec
import torch
from transformers import BertTokenizer, BertModel
from rouge import Rouge
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle

In [18]:
# Deserialize the trained model that get_content_score() calls .predict() on
# (presumably a linear-regression model, going by the file name).
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must only ever come from a trusted source.
with open('../static/model/LR_model.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [19]:
# Translation table that deletes every character in string.punctuation.
# Built once so each call is a single pass over the text instead of one
# str.replace pass per punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    all other characters (including whitespace) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [20]:
def replace_numbers(match):
    """Spell out the number captured by a regex match.

    Used as the replacement callback for ``re.sub`` in the preprocessing step.

    Args:
        match: Regex match object whose full matched text is the number.

    Returns:
        The result of ``inflect_instance.number_to_words`` for the matched text.
    """
    return inflect_instance.number_to_words(match.group())

In [21]:
def rouge_features(generated_summary, reference_summary):
    """Score one summary pair with ROUGE and return the two F-measures.

    Args:
        generated_summary: Text passed to ``Rouge.get_scores`` as the first argument.
        reference_summary: Text passed to ``Rouge.get_scores`` as the second argument.

    Returns:
        Tuple ``(rouge_1_f, rouge_2_f)`` taken from the first score entry.
    """
    first_pair = Rouge().get_scores(generated_summary, reference_summary)[0]
    rouge_1_f = first_pair['rouge-1']['f']
    rouge_2_f = first_pair['rouge-2']['f']
    return rouge_1_f, rouge_2_f

In [22]:
def sentence_level_metrics(summary):
    """Compute per-sentence token counts and TextBlob polarity for a summary.

    Sentences are obtained by splitting on the literal ``'. '``, so there is
    always at least one entry in each returned list.

    Args:
        summary: The summary text.

    Returns:
        Tuple ``(sentence_lengths, readability_scores)`` of two parallel lists.
        Note that the second list holds ``TextBlob(...).sentiment.polarity``
        values, i.e. sentiment polarity rather than a readability index.
    """
    token_counts = []
    polarities = []
    for chunk in summary.split('. '):
        token_counts.append(len(word_tokenize(chunk)))
        polarities.append(TextBlob(chunk).sentiment.polarity)
    return token_counts, polarities

In [23]:
# (tokenizer, model) pairs keyed by checkpoint name, so the weights are loaded
# once per kernel instead of on every call (this function runs once per row).
_BERT_CACHE = {}


def _load_bert(model_name):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def text_embeddings(summary):
    """Return BERT (``bert-base-uncased``) last-layer embeddings for each token of ``summary``.

    The text is tokenized with ``tokenizer.tokenize`` and converted with
    ``convert_tokens_to_ids``; the ids are fed to the model as a batch of one.

    Args:
        summary: The text to embed.

    Returns:
        ``numpy.ndarray`` taken from ``last_hidden_state`` with the batch axis
        removed, i.e. always 2-D with one row per token (also when the text
        yields a single token).
    """
    tokenizer, bert = _load_bert('bert-base-uncased')

    tokens = tokenizer.tokenize(summary)
    # The model only has a fixed number of position embeddings; longer token
    # sequences would index past that table, so cap the length.
    tokens = tokens[:bert.config.max_position_embeddings]
    input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokens)).unsqueeze(0)  # add batch dimension

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = bert(input_ids)

    # squeeze(0) drops only the batch axis. A bare squeeze() would also drop
    # the token axis for one-token inputs and hand callers a 1-D vector.
    return outputs.last_hidden_state.squeeze(0).numpy()


In [24]:
def content_based_features(generated_summary, reference_summary):
    """TF-IDF cosine similarity between two summaries.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, and the two
    resulting rows are compared with ``cosine_similarity``.

    Args:
        generated_summary: First text (row 0 of the TF-IDF matrix).
        reference_summary: Second text (row 1 of the TF-IDF matrix).

    Returns:
        The value returned by ``cosine_similarity`` for the two rows; callers
        read the score with ``[0][0]``.
    """
    documents = [generated_summary, reference_summary]
    vectors = TfidfVectorizer().fit_transform(documents)
    return cosine_similarity(vectors[0], vectors[1])

In [25]:
# Regular expression pattern to find numbers in the text
# (a run of digits with an optional ".digits" decimal part).
pattern = r'\d+(\.\d+)?'

# Shared helpers for the text-cleaning step: the number-to-words engine used
# by replace_numbers() and the stemmer used by preprocessing().
inflect_instance = inflect.engine()
ps = PorterStemmer()
# One-time download of the NLTK 'punkt' data, left disabled:
# nltk.download('punkt',download_dir='../static/punkt')

In [26]:
# Preprocessing
def _clean_text(text):
    """Normalise one cell: lowercase, drop URL tokens, strip punctuation, spell out numbers, stem."""
    tokens = [token.lower() for token in text.split()]
    # Each token is matched on its own, so '^' anchors at the start of every token.
    tokens = [re.sub(r'^https?:\/\/.*[\r\n]*', '', token, flags=re.MULTILINE) for token in tokens]
    cleaned = remove_punctuations(" ".join(tokens))
    # NOTE(review): punctuation (including '.') is removed before this step, so
    # the decimal part of `pattern` can no longer match a '.' here. The order
    # is kept as-is because the downstream model was presumably fitted on text
    # prepared this way - confirm before reordering.
    cleaned = re.sub(pattern, replace_numbers, cleaned)
    # split()/join also collapses the empty slots left behind by removed URLs.
    return " ".join(ps.stem(word) for word in cleaned.split())


def preprocessing(input):
    """Return a cleaned copy of ``input`` with every column text-normalised.

    Each cell goes through, in order: lowercasing, removal of tokens starting
    with ``http://``/``https://``, punctuation removal, digits-to-words
    replacement and Porter stemming.

    The caller's DataFrame is left untouched; the cleaned data is returned as
    a new DataFrame with the same columns and index.

    Args:
        input: DataFrame whose cells are all strings.

    Returns:
        A new DataFrame holding the cleaned text.
    """
    cleaned_frame = input.copy()
    for col in cleaned_frame.columns:
        cleaned_frame[col] = cleaned_frame[col].apply(_clean_text)
    return cleaned_frame

In [27]:
# Generate summary
# (tokenizer, model) pairs keyed by checkpoint name, so the Pegasus weights
# are loaded once per kernel instead of on every call.
_PEGASUS_CACHE = {}


def _load_pegasus(model_name):
    """Return the cached ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _PEGASUS_CACHE:
        _PEGASUS_CACHE[model_name] = (
            PegasusTokenizer.from_pretrained(model_name),
            PegasusForConditionalGeneration.from_pretrained(model_name),
        )
    return _PEGASUS_CACHE[model_name]


def generate_summary(input):
    """Generate a Pegasus summary for every row and return it as a new column.

    For each row the ``prompt_txt``, ``prompt_question`` and ``prompt_title``
    values are joined with single spaces (in that order), encoded with
    truncation at 1024 tokens, and passed to ``google/pegasus-large``.

    The caller's DataFrame is not modified.

    Args:
        input: DataFrame with ``prompt_txt``, ``prompt_question`` and
            ``prompt_title`` columns.

    Returns:
        A new DataFrame equal to ``input`` plus a ``generated_summary`` column.
    """
    tokenizer, pegasus = _load_pegasus("google/pegasus-large")

    generated_summaries = []
    for prompt_text, prompt_question, prompt_title in zip(
        input['prompt_txt'], input['prompt_question'], input['prompt_title']
    ):
        # Combine prompts
        input_text = f"{prompt_text} {prompt_question} {prompt_title}"

        # Tokenize input and generate summary
        input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = pegasus.generate(input_ids)
        generated_summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # assign() builds a new frame, so the input is left as the caller passed it.
    return input.assign(generated_summary=generated_summaries)

In [28]:
def calculate_features(input):
    """Compute the per-row features and return them joined onto ``input``.

    For every row, using the ``summary`` and ``generated_summary`` columns:

    * ``rouge_1_score`` / ``rouge_2_score``: from ``rouge_features``.
    * ``sentence_length`` / ``readability_score``: the FIRST entry of each list
      returned by ``sentence_level_metrics(summary)``.
    * ``word_embedding``: ``text_embeddings(summary)``.
    * ``content_sim``: ``content_based_features(...)[0][0]``.
    * ``cosine_similarity``: see the NOTE in the loop body.

    The feature frame is built on ``input``'s own index, so the column-wise
    concat lines up row for row whatever index the caller's frame carries.

    Args:
        input: DataFrame with ``summary`` and ``generated_summary`` columns.

    Returns:
        A new DataFrame: the original columns followed by the feature columns.
    """
    feature_columns = ['rouge_1_score','rouge_2_score', 'sentence_length', 'readability_score', 'word_embedding','content_sim', 'cosine_similarity']

    feature_rows = []
    for _, row in input.iterrows():
        reference_summary = row['summary']
        generated_summary = row['generated_summary']

        # Argument order kept as in the original calls: `summary` goes in the
        # first slot of rouge_features/content_based_features.
        rouge_score_1, rouge_score_2 = rouge_features(reference_summary, generated_summary)
        sentence_length, readability_score = sentence_level_metrics(reference_summary)
        embedding = text_embeddings(reference_summary)
        content_based_feature = content_based_features(reference_summary, generated_summary)

        # NOTE(review): both arguments are the same mean embedding, so this is
        # a vector's similarity with itself (the recorded run shows 1.0). It
        # looks like the second argument was meant to be the generated
        # summary's embedding. The pickled model consumes this column, so the
        # computation is deliberately left unchanged; fix it together with a
        # retrain.
        mean_embedding = embedding.mean(axis=0)
        cosine_sim = cosine_similarity([mean_embedding], [mean_embedding])[0][0]

        feature_rows.append((
            rouge_score_1,
            rouge_score_2,
            sentence_length[0],
            readability_score[0],
            embedding,
            content_based_feature[0][0],
            cosine_sim,
        ))

    # Reuse the input's index: with a default RangeIndex here, concat(axis=1)
    # would align on labels and produce NaN-padded rows for any input whose
    # index is not exactly 0..n-1.
    features_df = pd.DataFrame(feature_rows, columns=feature_columns, index=input.index)

    # Concatenate the original dataset and the calculated features
    final_dataset = pd.concat([input, features_df], axis=1)
    # Debug dump, kept so the notebook's recorded output stays the same.
    print(final_dataset)
    return final_dataset


In [29]:
def get_content_score(input):
    """Predict the content score(s) for a feature matrix.

    Args:
        input: Feature data handed straight to the module-level ``model``.

    Returns:
        Whatever ``model.predict(input)`` returns.
    """
    return model.predict(input)

In [30]:
# Placeholder inputs used to smoke-test the pipeline end to end.
prompt_question = "aaaaa"
prompt_title = "bbbbb"
prompt_txt = "ccccc"
summary = "dddd"

In [31]:
# Wrap the four text fields in a single-row DataFrame (one list entry per column).
data = {
    'prompt_question': [prompt_question],
    'prompt_title': [prompt_title],
    'prompt_txt': [prompt_txt],
    'summary': [summary],
}
# NOTE(review): the name `input` shadows the built-in input(); it is kept
# because the pipeline cell below refers to it.
input = pd.DataFrame(data)

In [35]:
# Columns that are raw inputs or intermediates; everything else is passed to the model.
non_feature_columns = ['prompt_question','prompt_title','prompt_txt','summary','generated_summary','word_embedding']

# End-to-end run: clean text -> generate summary -> compute features -> predict score.
preprocessed_input = preprocessing(input)
input_with_sumary = generate_summary(preprocessed_input)
feature_vals = calculate_features(input_with_sumary)
content = get_content_score(feature_vals.drop(columns=non_feature_columns))
content

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  prompt_question prompt_title prompt_txt summary  generated_summary  \
0           aaaaa        bbbbb      ccccc    dddd  ccccc aaaaa bbbbb   

   rouge_1_score  rouge_2_score  sentence_length  readability_score  \
0            0.0            0.0                1                0.0   

                                      word_embedding  content_sim  \
0  [[0.32850066, 0.1855365, -0.14470264, -0.97448...          0.0   

   cosine_similarity  
0                1.0  


array([-3.6247952])

In [36]:
# input