In [1]:
%pwd

'd:\\Pavan\\ScriboSense\\notebooks\\Newer Versions'

In [2]:
%pip install rouge transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import nltk
import numpy as np
import pandas as pd
import matplotlib as plt
import inflect
import re
from nltk.stem import PorterStemmer
import string
from rouge import Rouge
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BertTokenizer, BertModel
import pickle
import sentencepiece
from gensim.models import Word2Vec

In [None]:
# with open('../static/model/LR_model.pickle', 'rb') as f:
#     model = pickle.load(f)

In [3]:
# load datasetss
prompts_train = pd.read_csv('../../artifacts/prompts_train.csv')
summaries_train = pd.read_csv('../../artifacts/summaries_train.csv')
prompts_test = pd.read_csv('../../artifacts/prompts_test.csv')
summaries_test = pd.read_csv('../../artifacts/summaries_test.csv')
sample_submission = pd.read_csv('../../artifacts/sample_submission.csv')

In [4]:
# Preprocessing
# --------------------------------------------------
# Regular expression pattern to find numbers in the text
pattern = r'\d+(\.\d+)?'
ps = PorterStemmer()
inflect_instance = inflect.engine()
def replace_numbers(match):
    num = match.group()
    num_word = inflect_instance.number_to_words(num)
    return num_word
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text
# ----------------------------------------------------

def preprocessing(input):

    cols = input.columns
    for col in cols:
        input[col] = input[col].apply(lambda x: " ".join(x.lower() for x in x.split()))
        input[col] = input[col].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))
        input[col] = input[col].apply(remove_punctuations)
        input[col] = input[col].apply(lambda x: re.sub(pattern, replace_numbers, x))
        input[col] = input[col].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))
    return input

In [5]:
merged_train_ds = pd.merge(
    prompts_train,
    summaries_train,
    how="inner",
    on='prompt_id',
    sort=False,
    validate="one_to_many",
)

In [6]:
merged_train_ds = merged_train_ds.drop(columns=['prompt_id','student_id'])

In [7]:
preprocessed_texts = preprocessing(merged_train_ds.drop(columns=['content','wording']))

In [9]:
cols_to_convert=['prompt_question','prompt_text','text']
for column in cols_to_convert:
    merged_train_ds[column] = preprocessed_texts[column]

In [13]:
# calculating metrics
#----------------------------------------------
def rouge_features(generated_summary, reference_summary):
    rouge = Rouge()
    scores = rouge.get_scores(generated_summary, reference_summary)
    return scores[0]['rouge-1']['f'], scores[0]['rouge-2']['f']
def sentence_level_metrics(summary):
    # Compute sentence-level metrics
    sentences = summary.split('. ')  # Split into sentences
    sentence_lengths = [len(word_tokenize(sentence)) for sentence in sentences]
    readability_scores = [TextBlob(sentence).sentiment.polarity for sentence in sentences]
    return sentence_lengths, readability_scores
def text_embeddings(summary):
    # Load pre-trained BERT model and tokenizer
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Tokenize the summary text
    tokens = tokenizer.tokenize(summary)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension

    # Generate BERT-based embeddings
    with torch.no_grad():
        outputs = model(input_ids)
        word_embeddings = outputs.last_hidden_state.squeeze().numpy()

    return word_embeddings
def content_based_features(generated_summary, reference_summary):
    # Compute TF-IDF cosine similarity
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([generated_summary, reference_summary])
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return cosine_sim
#----------------------------------------------

def calculate_features(input):
    # Calculate features for each summary

    # Initialize lists to store feature values
    rouge_1_score = []
    rouge_2_score = []
    sentence_lengths = []
    readability_scores = []
    # word_embeddings = []
    content_sim =[]
    cosine_similarities = []

    # Iterate through each row in the feature_dataset
    for index, row in input.iterrows():
        reference_summary = row['text']
        generated_summary = row['generated_summary']
        prompt_txt = row['prompt_text']

        # Calculate features using your existing functions
        rouge_score_1,rouge_score_2 = rouge_features(reference_summary, generated_summary)
        sentence_length, readability_score = sentence_level_metrics(reference_summary)
        # embedding = text_embeddings(reference_summary)
        content_based_feature = content_based_features(reference_summary, generated_summary)

        # Calculate cosine similarity between embeddings
        # cosine_sim = cosine_similarity([embedding.mean(axis=0)], [embedding.mean(axis=0)])[0][0]

        # Append feature values to lists
        rouge_1_score.append(rouge_score_1)
        rouge_2_score.append(rouge_score_2)
        sentence_lengths.append(sentence_length[0])
        readability_scores.append(readability_score[0])
        # word_embeddings.append(embedding)
        # cosine_similarities.append(cosine_sim)
        content_sim.append(content_based_feature[0][0])

    # Create a new DataFrame with calculated features
    feature_columns = ['rouge_1_score','rouge_2_score', 'sentence_length', 'readability_score','content_sim']
    features_df = pd.DataFrame(zip(rouge_1_score,rouge_2_score, sentence_lengths, readability_scores, content_sim), columns=feature_columns)

    # Concatenate the original dataset and the calculated features
    input.reset_index(drop=True,inplace=True)
    features_df.reset_index(drop=True,inplace=True)
    final_dataset = pd.concat([input, features_df], axis=1)
    return final_dataset

In [11]:
# Generate summary
def generate_summary(input):
    # Load pre-trained model and tokenizer
    model_name = "sshleifer/distilbart-cnn-12-6"
    tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
    model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")

    generated_summaries = []

    # Loop through each row in the dataset
    for index, row in input.iterrows():
        prompt_text = row['prompt_text']
        prompt_title = row['prompt_title']

        # Combine prompts
        input_text = f"{prompt_text} {prompt_title}"

        # Tokenize input and generate summary
        input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = model.generate(input_ids)
        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Append the generated summary to the list
        generated_summaries.append(generated_summary)

    # Add the generated summaries to the dataset as a new column
    input['generated_summary'] = generated_summaries
    return input

In [None]:
# # predict the content score
# def get_content_score(input):
#     content = model.predict(input)
#     return content[0]

In [13]:
nltk.download('punkt',download_dir='../static/punkt')

[nltk_data] Downloading package punkt to ../static/punkt...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from sklearn.utils import shuffle
merged_train_ds = shuffle(merged_train_ds)
merged_train_ds

Unnamed: 0,prompt_question,prompt_title,prompt_text,text,content,wording
5564,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",paragraph two states that the spiled meat was ...,-0.548304,0.506755
6810,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",In line 14 it says that the spoiled meat would...,-1.547163,-1.461245
5421,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",The factory could alter the meat in their chem...,-0.670740,-0.540354
2007,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,A tragedy should include a downful of a man wh...,-0.002466,-0.045439
4218,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,The third wave started because people are very...,-0.173157,-0.128520
...,...,...,...,...,...,...
4297,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,"The Third Wave developed because the actions, ...",0.782609,1.424724
1037,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,"According to Aristotle, the first element of a...",3.013642,1.710313
5571,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",The Factory would try to cover up or flat out ...,2.141224,1.123777
6761,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",when the meat was actually spoiled they would ...,-0.093814,0.503833


In [9]:
merged_train_ds.head(10)

Unnamed: 0,prompt_question,prompt_title,prompt_text,text,content,wording
5564,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",paragraph two states that the spiled meat was ...,-0.548304,0.506755
6810,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",In line 14 it says that the spoiled meat would...,-1.547163,-1.461245
5421,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",The factory could alter the meat in their chem...,-0.67074,-0.540354
2007,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,A tragedy should include a downful of a man wh...,-0.002466,-0.045439
4218,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,The third wave started because people are very...,-0.173157,-0.12852
5080,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,People wanted to join the movement because it ...,-0.548304,0.506755
3455,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,The egyptian government was controlled ...,2.343624,0.322298
361,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,The whole plot will have the downfall of a cha...,-0.974242,-0.751414
5158,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,The third wave was able to develop so quickly ...,2.380461,3.146134
3602,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,Every body in ancient egypt had a job. The pha...,-0.002466,-0.045439


In [14]:
batch_size = 150
batch_ds_list = []
for i in range(0, len(merged_train_ds.head(3)), batch_size):
    batch = merged_train_ds[i:i+batch_size]
    train_ds_with_GenSummaries = generate_summary(batch)
    result = calculate_features(train_ds_with_GenSummaries)
    batch_ds_list.append(result)

# Concatenate the returned datasets
# final_dataset = pd.concat(batch_ds_list, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input['generated_summary'] = generated_summaries
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input['generated_summary'] = generated_summaries
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input['generated_summary'] = generated_summaries


In [15]:
# Concatenate the returned datasets
final_dataset = pd.concat(batch_ds_list, ignore_index=True)

In [17]:
batch_ds_list[0]

Unnamed: 0,prompt_question,prompt_title,prompt_text,text,content,wording,generated_summary,rouge_1_score,rouge_2_score,sentence_length,readability_score,content_sim
0,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",paragraph two states that the spiled meat was ...,-0.548304,0.506755,With one member trimming beef in a cannery an...,0.133333,0.021739,20,-0.15,0.31611


In [18]:
final_dataset

Unnamed: 0,prompt_question,prompt_title,prompt_text,text,content,wording,generated_summary,rouge_1_score,rouge_2_score,sentence_length,readability_score,content_sim
0,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",paragraph two states that the spiled meat was ...,-0.548304,0.506755,With one member trimming beef in a cannery an...,0.133333,0.021739,20,-0.15,0.31611
1,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",In line 14 it says that the spoiled meat would...,-1.547163,-1.461245,With one member trimming beef in a cannery an...,0.061538,0.0,26,0.0,0.224873
2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",The factory could alter the meat in their chem...,-0.67074,-0.540354,With one member trimming beef in a cannery an...,0.179104,0.047059,39,-0.125,0.43178


In [94]:
final_dataset.to_csv('D:/Project/ScriboSense/artifacts/feature_scores_from_6950.csv',index=False)

In [96]:
# score_tabel=pd.read_csv('D:/Project/ScriboSense/artifacts/feature_scores_to_6950.csv')
# score_tabel

In [None]:
# train_ds_with_GenSummaries = generate_summary(merged_train_ds)
# feature_ds = calculate_features(train_ds_with_GenSummaries)
# content_score = get_content_score(feature_ds)

In [97]:
# train the dataset
model_dataset = final_dataset.drop(columns=['text','generated_summary','prompt_text'])

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

features = model_dataset[['rouge_1_score', 'rouge_2_score', 'sentence_length','readability_score','content_sim']]
target = model_dataset['content']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the testing data
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 0.34


In [104]:
# save the model
import pickle

with open('D:\Project/ScriboSense/static/model/v2.pickle', 'wb') as file:
    pickle.dump(model, file)

## Hyper parameter tuning

In [None]:
# # get the summary-content dataset
# referenceSummary_content = merged_train_ds.drop(columns=['prompt_question','prompt_text','wording'])
# promptText_content = merged_train_ds.drop(columns=['prompt_question','text','wording'])

In [None]:
# # tokenizing
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# def tokenize(dataset):
#     return tokenizer(dataset['text'], padding="max_length", truncation=True)
# def prompt_tokenize(dataset):
#     return tokenizer(dataset['prompt_text'], padding="max_length", truncation=True)


# merged_train_ds['tokenized_text'] = referenceSummary_content.apply(tokenize,axis=1)
# merged_train_ds['tokenized_prompt_text'] = promptText_content.apply(prompt_tokenize,axis=1)

  from .autonotebook import tqdm as notebook_tqdm
