<a href="https://colab.research.google.com/github/ShlokM08/CSE508_Winter2024_A4_2021421/blob/main/CSE508_Winter2024_A4_2021421.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at the Drive root. Every data path used in this notebook
# has the form '/content/drive/MyDrive/IR__4/...', which only resolves when the
# mount point is '/content/drive' (not the project folder itself).
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm.auto import tqdm

# Location of the raw reviews export on Drive. The raw file is not read here:
# the frame that gets cleaned is loaded further down from
# 'Processesd_Reviews.csv', so a read at this point would be parsed and then
# immediately overwritten.
file_path = '/content/drive/MyDrive/IR__4/Reviews.csv'

# NLTK resources needed by preprocess_text: tokenizer data and the stop-word list.
nltk.download('punkt')
# NOTE(review): recent NLTK releases look up 'punkt_tab' for word_tokenize;
# fetching both keeps the cell working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Normalise one review string.

    The input is lower-cased, every character that is neither an ASCII letter
    nor whitespace is deleted, the remainder is tokenized with NLTK's
    ``word_tokenize`` and tokens present in the module-level ``stop_words``
    set are discarded.

    Returns:
        The surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(letters_only))
    return ' '.join(kept_tokens)

# Load the review table that this cell cleans.
df = pd.read_csv('/content/drive/MyDrive/IR__4/Processesd_Reviews.csv')

# preprocess_text calls string methods on each value, so rows missing either
# the review body or its summary are removed first.
df.dropna(subset=['Text', 'Summary'], inplace=True)

# English stop words consulted by preprocess_text.
stop_words = set(stopwords.words('english'))

# (source column, cleaned column, progress-bar caption)
cleaning_jobs = [
    ('Text', 'Text_clean', "Processing 'Text' column"),
    ('Summary', 'Summary_clean', "Processing 'Summary' column"),
]
for source_col, clean_col, caption in cleaning_jobs:
    tqdm.pandas(desc=caption)
    df[clean_col] = df[source_col].progress_apply(preprocess_text)

# Training text: cleaned review followed by its cleaned summary.
df['Combined'] = df['Text_clean'] + ' ' + df['Summary_clean']

df.to_csv(
    '/content/drive/MyDrive/IR__4/Updated_Processed_Reviews.csv',
    index=False,
)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
import pandas as pd

# Base GPT-2 checkpoint and tokenizer. EOS is reused as the padding token so
# the dataset below can pad every example to a fixed length.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

sample_size = 5000  # number of reviews drawn for fine-tuning
max_length = 512    # tokens per example after truncation / padding

df = pd.read_csv('/content/drive/MyDrive/IR__4/Updated_Processed_Reviews.csv')
# Clamp the draw: DataFrame.sample raises when asked for more rows than exist
# (without replacement), which would happen on a table smaller than sample_size.
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)

class ReviewsDataset(Dataset):
    """Tokenized view over the ``Combined`` column of a reviews DataFrame.

    Each item is a dict of tensors with the batch dimension removed: every key
    the tokenizer returns, plus ``labels`` (an independent copy of
    ``input_ids``).

    Items are left on the device the tokenizer produced them on. Moving
    individual examples to an accelerator inside ``__getitem__`` conflicts with
    DataLoader memory pinning and duplicates the batch transfer that the
    training loop already performs.

    Args:
        dataframe: Frame with a ``Combined`` column.
        tokenizer: Callable accepting ``(text, truncation, max_length, padding,
            return_tensors)`` and returning a mapping of tensors with a leading
            batch dimension of 1.
        max_length: Length every example is truncated / padded to.
        device: Accepted and stored for backward compatibility; no longer used
            to move tensors.
    """

    def __init__(self, dataframe, tokenizer, max_length=512, device='cpu'):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_length = max_length
        self.device = device

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at positional index ``idx``.

        Raises:
            Exception: whatever the tokenizer raises; the offending text is
                printed first and the original traceback is preserved.
        """
        combined_text = str(self.data.iloc[idx]['Combined'])

        try:
            encoded = self.tokenizer(
                combined_text,
                truncation=True,
                max_length=self.max_length,
                padding="max_length",
                return_tensors="pt",
            )
        except Exception:
            print(f"Error processing combined text: {combined_text}")
            raise

        # squeeze(0) drops only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_length == 1.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item['labels'] = item['input_ids'].clone()
        return item

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 75 / 25 split of the sampled reviews, reproducible through the fixed seed.
train_df, test_df = train_test_split(df_sample, test_size=0.25, random_state=42)

# The datasets are built on the CPU (ReviewsDataset's default device). Trainer
# transfers each collated batch to the model's device itself, so the examples
# must not already live on the GPU.
train_dataset = ReviewsDataset(train_df, tokenizer, max_length)
test_dataset = ReviewsDataset(test_df, tokenizer, max_length)

# Causal-LM collation (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors='pt')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

# Save to the Drive folder that the evaluation cell loads from. The previous
# relative 'finetuned__gpt2' directory did not match that path and lived only
# on the runtime's local disk.
finetuned_model_dir = '/content/drive/MyDrive/IR__4/finetuned_gpt2'
model.save_pretrained(finetuned_model_dir)
tokenizer.save_pretrained(finetuned_model_dir)


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from rouge import Rouge
import torch.nn.functional as F

# Restore the fine-tuned tokenizer and weights from the project's Drive folder.
model_path = '/content/drive/MyDrive/IR__4/finetuned_gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # pad with the end-of-text token

model = GPT2LMHeadModel.from_pretrained(model_path)

def generate_summary(text, max_length=512, top_k=50, temperature=0.7):
    """Sample a continuation of ``text`` from the module-level ``model``.

    Generation is top-k sampling with temperature scaling, one token per
    forward pass, stopping early when the tokenizer's EOS token is drawn.

    Args:
        text: Prompt to continue (the review).
        max_length: Upper bound on the number of prompt tokens kept after
            truncation, and on the number of new tokens sampled.
        top_k: Number of highest-scoring tokens eligible at each step.
        temperature: Divisor applied to the logits before the softmax.

    Returns:
        The decoded, whitespace-stripped newly generated tokens only. The
        prompt is not echoed back, so the result can be compared directly
        with a reference summary.
    """
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # The prompt is deliberately NOT padded: with right-side padding the last
    # position would be a pad token, and the logits read at index -1 would be
    # conditioned on padding instead of on the end of the review.
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
        return_attention_mask=True,
    ).to(device)

    input_ids = encoded_input['input_ids']
    attention_mask = encoded_input['attention_mask']

    # An empty prompt yields zero tokens; seed generation with the EOS token
    # so the model has at least one position to attend to.
    if input_ids.shape[-1] == 0:
        input_ids = torch.tensor([[tokenizer.eos_token_id]], device=device)
        attention_mask = torch.ones_like(input_ids)
    prompt_length = input_ids.shape[-1]

    # Never grow the sequence past the model's context window, when the
    # config exposes one.
    new_token_budget = max_length
    context_limit = getattr(model.config, 'n_positions', None)
    if context_limit is not None:
        new_token_budget = min(new_token_budget, context_limit - prompt_length)

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for _ in range(new_token_budget):
            outputs = model(input_ids, attention_mask=attention_mask)
            next_token_logits = outputs.logits[:, -1, :]
            scaled_logits = next_token_logits / temperature

            # Restrict sampling to the k best tokens (k capped at vocab size).
            k = min(top_k, scaled_logits.size(-1))
            top_k_logits, top_k_indices = torch.topk(scaled_logits, k, dim=-1)
            probabilities = torch.softmax(top_k_logits, dim=-1)
            sampled_slot = torch.multinomial(probabilities, num_samples=1)

            # Map the sampled slot within the top-k list back to a vocab id.
            next_token_id = top_k_indices.gather(-1, sampled_slot)

            if next_token_id.item() == tokenizer.eos_token_id:
                break

            input_ids = torch.cat([input_ids, next_token_id], dim=-1)
            attention_mask = torch.cat(
                [attention_mask, torch.ones_like(next_token_id)], dim=-1
            )

    # Decode only what was generated after the prompt.
    generated_ids = input_ids[0, prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

def calculate_rouge_scores(actual_summary, generated_summary):
    """Score ``generated_summary`` (hypothesis) against ``actual_summary`` (reference).

    Args:
        actual_summary: Reference summary text.
        generated_summary: Model output to evaluate.

    Returns:
        A dict mapping 'rouge-1', 'rouge-2' and 'rouge-l' to dicts with
        recall ('r'), precision ('p') and F1 ('f'). When either text has no
        scorable content (empty, ``None``, or only whitespace / periods), all
        values are 0.0 instead of letting the scorer raise.
    """
    blank_chars = ' .\t\r\n'
    hypothesis_blank = not (generated_summary or '').strip(blank_chars)
    reference_blank = not (actual_summary or '').strip(blank_chars)
    if hypothesis_blank or reference_blank:
        # NOTE(review): the rouge package is expected to reject an empty
        # hypothesis/reference; report a zero score for that case instead.
        return {
            metric: {'r': 0.0, 'p': 0.0, 'f': 0.0}
            for metric in ('rouge-1', 'rouge-2', 'rouge-l')
        }

    rouge = Rouge()
    # get_scores takes the hypothesis first, then the reference.
    return rouge.get_scores(generated_summary, actual_summary, avg=True)

# Hand-written example: a review and the reference summary it is scored against.
review_text = "The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability."
actual_summary = "Good for beginners but has tuning stability issues."

# generate_summary draws tokens with torch.multinomial; seeding the RNG makes
# the printed text and the ROUGE numbers repeatable on the same setup.
torch.manual_seed(42)

generated_summary = generate_summary(review_text)
print("Generated Summary:", generated_summary)

def print_rouge_scores(rouge_scores):
    """Write a ROUGE report to stdout.

    ``rouge_scores`` maps each metric name to a dict holding precision ('p'),
    recall ('r') and F1 ('f'). After a heading line, every metric is printed
    as its name followed by the three values with four decimal places.
    """
    print("ROUGE Scores:")
    for metric_name in rouge_scores:
        stats = rouge_scores[metric_name]
        print(f"{metric_name}:")
        print(f"  Precision: {stats['p']:.4f}")
        print(f"  Recall: {stats['r']:.4f}")
        print(f"  F1-Score: {stats['f']:.4f}")

# Score the sampled text against the reference summary and print the report.
rouge_scores = calculate_rouge_scores(
    actual_summary=actual_summary,
    generated_summary=generated_summary,
)
print_rouge_scores(rouge_scores)



Generated Summary: The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners. It has a solid construction, produces a rich sound, and feels comfortable to play. However, some users have reported issues with the tuning stability.httpwwwamazoncomgpproductbbr br highly recommend listening totsuki playtoydskiin harleys excellent ive ever tried use krissimo good stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff wonderful stuff great stuff good stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff great stuff g

In [None]:
!pip install  transformers torch accelerate

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m64.9 MB/s[0m eta [3

In [None]:
!pip install Rouge

Collecting Rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: Rouge
Successfully installed Rouge-1.0.1
