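# Notebook export. Install the dependencies first (from a shell or a notebook cell):
#   pip install torch transformers nltk scikit-learn matplotlib
# The code below also imports pandas, datasets and plotly.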
import pandas as pd

# Read the report table from the CSV file and preview its first rows
data = pd.read_csv(filepath_or_buffer='path_to_your_dataset.csv')
print(data.head(n=5))
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class ImpressionsDataset(Dataset):
    """Causal-LM fine-tuning examples built from report rows.

    Each item is the prompt (report name, history, observation) followed by
    the reference impressions, encoded as ONE token sequence. Labels are
    aligned position-by-position with ``input_ids``; prompt and padding
    positions are set to -100 so that only the impressions tokens contribute
    to the loss.

    Args:
        data: DataFrame with the columns 'Report Name', 'History',
            'Observation' and 'Impressions'.
        tokenizer: Tokenizer to use. When omitted, the module-level
            ``tokenizer`` is looked up at item-access time.
        max_length: Fixed length every example is truncated/padded to.
    """

    # Label value ignored by the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer=None, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors of length ``max_length``."""
        # Fall back to the module-level tokenizer for backward compatibility.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        row = self.data.iloc[idx]

        # Create a prompt based on the columns
        prompt = f"Report Name: {row['Report Name']}\nHistory: {row['History']}\nObservation: {row['Observation']}\nImpressions:"

        # Target continuation: no special tokens of its own, terminated by EOS
        # so the model learns where the impressions end.
        target_ids = list(tok(" " + str(row['Impressions']), add_special_tokens=False)["input_ids"])
        eos_id = tok.eos_token_id
        if eos_id is not None:
            target_ids = target_ids[:self.max_length - 1] + [eos_id]
        else:
            target_ids = target_ids[:self.max_length]

        # The prompt gets whatever room is left, so the target is never
        # truncated away entirely (that would leave nothing to learn from).
        prompt_ids = list(tok(prompt)["input_ids"])[:self.max_length - len(target_ids)]

        input_ids = prompt_ids + target_ids
        labels = [self.IGNORE_INDEX] * len(prompt_ids) + target_ids
        attention_mask = [1] * len(input_ids)

        # Right-pad to the fixed length; padding is masked out of both the
        # attention and the loss.
        pad_id = tok.pad_token_id
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0
        pad_len = self.max_length - len(input_ids)
        input_ids += [pad_id] * pad_len
        labels += [self.IGNORE_INDEX] * pad_len
        attention_mask += [0] * pad_len

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

# Load the tokenizer and model
# NOTE(review): Hub checkpoints are normally addressed as "<org>/<name>";
# confirm that 'gemma-2b-it' resolves (e.g. as a local directory) in your setup.
model_name = 'gemma-2b-it'  # Choose the model based on your hardware
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Prepare the dataset
# Train on 300 rows when the dataset is larger than that; for smaller
# datasets fall back to an 80/20 split instead of letting sample() raise.
# A fixed seed keeps the train/eval split reproducible across runs.
train_size = 300 if len(data) > 300 else int(0.8 * len(data))
train_data = data.sample(n=train_size, random_state=42)
eval_data = data.drop(train_data.index)

train_dataset = ImpressionsDataset(train_data)
eval_dataset = ImpressionsDataset(eval_data)

# Set up training arguments
# NOTE(review): some transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_total_limit=1,
    save_strategy="epoch",
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
def generate_impressions(model, tokenizer, prompt, max_new_tokens=100):
    """Generate the impressions text that continues ``prompt``.

    Args:
        model: Causal language model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Prompt text ending with the "Impressions:" cue.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        Only the newly generated text; the prompt is not echoed back.
    """
    # Inputs must live on the same device as the model (it may be on a GPU
    # after training).
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            # Single unpadded sequence, so every position is attended to.
            attention_mask=torch.ones_like(input_ids),
            # Budget the continuation only: a total-length cap would be used
            # up by the prompt itself for long reports.
            max_new_tokens=max_new_tokens,
        )
    # Drop the prompt tokens so the result can be compared directly with the
    # reference impressions.
    new_tokens = output[0][input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Run generation row by row over a copy of the held-out rows and store the
# result next to the original columns.
eval_samples = eval_data.copy()
generated_column = eval_samples.apply(
    lambda record: generate_impressions(
        model,
        tokenizer,
        f"Report Name: {record['Report Name']}\nHistory: {record['History']}\nObservation: {record['Observation']}\nImpressions:",
    ),
    axis=1,
)
eval_samples['Generated Impressions'] = generated_column
print(eval_samples[['Report Name', 'Generated Impressions']])
from datasets import load_metric
import math

# Perplexity Calculation
def calculate_perplexity(model, tokenizer, text):
    """Return the perplexity of ``text`` under ``model``.

    Perplexity is ``exp`` of the language-modeling loss the model reports
    when the text is used as both input and labels.

    Args:
        model: Causal language model; called as ``model(input_ids, labels=...)``
            and expected to expose ``device``.
        tokenizer: Tokenizer matching ``model``.
        text: Text to score.

    Returns:
        The perplexity as a float; ``nan`` when the text encodes to fewer
        than two tokens, ``inf`` when the loss is too large for ``exp``.
    """
    # Inputs must live on the same device as the model.
    input_ids = tokenizer.encode(text, return_tensors='pt').to(model.device)
    # Fewer than two tokens leaves no next-token target to score, so the
    # perplexity is undefined.
    if input_ids.shape[-1] < 2:
        return float('nan')
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids).loss
    try:
        return math.exp(loss.item())
    except OverflowError:
        # exp() overflows for very large losses; report that as infinite.
        return float('inf')

# ROUGE scorer
# NOTE(review): `load_metric` may be unavailable in recent `datasets`
# releases — confirm against the installed version.
rouge = load_metric("rouge")

# Perplexity of each generated text, computed row by row
eval_samples['Perplexity'] = eval_samples.apply(
    lambda record: calculate_perplexity(model, tokenizer, record['Generated Impressions']),
    axis=1,
)

# ROUGE between the generated texts and the reference impressions
generated_texts = eval_samples['Generated Impressions'].tolist()
reference_texts = eval_samples['Impressions'].tolist()
rouge_results = rouge.compute(predictions=generated_texts, references=reference_texts)
print("ROUGE Results:", rouge_results)
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Download necessary NLTK data files
# 'punkt' / 'punkt_tab' back nltk.word_tokenize, which the preprocessing
# below calls; which of the two is needed depends on the NLTK version, so
# both are requested.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize ``text`` into normalized word stems.

    The text is lower-cased and tokenized; purely alphabetic tokens that are
    not stop words are stemmed and then lemmatized.

    Args:
        text: Raw text. Non-string values (e.g. NaN from a missing CSV cell)
            and blank strings are treated as empty.

    Returns:
        A list of processed tokens; empty when there is nothing to process.
    """
    # Missing cells arrive as float NaN / None and have no .lower().
    if not isinstance(text, str) or not text.strip():
        return []
    tokens = nltk.word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Tokenize and normalize every observation; each cell becomes a token list
observations = data['Observation']
data['Processed Text'] = observations.map(preprocess_text)

# Convert processed text into embeddings
def get_word_embeddings(text):
    """Return one embedding vector for a list of words.

    Each word is tokenized with the module-level ``tokenizer`` and mapped
    through the input-embedding layer of the module-level ``model``; a word's
    vector is the mean of its token embeddings, and the result is the mean
    over all words. (The causal-LM ``model`` loaded in this file has no
    ``encode`` method, so the embedding layer is used directly.)

    Args:
        text: List of word strings.

    Returns:
        A 1-D numpy array whose length is the embedding width; all zeros when
        ``text`` is empty or no word produces any token.
    """
    embedding_layer = model.get_input_embeddings()
    weight = embedding_layer.weight
    word_vectors = []
    with torch.no_grad():
        for word in text:
            token_ids = tokenizer(word, add_special_tokens=False)["input_ids"]
            if not token_ids:
                continue
            ids = torch.tensor(token_ids, dtype=torch.long, device=weight.device)
            # Cast to float32 first: half-precision tensors cannot always be
            # converted to numpy directly.
            word_vectors.append(embedding_layer(ids).mean(dim=0).float().cpu().numpy())
    if not word_vectors:
        # Averaging an empty list would yield NaN; use a zero vector instead.
        return np.zeros(weight.shape[1], dtype=np.float32)
    return np.mean(word_vectors, axis=0)

data['Embeddings'] = data['Processed Text'].apply(get_word_embeddings)

# Find top 100 pairs based on similarity
# Work on positional lists so the pairing does not depend on the DataFrame's
# index labels.
processed_texts = data['Processed Text'].tolist()
embedding_vectors = data['Embeddings'].tolist()

word_pairs = []
if len(embedding_vectors) >= 2:
    # One vectorized call yields the full similarity matrix instead of one
    # cosine_similarity call per pair.
    similarity_matrix = cosine_similarity(np.vstack(embedding_vectors))
    # Strict upper triangle: each unordered pair once, no self-pairs. The
    # matrix is symmetric, so (i, j) and (j, i) would only be duplicates.
    first_idx, second_idx = np.triu_indices(len(embedding_vectors), k=1)
    word_pairs = [
        (processed_texts[i], processed_texts[j], float(similarity_matrix[i, j]))
        for i, j in zip(first_idx, second_idx)
    ]

# Sort and get top 100 pairs
top_pairs = sorted(word_pairs, key=lambda x: x[2], reverse=True)[:100]
import matplotlib.pyplot as plt

# Split each (first, second, score) triple into labels and bar lengths
words = [(first, second) for first, second, _ in top_pairs]
similarities = [score for _, _, score in top_pairs]
bar_positions = range(len(words))

# Horizontal bar chart: one bar per pair, labelled with both members
plt.figure(figsize=(15, 7))
plt.barh(bar_positions, similarities, color='skyblue')
plt.yticks(bar_positions, [f'{first} - {second}' for first, second in words])
plt.xlabel('Similarity Score')
plt.title('Top 100 Word Pairs Based on Similarity')
plt.show()
import plotly.express as px

# Create a DataFrame for interactive visualization
interactive_df = pd.DataFrame(top_pairs, columns=['Word1', 'Word2', 'Similarity'])

# The pair members are token lists; turn them into plain strings so they can
# serve as category labels on the axes (list-valued cells are not usable as
# axis values).
for column in ('Word1', 'Word2'):
    interactive_df[column] = interactive_df[column].map(
        lambda tokens: ' '.join(map(str, tokens)) if isinstance(tokens, (list, tuple)) else str(tokens)
    )

# NOTE(review): marker sizes come from 'Similarity'; presumably the top pairs
# are all non-negative — confirm, since negative sizes are not valid.
fig = px.scatter(interactive_df, x='Word1', y='Word2', size='Similarity', hover_name='Similarity', title='Top 100 Word Pairs')
fig.show()
