In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from rouge_score import rouge_scorer

In [2]:
game_descs = pd.read_csv('game_descs.csv')
game_reviews = pd.read_csv('testing.csv')

model_path = "./game_review_summarizer_final"


In [3]:
# Group reviews by GameId
grouped_reviews = game_reviews.groupby('GameId')

# Create a dataframe with one row per game
games_with_reviews = []
for game_id, group in grouped_reviews:
    # Only include games that have descriptions
    if game_id in game_descs['GameId'].values:
        # Get game description
        game_desc = game_descs[game_descs['GameId'] == game_id]['Long_Desc'].values[0]
        
        # Join all preprocessed reviews for this game
        # Handle both string representation of lists and actual lists
        all_reviews = []
        for review in group['Review']:
            if isinstance(review, str) and review.startswith('['):
                # Convert string representation of list to actual list
                review_tokens = eval(review)
            else:
                review_tokens = review
                
            if isinstance(review_tokens, list):
                all_reviews.append(" ".join(review_tokens))
            else:
                all_reviews.append(review_tokens)
                
        combined_reviews = " ".join(all_reviews)
        
        # Some summary statistics about the reviews
        positive_count = group['Positive_Review'].sum()
        total_count = len(group)
        
        games_with_reviews.append({
            'GameId': game_id,
            'combined_reviews': combined_reviews,
            'review_count': total_count,
            'positive_percent': (positive_count / total_count) * 100,
            'target_summary': game_desc  # Using game description as target summary
        })

games_df = pd.DataFrame(games_with_reviews)
print(f"Created dataset with {len(games_df)} unique games")

Created dataset with 9 unique games


In [4]:

# Split into train/val/test sets
train_df, temp_df = train_test_split(games_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Training set: {len(train_df)} games")
print(f"Validation set: {len(val_df)} games")
print(f"Test set: {len(test_df)} games")

Training set: 7 games
Validation set: 1 games
Test set: 1 games


In [5]:

# Choose a suitable pre-trained model for summarization
model_name = "facebook/bart-large-cnn"  # Good for summarization
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [6]:


class GameReviewDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=1024, summary_max_length=256):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attention_mask = []
        self.labels = []
        
        for _, row in df.iterrows():
            # Prepare input: the combined reviews
            inputs = tokenizer(row['combined_reviews'], 
                              max_length=max_length, 
                              padding='max_length', 
                              truncation=True, 
                              return_tensors="pt")
            
            # Prepare target: the game description as summary
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(row['target_summary'], 
                                  max_length=summary_max_length, 
                                  padding='max_length', 
                                  truncation=True, 
                                  return_tensors="pt")
            
            self.input_ids.append(inputs.input_ids.flatten())
            self.attention_mask.append(inputs.attention_mask.flatten())
            self.labels.append(labels.input_ids.flatten())
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

# Create datasets
train_dataset = GameReviewDataset(train_df, tokenizer)
val_dataset = GameReviewDataset(val_df, tokenizer)



In [7]:


# Define metric computation
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    
    # Decode generated tokens to text
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    
    # Replace padding token id with tokenizer pad token id for decoding
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    
    # Calculate ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    rouge1 = rouge2 = rougeL = 0.0
    for p, l in zip(pred_str, label_str):
        scores = scorer.score(p, l)
        rouge1 += scores['rouge1'].fmeasure
        rouge2 += scores['rouge2'].fmeasure
        rougeL += scores['rougeL'].fmeasure
    
    rouge1 /= len(pred_str)
    rouge2 /= len(pred_str)
    rougeL /= len(pred_str)
    
    return {
        'rouge1': rouge1,
        'rouge2': rouge2, 
        'rougeL': rougeL
    }

# Set up training arguments
# Set up training arguments with matching evaluation and save strategies
training_args = Seq2SeqTrainingArguments(
    output_dir="./game_review_summarizer",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Change this to match evaluation_strategy
    learning_rate=3e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    generation_max_length=150,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
)

# Initialize trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [8]:
# Train the model
trainer.train()

# Save the model
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

  0%|          | 0/16 [00:00<?, ?it/s]

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.834972858428955, 'eval_rouge1': 0.23841059602649006, 'eval_rouge2': 0.013333333333333332, 'eval_rougeL': 0.11258278145695363, 'eval_runtime': 2.9676, 'eval_samples_per_second': 0.337, 'eval_steps_per_second': 0.337, 'epoch': 1.0}




  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.8102211952209473, 'eval_rouge1': 0.2839506172839506, 'eval_rouge2': 0.04347826086956522, 'eval_rougeL': 0.10493827160493827, 'eval_runtime': 2.8351, 'eval_samples_per_second': 0.353, 'eval_steps_per_second': 0.353, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.8682940006256104, 'eval_rouge1': 0.27863777089783287, 'eval_rouge2': 0.037383177570093455, 'eval_rougeL': 0.10526315789473684, 'eval_runtime': 2.8327, 'eval_samples_per_second': 0.353, 'eval_steps_per_second': 0.353, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 3.9007396697998047, 'eval_rouge1': 0.26875, 'eval_rouge2': 0.025157232704402517, 'eval_rougeL': 0.09375, 'eval_runtime': 3.3817, 'eval_samples_per_second': 0.296, 'eval_steps_per_second': 0.296, 'epoch': 4.0}


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 114.3772, 'train_samples_per_second': 0.245, 'train_steps_per_second': 0.14, 'train_loss': 3.5011701583862305, 'epoch': 4.0}


('./game_review_summarizer_final\\tokenizer_config.json',
 './game_review_summarizer_final\\special_tokens_map.json',
 './game_review_summarizer_final\\vocab.json',
 './game_review_summarizer_final\\merges.txt',
 './game_review_summarizer_final\\added_tokens.json',
 './game_review_summarizer_final\\tokenizer.json')

In [17]:
def generate_summary_for_specific_game(game_id, dataset_df=None):
    """
    Generate a summary for a specific game ID from any dataset.
    If dataset_df is provided, it will look for the game there;
    otherwise it will search in all datasets (train, val, test).
    """
    # Determine which dataset contains the game
    source = "specified dataset"
    if dataset_df is not None:
        if game_id not in dataset_df['GameId'].values:
            return "Game ID not found in the specified dataset"
        game_data = dataset_df[dataset_df['GameId'] == game_id].iloc[0]
    else:
        # Look in train_df, val_df, and test_df
        if game_id in train_df['GameId'].values:
            game_data = train_df[train_df['GameId'] == game_id].iloc[0]
            source = "training set"
        elif game_id in val_df['GameId'].values:
            game_data = val_df[val_df['GameId'] == game_id].iloc[0]
            source = "validation set"
        elif game_id in test_df['GameId'].values:
            game_data = test_df[test_df['GameId'] == game_id].iloc[0]
            source = "test set"
        else:
            return "Game ID not found in any dataset"
    
    # Make sure model is in evaluation mode
    model.eval()
    
    # Tokenize the combined reviews
    inputs = tokenizer(game_data['combined_reviews'], 
                      return_tensors="pt", 
                      max_length=1024, 
                      truncation=True)
    
    # Send input to device
    device = next(model.parameters()).device  # Get the device model is on
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    
    # Generate summary
    with torch.no_grad():  # No need to track gradients for inference
        summary_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            num_beams=4,
            min_length=30,
            max_length=150,
            early_stopping=True
        )
    
    # Decode summary
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    # Get original reviews from game_reviews DataFrame
    original_reviews = game_reviews[game_reviews['GameId'] == game_id]
    
    # Sample a few reviews to display
    sampled_reviews = original_reviews.sample(min(3, len(original_reviews)))
    
    # Process review text safely
    sample_review_texts = []
    for _, review in sampled_reviews.iterrows():
        if 'Manual_Review' in review and isinstance(review['Manual_Review'], str):
            sample_review_texts.append(review['Manual_Review'])
        elif 'Review' in review:
            if isinstance(review['Review'], list):
                sample_review_texts.append(" ".join(review['Review']))
            elif isinstance(review['Review'], str):
                # Handle string formats but avoid eval() since it's causing issues
                if review['Review'].startswith('[') and ']' in review['Review']:
                    try:
                        # Safer approach using string manipulation instead of eval
                        review_text = review['Review'].strip('[]').replace("'", "").replace('"', "")
                        words = [word.strip() for word in review_text.split(',')]
                        sample_review_texts.append(" ".join(words))
                    except:
                        sample_review_texts.append(review['Review'])
                else:
                    sample_review_texts.append(review['Review'])
    
    return generated_summary

In [None]:
# ['1302240', '2215430', '1142710', '294100', '646570', '1172620', '413150', '2138710']

In [21]:
train_df.iloc[6]['combined_reviews']



In [25]:
generate_summary_for_specific_game(1142710)

'Total War: Total War: Shogun III is the most complete Total War game ever made. Players will be able to play as Lohkir Fellheart, one of the most powerful warlords in the history of the world. The campaign campaign is where the most intresting things can happen with green middigets with sticks killing tanks and demonds. Players can choose between a single-player campaign or two-player multiplayer where one controls the army and the other the cav  or mages. Players have the option of buying the first and second games for the same price as the base game. The first game and second game will give you loads of extra content to play with. You can also buy extra Lords and'

## Next!
we got the model to create a summary but theyre hard to understand and not well written, also they're from a first person perspective. The model also tends to get the game wrong somehow. I didn't check the game descriptions so i'll look into those and try to better understand the correlation between the review and target(currenntly game desc)