In [None]:
!wget https://raw.githubusercontent.com/Tamis55/VTP_gala_darbs_atsaukmju_klasific-ana_tt22019/refs/heads/main/reviews_with_genre.csv
!wget https://raw.githubusercontent.com/Tamis55/VTP_gala_darbs_atsaukmju_klasific-ana_tt22019/refs/heads/main/genre_classifier.pkl
!wget https://raw.githubusercontent.com/Tamis55/VTP_gala_darbs_atsaukmju_klasific-ana_tt22019/refs/heads/main/tfidf_vectorizer.pkl

In [135]:
import pandas as pd
import re
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import multiprocessing

In [136]:
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [137]:
from collections import defaultdict

GENRE_GROUPS = {
    "Action": [
        "Shooter", "First-Person", "Third-Person", "Platformer", "Horror",
        "Arcade", "Scrolling", "2D", "Military", "Modern"
    ],
    "Role-Playing": [
        "PC-style RPG", "Console-style RPG", "Massively Multiplayer Online",
        "Action RPG", "Japanese-Style", "Traditional", "Card Battle",
        "Massively Multiplayer", "MMI", "MMII", "Turn-Based", "Fantasy"
    ],
    "Sports": [
        "Fighting", "Ice Hockey", "GI / Street", "Dancing",
        "Alternative", "Street"
    ],
    "Racing": ["Driving", "Racing"],
    "Simulation": [
        "Breeding/Constructing", "City Building", "Tycoon", "Government",
        "Educationent", "Real-Time", "Sim", "Flight"
    ],
    "Adventure": ["Puzzle", "Historic"],
    "Strategy": ["General", "No info", "Compilation", "Other"],
    "Miscellaneous": ["Party", "Music", "Tycoon", "Card Battle", "Dancing"]
}

In [138]:
REVERSE_MAP = {}
BROAD_GENRES = set(GENRE_GROUPS.keys())
for broad, subs in GENRE_GROUPS.items():
    for sub in subs:
        REVERSE_MAP[sub.lower()] = broad

def map_to_broad_genre(genre):
    """Map detailed genre to broad category"""
    if not isinstance(genre, str):
        return "Miscellaneous"

    genre_lower = genre.lower().strip()


    if genre_lower in (g.lower() for g in BROAD_GENRES):
        return genre

    if genre_lower in REVERSE_MAP:
        return REVERSE_MAP[genre_lower]


    return "Miscellaneous"

In [139]:
def normalize_text(text):
    if not isinstance(text, str) or text.strip() == "":
        return ""

    try:

        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())


        tokens = [word for word in text.split()
                 if len(word) > 2 and word not in stop_words]


        tokens = [lemmatizer.lemmatize(word) for word in tokens]

        return " ".join(tokens)
    except Exception as e:
        print(f"Error processing text: {text[:50]}... Error: {str(e)}")
        return ""

In [140]:
def get_top_predictions(model, vectorizer, text, top_n=3):
    clean_text = normalize_text(text)
    if not clean_text:
        return []

    vec = vectorizer.transform([clean_text])

    probabilities = model.predict_proba(vec)[0]
    sorted_indices = np.argsort(probabilities)[::-1]

    top_predictions = [
        (model.classes_[idx], probabilities[idx])
        for idx in sorted_indices[:top_n]
    ]

    return top_predictions

In [141]:
INPUT_FILE = 'reviews_with_genre.csv'
TOTAL_SAMPLE = 320000
TRAIN_SIZE = 256000
TEST_SIZE = 64000
CHUNK_SIZE = 25000
MAX_FEATURES = 35000

In [142]:
print("Counting total rows...")
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    total_rows = sum(1 for _ in f) - 1

print(f"Total rows: {total_rows:,}")

Counting total rows...
Total rows: 321,248


In [143]:
np.random.seed(42)
sample_indices = set(np.random.choice(total_rows, TOTAL_SAMPLE, replace=False))


In [144]:
print("\nCollecting sample data...")
chunks = pd.read_csv(INPUT_FILE, chunksize=CHUNK_SIZE, usecols=['Review', 'genre'])
sample_data = []
processed_chunks = 0

for i, chunk in enumerate(chunks):

    if i == 0:
        chunk = chunk.iloc[1:]


    start_idx = i * CHUNK_SIZE
    end_idx = start_idx + len(chunk)
    chunk_indices = range(start_idx, end_idx)


    mask = [idx in sample_indices for idx in chunk_indices]

    selected_chunk = chunk[mask].copy()
    if not selected_chunk.empty:
        sample_data.append(selected_chunk)
        processed_chunks += 1
        print(f"Chunk {i+1}: Collected {len(selected_chunk)} rows")

    if len(sample_data) > 0 and sum(len(df) for df in sample_data) >= TOTAL_SAMPLE:
        print("Reached sample size target")
        break



Collecting sample data...
Chunk 1: Collected 24888 rows
Chunk 2: Collected 24900 rows
Chunk 3: Collected 24907 rows
Chunk 4: Collected 24913 rows
Chunk 5: Collected 24896 rows
Chunk 6: Collected 24880 rows
Chunk 7: Collected 24913 rows
Chunk 8: Collected 24898 rows
Chunk 9: Collected 24897 rows
Chunk 10: Collected 24922 rows
Chunk 11: Collected 24903 rows
Chunk 12: Collected 24918 rows
Chunk 13: Collected 21164 rows


In [145]:
df_sample = pd.concat(sample_data).sample(frac=1, random_state=42)[:TOTAL_SAMPLE]
print(f"\nFinal sample size: {len(df_sample):,}")



Final sample size: 319,999


In [146]:
df_sample['primary_genre'] = df_sample['genre'].str.split(',').str[0].str.strip()
df_sample = df_sample.dropna(subset=['primary_genre'])
df_sample = df_sample[df_sample['primary_genre'] != '']
df_sample['primary_genre'] = df_sample['primary_genre'].astype(str)
df_sample['broad_genre'] = df_sample['primary_genre'].apply(map_to_broad_genre)

In [147]:
df_sample = df_sample.dropna(subset=['broad_genre'])
df_sample = df_sample[df_sample['broad_genre'] != '']
df_sample['broad_genre'] = df_sample['broad_genre'].astype(str)

In [148]:
print("\nNormalizing text...")
df_sample['clean_review'] = df_sample['Review'].apply(normalize_text)
df_sample = df_sample[df_sample['clean_review'].str.len() > 0]
df_sample['clean_review'] = df_sample['clean_review'].astype(str)


Normalizing text...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['clean_review'] = df_sample['clean_review'].astype(str)


In [149]:
df_sample = df_sample.iloc[:TOTAL_SAMPLE]
print(f"\nFinal cleaned sample size: {len(df_sample):,}")
print("Genre distribution:\n", df_sample['primary_genre'].value_counts(normalize=True))


Final cleaned sample size: 270,169
Genre distribution:
 primary_genre
Action                          0.330993
Role-Playing                    0.122386
Action Adventure                0.121183
Strategy                        0.089111
Sports                          0.073798
Miscellaneous                   0.067698
Adventure                       0.048925
Driving                         0.047356
Simulation                      0.032024
General                         0.014794
Racing                          0.011012
Puzzle                          0.006936
Sci-Fi                          0.005071
Fantasy                         0.004545
Action RPG                      0.003753
Modern                          0.003457
Shooter                         0.003335
Horror                          0.001932
Historic                        0.001451
Platformer                      0.001107
PC-style RPG                    0.000962
GT / Street                     0.000840
First-Person               

In [150]:

print("\nOriginal genre distribution:")
print(df_sample['primary_genre'].value_counts(normalize=True).head(10))


Original genre distribution:
primary_genre
Action              0.330993
Role-Playing        0.122386
Action Adventure    0.121183
Strategy            0.089111
Sports              0.073798
Miscellaneous       0.067698
Adventure           0.048925
Driving             0.047356
Simulation          0.032024
General             0.014794
Name: proportion, dtype: float64


In [151]:
print("\nBroad genre distribution:")
broad_dist = df_sample['broad_genre'].value_counts(normalize=True)
print(broad_dist)
print(f"\nNumber of classes: {len(broad_dist)}")


Broad genre distribution:
broad_genre
Action           0.342726
Miscellaneous    0.196488
Role-Playing     0.133653
Strategy         0.104642
Sports           0.074161
Racing           0.058367
Adventure        0.057312
Simulation       0.032650
Name: proportion, dtype: float64

Number of classes: 8


In [152]:

df_train = df_sample.iloc[:TRAIN_SIZE]
df_test = df_sample.iloc[TRAIN_SIZE:TRAIN_SIZE+TEST_SIZE]


X_train = df_train['clean_review']
y_train = df_train['broad_genre']
X_test = df_test['clean_review']
y_test = df_test['broad_genre']

In [153]:
print("\nVectorizing text...")
vectorizer = TfidfVectorizer(
    max_features=MAX_FEATURES,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.7
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


Vectorizing text...


In [154]:
print("Training model...")
model = MultinomialNB()
model.fit(X_train_vec, y_train)


Training model...


In [155]:
print("\nEvaluating model...")
y_pred = model.predict(X_test_vec)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Evaluating model...
Accuracy: 0.63

Classification Report:
               precision    recall  f1-score   support

       Action       0.53      0.94      0.68      4898
    Adventure       0.87      0.25      0.39       836
Miscellaneous       0.66      0.49      0.56      2719
       Racing       0.94      0.49      0.64       794
 Role-Playing       0.83      0.59      0.69      1892
   Simulation       0.91      0.09      0.17       515
       Sports       0.92      0.56      0.70      1031
     Strategy       0.83      0.42      0.55      1484

     accuracy                           0.63     14169
    macro avg       0.81      0.48      0.55     14169
 weighted avg       0.71      0.63      0.61     14169



In [156]:
print("\nSaving model...")
joblib.dump(model, 'genre_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("Model saved successfully!")


Saving model...
Model saved successfully!


In [157]:
model = joblib.load('genre_classifier.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [158]:
test_review = "It’s a story worthy of a place in the more accepted subculture of dark fantasy ruled across media by Game of Thrones. First-timers will easily love this facet but may also be surprised to learn that this series, and the books it’s based upon, have been the at the fore of adult and mature storytelling for a long time. Wild Hunt is both at times brutal and sexy, with a juxtaposition of hard-edged steel (or silver), blood and death being met with soft, naked skin; passion, lust and even love."
print("\nTesting top 3 predictions:")
top_preds = get_top_predictions(model, vectorizer, test_review, top_n=3)


genres_only = [pred[0] for pred in top_preds]
print(f"Top 3 genres: {genres_only}")
print("With probabilities:")
for genre, prob in top_preds:
    print(f"{genre}: {prob:.4f}")


print("\nEvaluating top-3 accuracy on test set...")
top3_correct = 0


X_test_vec = vectorizer.transform(df_test['clean_review'])
probabilities = model.predict_proba(X_test_vec)

for i in range(len(df_test)):
    actual_genre = y_test.iloc[i]
    probs = probabilities[i]
    sorted_indices = np.argsort(probs)[::-1]
    top_genres = [model.classes_[idx] for idx in sorted_indices[:3]]

    if actual_genre in top_genres:
        top3_correct += 1

print(f"Top-3 Accuracy: {top3_correct/len(df_test):.2f}")


Testing top 3 predictions:
Top 3 genres: [np.str_('Role-Playing'), np.str_('Action'), np.str_('Miscellaneous')]
With probabilities:
Role-Playing: 0.5744
Action: 0.1543
Miscellaneous: 0.1192

Evaluating top-3 accuracy on test set...
Top-3 Accuracy: 0.89


In [159]:
def predict_top3_genres(text):
    """Predict top 3 genres for new text, sorted by probability (highest first)"""

    clean_text = normalize_text(text)
    if not clean_text:
        return ["Unknown", "Unknown", "Unknown"]


    vec = vectorizer.transform([clean_text])

    probabilities = model.predict_proba(vec)[0]


    sorted_indices = np.argsort(probabilities)[::-1]

    return [model.classes_[idx] for idx in sorted_indices[:3]]

In [160]:
print("\nFinal prediction function example:")
test_text = "Forza Horizon 5 is a visual spectacle and a masterpiece, providing plenty to do and tons of fun to be had around every corner."
top3 = predict_top3_genres(test_text)
print(f"Input: '{test_text[:30]}...'")
print(f"Top 3 genres: {top3}")


Final prediction function example:
Input: 'Forza Horizon 5 is a visual sp...'
Top 3 genres: [np.str_('Racing'), np.str_('Miscellaneous'), np.str_('Action')]


In [161]:
test_reviews = [
    "Epic fantasy adventure with swords and magic battles",
    "Fast-paced shooting action in futuristic cityscape",
    "Solve intricate puzzles in mysterious ancient ruins",
    "Racing through neon-lit streets at 200mph",
    "Building civilizations from scratch through the ages",
    "Rematch has good bones. It’s full of energy, raw and unbridled. But it’s a rough diamond, and rougher than most. The game has serious issues that border on rendering the game completely unfun. But if players stick around for long enough for developer Sloclap to polish those defects, to improve the servers and make goalkeeping feel responsive, then we might just have something here. At the moment, it’s a bit of a mess. But it’s a mess that keeps you coming back, that forces you to play ‘one more match’ in case it’s the one where you finally net a hat trick of bicycle kicks. And it’s a game I’d prefer to play over EA FC any day.",
    "REMATCH is the ultimate arcade soccer game out there when it works. With network issues and game-breaking bugs, Sloclap has to resolve these issues before they can score the golden goal.",
    "Rematch stands as a rare and bold attempt to faithfully translate real-world soccer (or football) into the online competitive gaming space—and it pulls it off really well. Though it can feel rather punishing to learn, this competitive game is there for anyone looking to experience the thrill of a real-world sport in an online space.",
    "It’s easy to dismiss EA Sports FC 25 as a glorified reskin of EA Sports FC 24 – and in many respects, it is – but the customarily small tweaks made to graphics and gameplay add up to a near-perfect sports game experience. This is total football."
]

In [162]:
print("\nTest predictions:")
for review in test_reviews:
    clean_review = normalize_text(review)
    if clean_review:
        vectorized = vectorizer.transform([clean_review])
        prediction = model.predict(vectorized)[0]
        print(f"'{review[:50]}...' → {prediction}")
    else:
        print(f"'{review[:50]}...' → [EMPTY AFTER PROCESSING]")


Test predictions:
'Epic fantasy adventure with swords and magic battl...' → Role-Playing
'Fast-paced shooting action in futuristic cityscape...' → Action
'Solve intricate puzzles in mysterious ancient ruin...' → Adventure
'Racing through neon-lit streets at 200mph...' → Racing
'Building civilizations from scratch through the ag...' → Strategy
'Rematch has good bones. It’s full of energy, raw a...' → Action
'REMATCH is the ultimate arcade soccer game out the...' → Sports
'Rematch stands as a rare and bold attempt to faith...' → Action
'It’s easy to dismiss EA Sports FC 25 as a glorifie...' → Sports


In [163]:
def test_manual_review():
    """Test the model with manually entered reviews"""
    while True:
        print("\n" + "="*50)
        review = input("Enter a game review (or type 'exit' to quit): ")

        if review.lower() == 'exit':
            print("Exiting manual testing.")
            break

        top_preds = get_top_predictions(model, vectorizer, review, top_n=3)

        print("\nTop 3 predicted genres:")
        genres_only = [pred[0] for pred in top_preds]
        print(f"1. {genres_only[0]} (Most likely)")
        print(f"2. {genres_only[1]}")
        print(f"3. {genres_only[2]} (Least likely of top 3)")

        print("\nWith probabilities:")
        for genre, prob in top_preds:
            print(f"{genre}: {prob:.3f}")

        print("\n" + "-"*50)

print("results: ")
test_manual_review()

results: 

Enter a game review (or type 'exit' to quit): exit
Exiting manual testing.
