# Install

```bash
pip install pandas tqdm watermark
```

# Prepare dataset for scoring

In [1]:
import pandas as pd
from watermark import watermark


games = pd.read_csv("C:/Users/amalj/OneDrive/Desktop/Machine learning subjects/Machine learning DM2/H3/games5k.csv")
review = pd.read_csv("C:/Users/amalj/OneDrive/Desktop/Machine learning subjects/Machine learning DM2/H3/review5k.csv")
games.shape, review.shape

((4975, 2), (246946, 2))

In [2]:
# PLEASE DO NOT CHANGE THE CODE IN THIS CELL
review_with_desc = review.merge(games, on=['game'], how='left')
# keep comments with know game description
review_with_desc = review_with_desc[~review_with_desc['desc'].isnull()]
# set new order for column
review_with_desc = review_with_desc[['game', 'comment', 'desc']]
review_with_desc.shape

(121264, 3)

In [3]:
review_with_desc.head()

Unnamed: 0,game,comment,desc
2,Strat-O-Matic College Football,You wan na coach a <UNK> team ? This is your g...,There were two versions of this game - the fir...
3,Quartex,"It 's a fine abstract game , but the tiles cou...","In <UNK> , players take turns placing a tile n..."
4,In Tenebris Lux,I backed this on Kickstarter based solely on t...,ENTER A WORLD OF VICTORIAN HORROR ! In 19th-ce...
6,Give It to the King!,{ O : Robert } Traded Away for Ark of the Cove...,<UNK> It to the <UNK> is a game for 2-4 player...
7,Cthulhu: The Horror in Dunwich,Giving this a 1 as a public service to any pot...,"<UNK> The <UNK> In <UNK> , a standalone expans..."


In [4]:
review_with_desc.sample(10)

Unnamed: 0,game,comment,desc
138175,Neon Gods,<UNK> is a prime example of how obsessive stre...,<UNK> is a story of street gangs set in a kale...
69805,Pro Foto-Football,I played it .5 times ( only half a game ) . It...,According to Sid Sacksons A Gamut of Games : O...
78645,Ducks in Tow,This is a charming game with a really unique t...,Welcome to the <UNK> Sanctuary ! Come enjoy th...
227447,Sorry! Spin,"Not as impressed with this one , the gears are...",The twists and turns never end in this fun gam...
182533,Secrets of the Sea,"After one play with two kids , it seems too lo...",The players are diving for sunken treasure ! E...
193521,Paul Koenig's D-Day: The Canadian Beach,See my review for details <UNK> http <UNK> //w...,Juno <UNK> The <UNK> is the first game in the ...
216776,Ada Lovelace: Consulting Mathematician,PnP,It is the summer of 1842 and the finest <UNK> ...
226715,Wings of Glory: WW2 Rules and Accessories Pack,Own have a couple of planes to play with - get...,<UNK> of <UNK> is a game system that allows pl...
148815,Ace of Aces: Jet Eagles,I recommend the rotary series over the <UNK> s...,"From the makers of <UNK> of <UNK> , the WWI do..."
224565,Kleine Magier,Eva Memory with some twists and really nice co...,Description from the box back : Turmoil in Won...


In [5]:
review_with_desc

Unnamed: 0,game,comment,desc
2,Strat-O-Matic College Football,You wan na coach a <UNK> team ? This is your g...,There were two versions of this game - the fir...
3,Quartex,"It 's a fine abstract game , but the tiles cou...","In <UNK> , players take turns placing a tile n..."
4,In Tenebris Lux,I backed this on Kickstarter based solely on t...,ENTER A WORLD OF VICTORIAN HORROR ! In 19th-ce...
6,Give It to the King!,{ O : Robert } Traded Away for Ark of the Cove...,<UNK> It to the <UNK> is a game for 2-4 player...
7,Cthulhu: The Horror in Dunwich,Giving this a 1 as a public service to any pot...,"<UNK> The <UNK> In <UNK> , a standalone expans..."
...,...,...,...
246934,Bakerspeed,Essen auction list @,Investigate quickly ! LeStrade is already on h...
246935,ARTBOX,"This game seems to be really simple , but in t...",<UNK> is a game in which each player becomes a...
246938,About Time,Relatively basic,Teams compete to guess closest to the year of ...
246939,Taxi!,I simple `` family '' game that will not reall...,Players represent drivers plying for hire on t...


### Text preprocessing

In [6]:
import pandas as pd
import re
import unicodedata
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import tqdm
from langdetect import detect

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if not isinstance(text, str):
        return "" 
    
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r"<UNK>", "", text)
    text = re.sub(r"&\w+;", "", text)
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\d+", "", text) 

    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return " ".join(lemmatized_tokens)

def detect_and_translate(text):
    try:
        lang = detect(text)
        if lang != 'en':
        
            return "Non-English Text Detected"
        return text
    except:
        return text

def preprocess_pipeline(row):
    row['comment'] = preprocess_text(detect_and_translate(row['comment']))
    row['desc'] = preprocess_text(detect_and_translate(row['desc']))
    return row

tqdm.pandas()
review_with_desc['comment'] = review_with_desc['comment'].progress_apply(lambda x: preprocess_text(detect_and_translate(x)))
review_with_desc['desc'] = review_with_desc['desc'].progress_apply(lambda x: preprocess_text(detect_and_translate(x)))


[nltk_data] Downloading package punkt to C:\Users\amalj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\amalj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amalj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/121264 [00:00<?, ?it/s]

  0%|          | 0/121264 [00:00<?, ?it/s]

In [7]:
import pandas as pd

def count_non_text_blank_rows(df, column_name):
    non_text_blank_rows = df[df[column_name].apply(lambda x: not isinstance(x, str) or x.strip() == "")]
    return len(non_text_blank_rows)

# Example usage
non_text_blank_count = count_non_text_blank_rows(review_with_desc, 'comment')
print(f"Number of non-text blank rows in 'comment' column: {non_text_blank_count}")


Number of non-text blank rows in 'comment' column: 1278


In [8]:
review_with_desc.shape

(121264, 3)

In [24]:
review_with_desc['comment'] = review_with_desc['comment'].fillna("There is no comment for this game.")
review_with_desc['comment'] = review_with_desc['comment'].replace("", "There is no comment for this game.")

In [None]:
import pandas as pd
def count_non_text_blank_rows(df, column_name):
    non_text_blank_rows = df[df[column_name].apply(lambda x: not isinstance(x, str) or x.strip() == "")]
    return len(non_text_blank_rows)

non_text_blank_count = count_non_text_blank_rows(review_with_desc, 'comment')
print(f"Number of non-text blank rows in 'comment' column: {non_text_blank_count}")


Number of non-text blank rows in 'comment' column: 0


# Using Transformers

### Using BERT (bert-base-uncased)

In [17]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

tqdm.pandas(desc="Processing BERT Embeddings")

review_with_desc['desc_embedding'] = review_with_desc['desc'].progress_apply(lambda x: get_bert_embeddings(x) if isinstance(x, str) else np.zeros(768))
review_with_desc['comment_embedding'] = review_with_desc['comment'].progress_apply(lambda x: get_bert_embeddings(x) if isinstance(x, str) else np.zeros(768))

Processing BERT Embeddings:   0%|          | 0/121264 [00:00<?, ?it/s]

Processing BERT Embeddings:   0%|          | 0/121264 [00:00<?, ?it/s]

# Your Algorithm for calculate distance

### FInding the Distances 

#### BERT (bert-base-uncased)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import numpy as np

def calculate_distances(df, embedding_column1, embedding_column2):
    distances = []

    for emb1, emb2 in tqdm(df[[embedding_column1, embedding_column2]].values, total=len(df), desc="Calculating distances"):
        cosine_sim = cosine_similarity([emb1], [emb2])[0][0]
        
        distance = 1 - cosine_sim
        distance = min(1, max(0, distance))
        
        distances.append(distance)

    return distances
review_with_desc['distance'] = calculate_distances(review_with_desc, 'comment_embedding', 'desc_embedding')

Calculating distances:   0%|          | 0/121264 [00:00<?, ?it/s]

Calculating distances: 100%|██████████| 121264/121264 [00:31<00:00, 3910.98it/s]


# Submission

In [23]:
from watermark import watermark

submission = review_with_desc[['distance']]

if submission.shape != (121264, 1):
    raise ValueError(f"Submission shape must be (121264, 1). {submission.shape} are given")
    
if submission['distance'].min() < 0. or submission['distance'].max() > 1.0:
    raise ValueError(f"distance must be from 0. to 1.")
    
print(watermark())
# IMPORTANT
# index must be set TRUE
submission.to_csv("C:/Users/amalj/OneDrive/Desktop/Machine learning subjects/Machine learning DM2/H3/3-Nirmal_Joy.csv", index=True)

Last updated: 2024-12-12T23:51:58.038665+03:00

Python implementation: CPython
Python version       : 3.12.7
IPython version      : 8.27.0

Compiler    : MSC v.1929 64 bit (AMD64)
OS          : Windows
Release     : 11
Machine     : AMD64
Processor   : Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
CPU cores   : 12
Architecture: 64bit



### Thanks 
#### Nirmal Joy