In [None]:
import pandas as pd
import numpy as np

In [None]:
import re
from collections import defaultdict, Counter
import math
import pandas as pd
import numpy as np

# To read the csv files
df_train = pd.read_csv('/content/games_train_sheets - games-train.csv')
df_test = pd.read_csv('/content/games_test_sheets - games-test.csv')

# To clean the NAN values
cleaned_train_df = df_train.dropna()
cleaned_test_df = df_test.dropna()

# Another logic to clean text$, handle NaN and remove non-alphabetic characters
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zäöüß ]", "", text)
    return text

# Preprocess train data
cleaned_train_df["Review text"] = cleaned_train_df["Review text"].apply(preprocess_text)
cleaned_train_df["Class of review (good or bad)"] = cleaned_train_df["Class of review (good or bad)"].map({"gut": 1, "bad": 0})

# Preprocess test data
cleaned_test_df["Review text"] = cleaned_test_df["Review text"].apply(preprocess_text)
cleaned_test_df["Class of review (good or bad)"] = cleaned_test_df["Class of review (good or bad)"].map({"gut": 1, "bad": 0})

# Running the code in a subset of actual data (Because it was very slow when tried on the whole dataset)
train_data = cleaned_train_df.head(160)
test_data = cleaned_test_df.head(40)

# Naive Bayes classifier logic from scratch
class NaiveBayesClassifier:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_counts = defaultdict(int)
        self.term_counts = defaultdict(lambda: defaultdict(int))
        self.vocab = set()
        self.total_docs = 0

    def train(self, data, text_col, label_col):
        for _, row in data.iterrows():
            label = row[label_col]
            text = row[text_col]
            self.class_counts[label] += 1
            self.total_docs += 1
            words = text.split()
            for word in words:
                self.vocab.add(word)
                self.term_counts[label][word] += 1

    def predict(self, text):
        words = text.split()
        scores = {}
        for c in self.class_counts:
            log_prior = math.log(self.class_counts[c] / self.total_docs)
            log_likelihood = 0
            for word in words:
                word_count = self.term_counts[c][word]
                total_count = sum(self.term_counts[c].values())
                likelihood = (word_count + self.alpha) / (total_count + self.alpha * len(self.vocab))
                log_likelihood += math.log(likelihood)
            scores[c] = log_prior + log_likelihood
        return max(scores, key=scores.get)

    def top_terms(self, top_n=100):
        class_top_terms = {}
        for c in self.class_counts:
            terms = self.term_counts[c]
            total_count = sum(terms.values())
            term_probs = {term: (count + self.alpha) / (total_count + self.alpha * len(self.vocab))
                          for term, count in terms.items()}
            sorted_terms = sorted(term_probs.items(), key=lambda x: x[1], reverse=True)[:top_n]
            class_top_terms[c] = sorted_terms
        return class_top_terms

# To instantiate and train classifier
nb = NaiveBayesClassifier(alpha=1.0)
nb.train(train_data, "Review text", "Class of review (good or bad)")

# Predict on test data
test_data["Predicted"] = test_data["Review text"].apply(nb.predict)

# Output predictions
print("Predictions on Test Data:")
print(test_data)

# To print top terms for each class
print("\nTop Terms for Each Class:")
top_terms = nb.top_terms(top_n=100)
for c, terms in top_terms.items():
    print(f"Class {c}:")
    for term, prob in terms:
        print(f"{term}: {prob:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_train_df["Review text"] = cleaned_train_df["Review text"].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_train_df["Class of review (good or bad)"] = cleaned_train_df["Class of review (good or bad)"].map({"gut": 1, "bad": 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

Predictions on Test Data:
          Title of game  Class of review (good or bad)  \
1          Die Simpsons                            1.0   
2          Die Simpsons                            1.0   
3          Die Simpsons                            1.0   
4        Subway Surfers                            1.0   
6          Die Simpsons                            1.0   
7               Hay Day                            NaN   
8          Die Simpsons                            1.0   
9           Angry Birds                            1.0   
10              Hay Day                            1.0   
11         Die Simpsons                            1.0   
12         Die Simpsons                            1.0   
14          Angry Birds                            1.0   
15       Bike Race Free                            1.0   
16              Hay Day                            1.0   
17       Subway Surfers                            1.0   
18       Subway Surfers                       

In [None]:
test_df = test_data = cleaned_test_df.head(10)



test_df["Class of review"] = test_df["Class of review (good or bad)"].map({"gut": 1, "bad": 0})

# To evaluate predictions
def evaluate_model(model, test_df):
    predictions = []
    for _, row in test_df.iterrows():
        predicted = model.predict(row["Review text"])
        predictions.append(predicted)

    test_df["Predicted"] = predictions

    # Initialize counters
    TP_good, FP_good, FN_good = 0, 0, 0
    TP_bad, FP_bad, FN_bad = 0, 0, 0

    for _, row in test_df.iterrows():
        true_label = row["Class of review (good or bad)"]
        predicted_label = row["Predicted"]

        if true_label == 1:  # True class: 'gut'
            if predicted_label == 1:
                TP_good += 1  # True Positive for 'gut'
            else:
                FN_good += 1  # False Negative for 'gut'
                FP_bad += 1  # False Positive for 'bad'
        else:  # True class: 'bad'
            if predicted_label == 0:
                TP_bad += 1  # True Positive for 'bad'
            else:
                FN_bad += 1  # False Negative for 'bad'
                FP_good += 1  # False Positive for 'gut'

    # Precision, Recall, F1-Score for good or 'gut'
    precision_good = TP_good / (TP_good + FP_good) if TP_good + FP_good > 0 else 0
    recall_good = TP_good / (TP_good + FN_good) if TP_good + FN_good > 0 else 0
    f1_good = (
        2 * precision_good * recall_good / (precision_good + recall_good)
        if precision_good + recall_good > 0
        else 0
    )

    # Precision, Recall, F1-Score for 'bad'
    precision_bad = TP_bad / (TP_bad + FP_bad) if TP_bad + FP_bad > 0 else 0
    recall_bad = TP_bad / (TP_bad + FN_bad) if TP_bad + FN_bad > 0 else 0
    f1_bad = (
        2 * precision_bad * recall_bad / (precision_bad + recall_bad)
        if precision_bad + recall_bad > 0
        else 0
    )

    # To return evaluation metrics and counts
    return {
        "TP_good": TP_good,
        "FP_good": FP_good,
        "FN_good": FN_good,
        "Precision_good": precision_good,
        "Recall_good": recall_good,
        "F1_good": f1_good,
        "TP_bad": TP_bad,
        "FP_bad": FP_bad,
        "FN_bad": FN_bad,
        "Precision_bad": precision_bad,
        "Recall_bad": recall_bad,
        "F1_bad": f1_bad,
        "Wrongly_classified": test_df[test_df["Class of review (good or bad)"] != test_df["Predicted"]],
    }
# Runnig evaluation
results = evaluate_model(nb, test_df)

# Print metrics
print("Evaluation Results:")
print(f"Precision (good): {results['Precision_good']:.2f}")
print(f"Recall (good): {results['Recall_good']:.2f}")
print(f"F1-Score (good): {results['F1_good']:.2f}")
print(f"Precision (bad): {results['Precision_bad']:.2f}")
print(f"Recall (bad): {results['Recall_bad']:.2f}")
print(f"F1-Score (bad): {results['F1_bad']:.2f}")
print(f"TP (good): {results['TP_good']}, FP (good): {results['FP_good']}, FN (good): {results['FN_good']}")
print(f"TP (bad): {results['TP_bad']}, FP (bad): {results['FP_bad']}, FN (bad): {results['FN_bad']}")

# To print wrongly classified instances
print("\nWrongly Classified Instances:")
print(results["Wrongly_classified"])



Evaluation Results:
Precision (good): 0.89
Recall (good): 0.89
F1-Score (good): 0.89
Precision (bad): 0.00
Recall (bad): 0.00
F1-Score (bad): 0.00
TP (good): 8, FP (good): 1, FN (good): 1
TP (bad): 0, FP (bad): 1, FN (bad): 1

Wrongly Classified Instances:
    Title of game  Class of review (good or bad)           Title of review  \
4  Subway Surfers                            1.0                     Great   
7         Hay Day                            NaN  Ansich ganz nett aber...   

                                         Review text  Class of review  \
4  i like the game but near the last update it st...              NaN   
7  ansich ein nettes spiel jedoch warmen alles ha...              NaN   

   Predicted  
4        NaN  
7        1.0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["Class of review"] = test_df["Class of review (good or bad)"].map({"gut": 1, "bad": 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["Predicted"] = predictions
