In [123]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [124]:
labeled_data = pd.read_csv('Data_csvs/Validation_Set.csv')
unlabeled_data = pd.read_excel('Data_csvs/Test_Set.xlsx')

vectorizer = CountVectorizer()
classifier = MultinomialNB()

In [125]:
unlabeled_data['Review'] = unlabeled_data['Review'].str.replace(r'[^\w\s]', '', regex=True)
unlabeled_data['Review'] = unlabeled_data['Review'].fillna('')

In [126]:
unlabeled_data

Unnamed: 0,Rec_id,GameId,Positive_Review,Up_Votes,Weighted_Vote_Score,Playtime_At_Review,Total_Playtime,Review
0,187631040,2215430,True,0,0.500000,4803,4803,Gameplay 55\nGraphics 55\nAudio Sound effect...
1,187615563,2215430,True,0,0.500000,2239,2291,A beautiful game with great cinematic elements...
2,187599172,2215430,True,0,0.500000,1051,1051,Fantastic game with a great PVE experience wit...
3,187595198,2215430,False,6,0.468472,1875,1885,There are some major technical issues Ive expe...
4,187595036,2215430,True,0,0.500000,4916,5115,A great game with some glaring flaws In short ...
...,...,...,...,...,...,...,...,...
331,185883768,1142710,True,0,0.500000,9295,14150,Great strategy game plenty of factions with di...
332,185878959,1142710,True,0,0.500000,296,441,Great game just an FYI for future players you ...
333,185874497,1142710,True,0,0.500000,17383,25009,I like the Warhammer fiction novels and love t...
334,185866287,1142710,True,0,0.500000,2246,4713,I recommend this game but also caution that is...


In [127]:
X_text = vectorizer.fit_transform(labeled_data['Review'])

In [128]:

# Combine text features with other features
X_other = labeled_data[['Positive_Review', 'Up_Votes', 'Weighted_Vote_Score',
                        'Playtime_At_Review', 'Total_Playtime']]
X = pd.concat([pd.DataFrame(X_text.toarray()), X_other], axis=1)

# Convert feature names to strings
X.columns = X.columns.astype(str)

# Get the target variable
y = labeled_data['Gameplay_Relevant']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [129]:
# Create and train the classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

In [130]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")


Accuracy: 0.61
Precision: 0.53
Recall: 0.77
F1-score: 0.63


In [131]:
X_unlabeled_text = vectorizer.transform(unlabeled_data['Review'])

X_unlabeled_other = unlabeled_data[['Positive_Review', 'Up_Votes', 'Weighted_Vote_Score',
                                    'Playtime_At_Review', 'Total_Playtime']]
X_unlabeled = pd.concat([pd.DataFrame(X_unlabeled_text.toarray()), X_unlabeled_other], axis=1)

# Convert feature names to strings for the unlabeled data
X_unlabeled.columns = X_unlabeled.columns.astype(str)

# Make predictions on the unlabeled data
y_unlabeled_pred = classifier.predict(X_unlabeled)

# Add the predicted labels to the unlabeled data
unlabeled_data['Predicted_Gameplay_Relevant'] = y_unlabeled_pred

# Save the updated unlabeled data to a CSV file
unlabeled_data.to_csv('Updated_Test_Set.csv', index=False)