In [1]:
import pandas as pd
import numpy as np

app_id = 1151640
data_path = 'data/'


def _read_app_csv(name_template):
    """Load one CSV file belonging to ``app_id`` from ``data_path``.

    ``name_template`` is a ``%``-style file name whose ``%d`` placeholder is
    filled with ``app_id`` before the file is read with ``pd.read_csv``.
    """
    return pd.read_csv(data_path + name_template % app_id)


# Reviews stored in separate "positive" and "negative" files
positive_reviews = _read_app_csv('review_%d_positive.csv')
negative_reviews = _read_app_csv('review_%d_negative.csv')

# Unlabeled reviews which we will evaluate - a sample extracted from the most recent reviews
evaluation_data = _read_app_csv('review_%d_sample.csv')

summary = _read_app_csv('summary_%d.csv')

In [2]:
from enum import Enum
class Sentiment(Enum):
    """Sentiment label stored in the ``'label'`` column of a review frame."""
    # POSITIVE and NEGATIVE are assigned to the training frames below.
    # NEUTRAL is declared but not assigned anywhere in the cells shown here.
    POSITIVE = 'positive'
    NEGATIVE = 'negative'
    NEUTRAL = 'neutral'

# Columns that make up the feature vector: get_neighbours selects exactly these
# columns from each row before computing the distance between two reviews.
# NOTE(review): the values are compared on their raw scales (no normalisation is
# visible in this notebook), so large-valued columns presumably dominate the
# distance - confirm whether scaling is wanted.
numeric_features = [
    'votes_up',
    'votes_funny',
    'weighted_vote_score',
    'author_num_games_owned',
    'author_playtime_at_review',
]

# Label our datasets (adds a 'label' column to both loaded frames in place)
positive_reviews['label'] = Sentiment.POSITIVE
negative_reviews['label'] = Sentiment.NEGATIVE

# Create the batch of training data.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent. Like append, it keeps the index labels
# of both inputs, so labels may repeat across the two halves.
training_data = pd.concat([positive_reviews, negative_reviews])

# No column selection is applied to evaluation_data here: get_neighbours picks
# the numeric_features columns from each row when it computes distances.

In [3]:
def euclidean_distance(sx, sy):
    """Return the Euclidean distance between two sequences of numbers.

    The sequences are paired element by element with ``zip``, so if their
    lengths differ the extra items of the longer one are ignored.
    """
    squared_deltas = []
    for left, right in zip(sx, sy):
        delta = left - right
        squared_deltas.append(delta * delta)
    return np.sqrt(np.sum(squared_deltas))

In [4]:
def get_neighbours(training_set, evaluated_row, num_neighbours):
    """Return the ``num_neighbours`` training rows closest to ``evaluated_row``.

    Distance is ``euclidean_distance`` over the ``numeric_features`` columns.
    The result is a list of rows ordered nearest first; rows at equal distance
    keep their order from ``training_set`` because the sort is stable. Fewer
    rows are returned when ``training_set`` is shorter than ``num_neighbours``.
    """
    # Pair every training row with its distance to the evaluated row
    scored_rows = [
        (
            candidate,
            euclidean_distance(
                candidate[numeric_features],
                evaluated_row[numeric_features],
            ),
        )
        for _, candidate in training_set.iterrows()
    ]

    # Nearest first
    ranked = sorted(scored_rows, key=lambda pair: pair[1])

    return [candidate for candidate, _ in ranked[:num_neighbours]]

In [5]:
def predict(training_set, evaluated_row, num_neighbours) -> Sentiment:
    """Classify ``evaluated_row`` by majority vote among its nearest neighbours.

    Parameters:
        training_set: labelled rows to search; each row must have a ``'label'`` entry.
        evaluated_row: the row to classify.
        num_neighbours: how many nearest neighbours take part in the vote.

    Returns:
        The label that occurs most often among the neighbours. When several
        labels are tied, the one whose first occurrence is nearest wins
        (``get_neighbours`` returns its rows sorted by distance).

    Raises:
        ValueError: if there are no neighbours to vote.
    """
    neighbours = get_neighbours(training_set, evaluated_row, num_neighbours)
    labels = [n['label'] for n in neighbours]
    if not labels:
        raise ValueError('cannot predict a label without any neighbours')

    # Iterating a set of labels made tie-breaking depend on hash order, which
    # for Enum members follows string hashing and can change between
    # interpreter runs. dict.fromkeys keeps first-seen (nearest-first) order and
    # max() returns the first maximal item, so ties resolve deterministically.
    candidates = dict.fromkeys(labels)
    prediction = max(candidates, key=labels.count)

    return prediction

# Quick check: classify the evaluation row labelled 0 using its 4 nearest neighbours
predict(training_data, evaluation_data.loc[0], 4)

<Sentiment.POSITIVE: 'positive'>

In [6]:
def compute_knn(training_set, evaluation_set, k):
    """Predict a label for every row of ``evaluation_set``.

    Parameters:
        training_set: labelled rows passed on to ``predict``.
        evaluation_set: DataFrame of rows to classify; it is not modified.
        k: number of nearest neighbours used for each prediction.

    Returns:
        A new DataFrame with the columns of ``evaluation_set`` plus a
        ``'label'`` column holding the predictions (an existing ``'label'``
        column is overwritten). The result has a fresh 0..n-1 index.
    """
    predicted_labels = [
        predict(training_set, row, k)
        for _, row in evaluation_set.iterrows()
    ]

    # Attach the labels as a column instead of editing the row copies yielded
    # by iterrows and rebuilding the frame from them: this keeps the original
    # column dtypes and still returns the full set of columns for empty input.
    return evaluation_set.reset_index(drop=True).assign(label=predicted_labels)

# Label every row of the evaluation sample with k=4; the returned frame is this cell's output
compute_knn(training_data, evaluation_data, 4)

Unnamed: 0.1,Unnamed: 0,recommendationid,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,...,received_for_free,written_during_early_access,author_steamid,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,label
0,0,84442464,english,you get to kill robot dinosaurs what more can...,1610299947,1610299947,True,1,0,0.000000,...,False,False,76561198256498872,39,3,2134,86,2134,1610216290,Sentiment.POSITIVE
1,1,84440902,english,OMG such a great game and some cool graphics!G...,1610298355,1610298355,True,1,0,0.000000,...,False,False,76561198350121370,97,3,550,550,550,1610259585,Sentiment.NEGATIVE
2,2,84440338,english,one of the best and most beautiful games,1610297779,1610297779,True,1,0,0.000000,...,False,False,76561198004451827,201,4,3452,3076,3452,1610291282,Sentiment.POSITIVE
3,3,84439822,english,Just get this game. Enough said,1610297212,1610297212,True,1,0,0.000000,...,False,False,76561198047590636,152,4,1287,1287,1263,1610298727,Sentiment.POSITIVE
4,4,84436981,english,Very good game,1610294234,1610294234,True,1,0,0.000000,...,False,False,76561199093974239,6,3,395,290,395,1610247937,Sentiment.NEGATIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,84318586,english,wtf such a great game i can't even comprehend,1610143872,1610143872,True,1,0,0.000000,...,False,False,76561198077915340,128,18,55,55,19,1610227424,Sentiment.NEGATIVE
96,96,84313874,english,Rough port on launch but developers worked rea...,1610138646,1610138646,True,1,0,0.000000,...,False,False,76561198161665803,63,4,1223,1148,1100,1610171689,Sentiment.NEGATIVE
97,97,84310654,english,Game is really good really fun really addict...,1610135220,1610135220,True,1,0,0.530201,...,False,False,76561198067815217,364,42,2070,2070,1297,1610303417,Sentiment.NEGATIVE
98,98,84309074,english,nice game but how to finish optimiziing shader??,1610133640,1610133640,True,1,0,0.523810,...,False,False,76561198811527795,8,1,987,987,674,1610303583,Sentiment.POSITIVE
