In [8]:
import sys
import random
import matplotlib.pyplot as plt
import numpy as np
import csv

%matplotlib inline

In [9]:
# Given a path to file, load it
# returns data as an array of dictionaries, [{column_name: cell_value}, {...}]

def load_csv(filepath):
    data = []
    with open(filepath) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=",")
        for row in reader:
            data.append(dict(row))
    
    return data

In [10]:
users = load_csv("data/users.csv")
ratings = load_csv("data/ratings_train.csv")

In [11]:
# Helper function to visualize points in 2d space as a scatter plot
#
# x_vals: List<float>
# y_vals: List<float>
# colors: optional array of colors, must be the same length as x_vals and y_vals.
# more info on matplotlib colors: https://matplotlib.org/2.0.2/api/colors_api.html
def scatter_plot(x_vals, y_vals, colors=None):
    fig = plt.figure(figsize=(20,8))
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x_vals, y_vals, c=colors)
    ax.grid(True)
    return fig

In [22]:
ratings[0]

{'uid': '1147', 'bid': '288', 'rating': '2'}

In [23]:
# Helper class for working with a dataset. It contains helper methods that format and filter the data
class Data:
    # data: [{column_name: cell_value}, ...]
    def __init__(self, data):
        self.data = data
    
    
    # Converts data values to data_types specified in column_defs
    #
    # data: [{column_name: cell_value}, ...]
    # column_defs: {column_name: data_type}
    # returns [{column_name: converted_cell_value}]
    def convert_data_types(self, data, column_defs):
        result = []
        for row in data:
            new_values = {}
            for name, value in row.items():
                if name in column_defs and column_defs[name] == "float":
                    new_values[name] = float(value)
                elif name in column_defs and column_defs[name] == "integer":
                    new_values[name] = int(value)
                else:
                    new_values[name] = value

            result.append(new_values)

        return result

    # Returns only those columns that are specified in column_defs
    #
    # data: [{column_name: cell_value}, ...]
    # column_defs: {column_name: data_type}
    # returns [{selected_column_name: cell_value}]
    def select_columns(self, data, column_defs):
        result = []
        for row in data:
            filtered_values = {}
            for name, value in row.items():
                if name in column_defs:
                    filtered_values[name] = value

            result.append(filtered_values)

        return result
    
    
    # returns location_x/location_y, which we can then use for KMeans
    def kmeans_data(self):
        column_defs = {"location_x": "float", "location_y": "float"}
        selected_data = self.select_columns(self.data, column_defs)
        return self.convert_data_types(selected_data, column_defs)
    
    
    # returns location_x/location_y data in the format suitable for our visualization function
    #
    # returns (List<float>, List<float>)
    def kmeans_data_for_viz(self):
        data = self.kmeans_data()
        x_vals = [r["location_x"] for r in data]
        y_vals = [r["location_y"] for r in data]
        
        return x_vals, y_vals
    
    def lab2_user_ratings_count_data(self):
        column_defs = {"uid": "integer", "num_reviews": "integer", "r1_cnt": "integer", "r2_cnt": "integer", "r3_cnt": "integer", "r4_cnt": "integer", "r5_cnt": "integer"}
        
        result = []
        for row in data:
            filtered_values = {}
            for name, value in row.items():
                if name in column_defs:
                    filtered_values[name] = value

            result.append(filtered_values)

        return result
    
    def lab2_ratings_data(self):
        column_defs = {"uid": "integer", "rating": "integer"}
        
        result = []
        for row in data:
            filtered_values = {}
            for name, value in row.items():
                if name in column_defs:
                    filtered_values[name] = float(value)

            result.append(filtered_values)

        return result

In [24]:
users_data = Data(users)
user_features = users_data.lab2_user_ratings_count_data()
user_features_dict = {u["uid"]: u for u in user_features}
print(user_features[0])

ratings_data =  Data(ratings)
rating_features = ratings_data.lab2_ratings_data()
print(rating_features[0])

{'uid': '0', 'num_reviews': '295', 'r1_cnt': '29', 'r2_cnt': '74', 'r3_cnt': '119', 'r4_cnt': '54', 'r5_cnt': '19'}
{'uid': 0.0}


In [None]:
def evaluation_hist_bin(num):
    if num >= 0 and num < 1:
        bin_enum = "[0-1]"
    elif num >= 1 and num < 2:
        bin_enum = "[1-2]"
    elif num >= 2 and num < 3:
        bin_enum = "[2-3]"
    elif num >= 3 and num < 4:
        bin_enum = "[3-4]"
    else:
        bin_enum = "[4+]"
        
    return bin_enum


def evaluate_predictions(model_predictions, actual_predictions):
    errors = []
    histogram = {}
    
    for key in expected_predictions.keys():
        diff = abs(model_predictions[key] - actual_predictions[key])
        errors.append(diff**2)
        bin_id = evaluation_hist_bin(diff)
        
        if bin_id in histogram:
            histogram[bin_id] += 1
        else:
            histogram[bin_id] = 1
            
    mse = sum(errors) / len(errors)
    rsme = mse**0.5
    
    return rsme, histogram

In [None]:
x_vals = []
y_vals = []

for u in user_features:
    a = u["num_reviews"]
    counts = [u["r1_cnt"]/n, u["r2_cnt"]/n, u["r3_cnt"]/n, u["r4_cnt"]/n, u["r5_cnt"]/n]
    
    #f1
    tail_sum = counts[0] + counts[4]
    mid_sum = counts[1] + counts[2] + counts[3]
    feature1 = mid_sum - tail_sum
    
    #f2
    mean = (u["r1_cnt"] * 1 + u["r2_cnt"] * 2 + u["r3_cnt"] * 3 + u["r4_cnt"] * 4 + u["r5_cnt"] * 5)
    max_count, mode = max([(val, idx+1) for idx, val in enumerate(counts)])
    
    feature2 = mode = mean
    
    x_vals.append(feature1)
    y_vals.append(feature2)

In [None]:
fig = scatter_plot(x_vals, y_vals)
plt.show()