# Project 2, Recommender System
## 1. Preparation
#### First we import everything we need.

In [1]:
%%time
# IMPORT STUFF
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import itertools
import json
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from typing import Callable
import os

CPU times: user 5.62 s, sys: 1.6 s, total: 7.22 s
Wall time: 13.6 s


#### Then we define the paths to the needed input- and output-files.

In [2]:
%%time
# DIRECTORIES (Kaggle layout: read-only inputs and a writable working directory)
MAIN_DIR = '/kaggle'
INPUT_DIR = f'{MAIN_DIR}/input'
WORKING_DIR = f'{MAIN_DIR}/working'
DATA_DIR = f'{INPUT_DIR}/dis-project-2-recommender-systems'

# DATA FILES (shipped with the competition data set)
LINKS = f'{DATA_DIR}/links.csv'
MOVIES = f'{DATA_DIR}/movies.csv'
TAGS = f'{DATA_DIR}/tags.csv'
TEST_DATA = f'{DATA_DIR}/test_set_no_ratings.csv'
TRAIN_DATA = f'{DATA_DIR}/train_ratings.csv'

# PREPROCESSED AND SCRAPED FILES (uploaded as a separate input data set)
PCC_INPUT = f'{INPUT_DIR}/e7cc12ec0019932c50bc4eab7c9c16463efed46a091e2dacb9/pcc.csv/pcc.csv'
TMDB_DATA = f'{INPUT_DIR}/e7cc12ec0019932c50bc4eab7c9c16463efed46a091e2dacb9/scraped.txt/scraped.txt'

# SAMPLE SOLUTION
SOLUTIONS = f'{INPUT_DIR}/e7cc12ec0019932c50bc4eab7c9c16463efed46a091e2dacb9/solutions.xls'

# OUTPUT FILES (written by this notebook)
PCC = f'{WORKING_DIR}/pcc.csv'
SUBMISSION = f'{WORKING_DIR}/submission.csv'

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 14.1 µs


#### In this section we load all the data from the input files into variables.

In [3]:
%%time
# READ DATA FROM INPUT FILES
link_data = pd.read_csv(LINKS)
movie_raw = pd.read_csv(MOVIES)
tag_data = pd.read_csv(TAGS)
test_data = pd.read_csv(TEST_DATA)
train_raw = pd.read_csv(TRAIN_DATA)
# The precomputed PCC file is optional: get_pccs() checks for its existence and
# recomputes the coefficients when it is missing. Only read it here when it is
# present, so that this cell does not fail before that check can run.
pcc_data = pd.read_csv(PCC_INPUT) if os.path.exists(PCC_INPUT) else None
# The scraped TMDB file holds one JSON document per line. Iterating the file and
# skipping blank lines keeps the last record even without a trailing newline
# and does not choke on empty lines.
with open(TMDB_DATA, 'r') as tmdb_file:
    tmdb_json = [json.loads(line) for line in tmdb_file if line.strip()]
# NOTE(review): SOLUTIONS carries an ".xls" extension but is parsed as CSV here — confirm the file format.
solution_raw = pd.read_csv(SOLUTIONS)

CPU times: user 605 ms, sys: 189 ms, total: 795 ms
Wall time: 979 ms


#### We then create different lists and dictionaries that will later be used for more efficient data-lookup.

In [4]:
%%time
# CREATE DIFFERENT WAYS TO LOOK-UP DATA
# train-data: one (userId, movieId, rating, timestamp) tuple per training rating.
# Columns are iterated directly instead of DataFrame.iterrows(), which builds a
# Series per row and is much slower; the resulting tuples are the same.
train_data = [
    (int(uid), int(mid), float(rating), int(timestamp))
    for uid, mid, rating, timestamp in zip(
        train_raw['userId'], train_raw['movieId'], train_raw['rating'], train_raw['timestamp']
    )
]

# solution-data: same tuple layout as train_data; the sample solution has no
# timestamp, so -1 is stored as a placeholder in the last position.
solution_data = [
    (int(uid), int(mid), float(rating), -1)
    for uid, mid, rating in zip(
        solution_raw['userId'], solution_raw['movieId'], solution_raw['rating']
    )
]
    
# movie-data (title, year and genres) for each movie-id
DEFAULT_YEAR = 1990  # used when a title has no trailing "(YYYY)" token


def _split_title_and_year(raw_title):
    """Split a raw title such as "Heat (1995)" into ("Heat", 1995).

    The year is only taken from the last whitespace-separated token, and only
    when that token is a number wrapped in parentheses. Otherwise the complete
    title is kept (no word is dropped) and DEFAULT_YEAR is returned.
    """
    words = raw_title.split()
    last = words[-1] if words else ''
    if len(last) > 2 and last[0] == '(' and last[-1] == ')' and last[1:-1].isdecimal():
        return ' '.join(words[:-1]), int(last[1:-1])
    # According to the original note this happens 13 times out of the 9742 movies.
    return ' '.join(words), DEFAULT_YEAR


movie_data = dict()
for mid, raw_title, raw_genres in zip(movie_raw['movieId'], movie_raw['title'], movie_raw['genres']):
    title, year = _split_title_and_year(raw_title)
    # genres are stored as a single '|'-separated string in the input file
    movie_data[int(mid)] = {'title': title, 'year': year, 'genres': raw_genres.split('|')}

# rating look-up for a specific movie-id and user-id: {movieId: {userId: rating}}
# TODO: replace with "user_rating_db"
ratings_per_mid_uid = dict()
# ratings-per-movie: {movieId: [rating, ...]} in the order the ratings appear in train_data
ratings_per_movie = dict()
for uid, mid, rating, _ in train_data:
    ratings_per_mid_uid.setdefault(mid, dict())[uid] = rating
    ratings_per_movie.setdefault(mid, []).append(rating)

# average rating per movie (arithmetic mean of all ratings the movie received)
avg_ratings = {
    mid: sum(received) / len(received)
    for mid, received in ratings_per_movie.items()
}

# for each user and each genre a list of ratings he/she has given:
# {userId: {genre: [rating, ...]}}; a rating is added once per genre of the rated movie
user_genre_ratings = dict()
for uid, mid, rating, _ in train_data:
    genres_of_user = user_genre_ratings.setdefault(uid, dict())
    for genre in movie_data[mid]['genres']:
        genres_of_user.setdefault(genre, []).append(rating)

# for each user the ratings he/she has given: {userId: [rating, ...]}
rat_per_user = dict()
# all existing ratings for users and movies: {userId: {movieId: rating}}
user_rating_db = dict()
for uid, mid, rating, _ in train_data:
    rat_per_user.setdefault(uid, []).append(rating)
    user_rating_db.setdefault(uid, dict())[mid] = rating

# for each user the average rating he/she has given
avg_rat_per_user = {
    uid: sum(given) / len(given)
    for uid, given in rat_per_user.items()
}

# all users: the distinct user ids of the training ratings. They are taken from
# the already converted train_data instead of a second pass over train_raw.
# The order of the resulting list is whatever the set iteration yields.
users = list({uid for uid, _, _, _ in train_data})

# all movies: the movie ids in the order they were inserted into movie_data
movies = list(movie_data)

# Tag look-ups, all filled in a single pass over tag_data. Every stored string
# is the concatenation of the matching tags, each one preceded by a space.
# tags database: {userId: {movieId: "tag tag ..."}}
tags = dict()
# movie tags: {movieId: "tag tag ..."}
movie_tags = dict()
# user tags: {userId: "tag tag ..."}
user_tags = dict()
for _, row in tag_data.iterrows():
    uid, mid, tag = int(row['userId']), int(row['movieId']), row['tag']
    tags_of_user = tags.setdefault(uid, dict())
    tags_of_user[mid] = tags_of_user.get(mid, "") + " " + tag
    movie_tags[mid] = movie_tags.get(mid, "") + " " + tag
    user_tags[uid] = user_tags.get(uid, "") + " " + tag

CPU times: user 13.6 s, sys: 24.6 ms, total: 13.6 s
Wall time: 13.7 s


#### Here we define general helper-methods.

In [5]:
%%time
# round a float to the nearest half-star
def half_star(x: float) -> float:
    """Return x rounded to a multiple of 0.5.

    The doubled value is rounded with the built-in round(), so exact ties
    follow its round-half-to-even rule (e.g. 2.25 -> 2.0, 2.75 -> 3.0).
    """
    half_steps = round(x + x)
    return half_steps / 2

# calculate the RSME (root of the mean squared error) between two lists
def RSME(calculated: list[float], actual: list[float]):
    """Return the square root of the mean squared difference of both lists.

    Both lists must have the same length (checked with an assert). The mean
    is taken over len(actual), so empty lists raise ZeroDivisionError.
    """
    assert len(calculated) == len(actual)
    squared_errors = [(predicted - truth) ** 2 for predicted, truth in zip(calculated, actual)]
    mean_squared_error = sum(squared_errors) / len(actual)
    return math.sqrt(mean_squared_error)

# test a specific rate-function on some data
def test_rate_on_data(rate: Callable[[int, int], float], data: list[tuple[int, int, float, int]], max_iter: int) -> float:
    """Evaluate a rate-function on rated samples and return (and print) the RSME.

    rate:     function mapping (user-id, movie-id) to a predicted rating; every
              prediction is rounded with half_star before it is compared.
    data:     list of (user-id, movie-id, rating, timestamp) tuples.
    max_iter: number of leading samples to evaluate; a negative value means
              "use all samples". Exactly max_iter samples are evaluated.

    Raises ValueError when there is nothing to evaluate (empty data or
    max_iter == 0), because the error of zero samples is undefined.
    """
    samples = data if max_iter < 0 else data[:max_iter]
    if not samples:
        raise ValueError("test_rate_on_data needs at least one sample to evaluate")

    calculated = [half_star(rate(uid, mid)) for uid, mid, _, _ in samples]
    actual = [rating for _, _, rating, _ in samples]
    rsme = RSME(calculated, actual)
    print(f"RSME: {rsme}")
    return rsme

# test a specific rate-function on the training data
def test_rate_on_train_data(rate: Callable[[int, int], float], max_iter: int) -> float:
    """Run test_rate_on_data with the module-level train_data and return its result."""
    return test_rate_on_data(rate=rate, data=train_data, max_iter=max_iter)

# test a specific rate-function on the sample solution
def test_rate_on_solution(rate: Callable[[int, int], float], max_iter: int) -> float:
    """Run test_rate_on_data with the module-level solution_data and return its result."""
    return test_rate_on_data(rate=rate, data=solution_data, max_iter=max_iter)

# create a submission using a specific rate-function
def generate_submission(rate: Callable[[int, int], float]) -> None:
    """Write the submission file for all rows of test_data.

    The file at SUBMISSION gets the header "Id,rating" followed by one line per
    test row: the row's Id and the prediction of `rate` for the row's user and
    movie, rounded with half_star.
    """
    # the with-block closes the file even if the rate-function raises midway
    with open(SUBMISSION, 'w') as f:
        f.write("Id,rating\n")
        for _, row in test_data.iterrows():
            rating = half_star(rate(int(row['userId']), int(row['movieId'])))
            f.write(f"{row['Id']},{rating}\n")

CPU times: user 96 µs, sys: 0 ns, total: 96 µs
Wall time: 101 µs


## 2. Preprocessing
#### First we compute the cosine-similarities between all user-tags

In [6]:
%%time
def get_user_tag_similarities():
    """Compute pairwise similarities between users from the tags they assigned.

    Each user's concatenated tag string (user_tags) is one document. The
    documents are TF-IDF vectorized (single words, English stop words removed)
    and compared with a linear kernel; with the vectorizer's default L2
    normalization this is the cosine similarity.

    Returns (uid_list, cosine_sims): uid_list maps a user-id to its row/column
    position in the square similarity matrix cosine_sims.
    """
    uid_list = {uid: position for position, uid in enumerate(user_tags)}
    documents = list(user_tags.values())

    # named "vectorizer" so the module-level tensorflow alias "tf" is not shadowed
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
    # fit_transform already yields the TF-IDF matrix of the documents, so they
    # do not have to be transformed a second time
    features = vectorizer.fit_transform(documents)

    cosine_sims = linear_kernel(features, features)
    return uid_list, cosine_sims

tag_uid_list, tag_cosine_sims = get_user_tag_similarities()

CPU times: user 26.7 ms, sys: 0 ns, total: 26.7 ms
Wall time: 36.9 ms


#### Then we calculate the Pearson Correlation Coefficient between all users

In [7]:
%%time
# pearson correlation coefficient between two users
def pcc(u1, u2):
    """Return the similarity of two users based on their mean-centered ratings.

    numerator:    sum over the movies rated by BOTH users of
                  (rating1 - mean1) * (rating2 - mean2)
    denominators: for each user the square root of the sum of squared
                  (rating - mean) over ALL movies that user rated

    The mean is the user's average rating (avg_rat_per_user), or 2.5 for a user
    without one. Returns 0 when either denominator is 0 (a user without ratings
    or whose ratings are all equal to the mean).

    Ratings are read from user_rating_db ({userId: {movieId: rating}});
    ratings_per_movie only holds plain rating lists per movie and therefore
    cannot be used to look up the rating of a specific user.
    """
    ratings1 = user_rating_db.get(u1, {})
    ratings2 = user_rating_db.get(u2, {})
    avg_rat1 = avg_rat_per_user.get(u1, 2.5)
    avg_rat2 = avg_rat_per_user.get(u2, 2.5)

    denominator1 = math.sqrt(sum((rating - avg_rat1) ** 2 for rating in ratings1.values()))
    denominator2 = math.sqrt(sum((rating - avg_rat2) ** 2 for rating in ratings2.values()))
    if denominator1 == 0 or denominator2 == 0:
        return 0

    # only movies rated by both users contribute to the numerator
    nominator = sum(
        (ratings1[mid] - avg_rat1) * (ratings2[mid] - avg_rat2)
        for mid in ratings1.keys() & ratings2.keys()
    )
    return nominator / (denominator1 * denominator2)

def tagsim(u1, u2):
    """Return the tag-based cosine similarity of two users, or -1 if unavailable.

    -1 is returned when at least one of the users has no entry in tag_uid_list,
    i.e. no row in the tag similarity matrix.
    """
    idx1 = tag_uid_list.get(u1)
    idx2 = tag_uid_list.get(u2)
    # explicit check instead of a bare "except: pass", which would also hide
    # unrelated errors (and even KeyboardInterrupt)
    if idx1 is None or idx2 is None:
        return -1
    return tag_cosine_sims[idx1][idx2]

# calculate pcc for all user-user pairs
def all_pccs(users):
    """Return {u1: {u2: pcc(u1, u2)}} for every pair of the given users.

    The result is stored in both directions, so every unordered pair (including
    a user paired with itself) is computed only once instead of once per
    direction. Prints the number of users first and then a progress counter
    every 1000 pairs.
    """
    pccs = {u: dict() for u in users}
    print(len(users))
    for i, (u1, u2) in enumerate(itertools.combinations_with_replacement(users, 2)):
        if (i + 1) % 1000 == 0:
            print(i)
        sim = pcc(u1, u2)
        #tsim = tagsim(u1, u2)
        #if tsim != -1:
        #    sim += tagsim(u1, u2)
        #    sim /= 2
        pccs[u1][u2] = sim
        pccs[u2][u1] = sim
    return pccs

# load all user ids that appear in the train-data
def get_all_users():
    """Return the set of distinct user-ids found in train_data."""
    return {uid for uid, _, _, _ in train_data}

# Calculate Pearson Correlation Coefficients and save them to a file
# Note: this takes around 50min
def calculate_pccs():
    """Compute the coefficients of all user pairs and write them to PCC.

    The file starts with the header "u1,u2,sim" that load_pccs relies on,
    followed by one "u1,u2,sim" line per stored pair.
    """
    pccs = all_pccs(users)

    # the with-block closes the file even if writing fails midway
    with open(PCC, 'w') as f:
        f.write("u1,u2,sim\n")
        for u1, sims in pccs.items():
            for u2, sim in sims.items():
                f.write(f"{u1},{u2},{sim}\n")

# Load Pearson Correlation Coefficients from file
def load_pccs(path=None):
    """Load coefficients from a CSV file with the columns u1, u2 and sim.

    path: file to read; defaults to PCC_INPUT.
    Returns {u1: {u2: sim}}; every row is stored in both directions.
    """
    if path is None:
        path = PCC_INPUT
    pcc_raw = pd.read_csv(path)
    pcc_data = dict()
    # iterate the columns directly; DataFrame.iterrows() is much slower here
    for u1, u2, sim in zip(pcc_raw['u1'], pcc_raw['u2'], pcc_raw['sim']):
        u1, u2, sim = int(u1), int(u2), float(sim)
        pcc_data.setdefault(u1, dict())
        pcc_data.setdefault(u2, dict())
        pcc_data[u1][u2] = sim
        pcc_data[u2][u1] = sim
    return pcc_data

def get_pccs():
    """Return the coefficients, computing them only when no file exists yet.

    The precomputed input file (PCC_INPUT) is preferred, then a file written to
    PCC by an earlier run. Otherwise the coefficients are calculated, written
    to PCC and loaded from there.
    """
    for candidate in (PCC_INPUT, PCC):
        if os.path.exists(candidate):
            return load_pccs(candidate)
    calculate_pccs()
    return load_pccs(PCC)

pcc_data = get_pccs()

CPU times: user 24.5 s, sys: 43.8 ms, total: 24.6 s
Wall time: 24.6 s


#### We also tokenize and stem the data scraped from TMDB

In [8]:
# TODO: remove this as it is currently not needed
"""%%time
def get_tmdb_data():
    stemmer = PorterStemmer() 
    trans = str.maketrans(string.punctuation, " " * len(string.punctuation))

    tmdb_data = dict()
    for i, jsn in enumerate(tmdb_json):
        if jsn != {} and 'overview' in jsn:
            text = jsn['overview'].strip().translate(trans)
            tokens = nltk.word_tokenize(text)
            stemmed = [stemmer.stem(word.lower()) for word in tokens if word not in stopwords.words('english')]        
            tmdb_data[jsn['id']] = stemmed
    return tmdb_data

tmdb_data = get_tmdb_data()"""

'%%time\ndef get_tmdb_data():\n    stemmer = PorterStemmer() \n    trans = str.maketrans(string.punctuation, " " * len(string.punctuation))\n\n    tmdb_data = dict()\n    for i, jsn in enumerate(tmdb_json):\n        if jsn != {} and \'overview\' in jsn:\n            text = jsn[\'overview\'].strip().translate(trans)\n            tokens = nltk.word_tokenize(text)\n            stemmed = [stemmer.stem(word.lower()) for word in tokens if word not in stopwords.words(\'english\')]        \n            tmdb_data[jsn[\'id\']] = stemmed\n    return tmdb_data\n\ntmdb_data = get_tmdb_data()'

#### Lastly we compute the cosine-similarities between all movies

In [9]:
%%time
def get_movie_similarities():
    """Compute pairwise content similarities between all movies of link_data.

    The document of a movie is its scraped TMDB overview (matched through the
    tmdbId column; empty when there is none) followed by all tags assigned to
    the movie. The documents are TF-IDF vectorized (word 1- to 5-grams, English
    stop words removed) and compared with a linear kernel; with the
    vectorizer's default L2 normalization this is the cosine similarity.

    Returns (mid_list, cosine_sims): mid_list maps a movie-id to its row/column
    position in the square similarity matrix cosine_sims.
    """
    # TMDB-id -> overview text, for all scraped entries that carry an overview
    raw_overviews = {jsn['id']: jsn['overview'] for jsn in tmdb_json if 'overview' in jsn}

    mid_list = dict()
    documents = []
    for mid, tmdb_id in zip(link_data['movieId'], link_data['tmdbId']):
        mid = int(mid)
        overview = ""
        # a movie may have no TMDB-id (missing value) or no scraped overview
        if pd.notna(tmdb_id):
            overview = raw_overviews.get(int(tmdb_id)) or ""
        # add all tags associated to this movie
        if mid in movie_tags:
            overview += " " + movie_tags[mid]
        # store the position of the document, which is also its matrix row,
        # independent of the index labels of link_data
        mid_list[mid] = len(documents)
        documents.append(overview)

    # named "vectorizer" so the module-level tensorflow alias "tf" is not shadowed
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,5), min_df = 1, stop_words = 'english')
    # fit_transform already yields the TF-IDF matrix of the documents, so they
    # do not have to be transformed a second time
    features = vectorizer.fit_transform(documents)

    cosine_sims = linear_kernel(features, features)
    return mid_list, cosine_sims

mid_list, cosine_sims = get_movie_similarities()

CPU times: user 10.4 s, sys: 1.35 s, total: 11.8 s
Wall time: 11.8 s


## 3. Define Different Rating Methods
### 3.1 Rate everything 3.5 Stars

In [10]:
%%time
# Baseline: every user/movie combination gets the same rating of 3.5 stars
def rate_35(uid, mid):
    """Return 3.5 regardless of the given user-id and movie-id."""
    constant_rating = 3.5
    return constant_rating

# test_rate_on_train_data(rate_35, 1000)
# test_rate_on_solution(rate_35, -1)
#generate_submission(rate_35)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 9.3 µs


### 3.2 Movie average-rating

In [11]:
%%time
# Predict the average of all training ratings the movie has received
def rate_movie_average(uid, mid):
    """Return avg_ratings[mid], or 3.5 for a movie without an average."""
    return avg_ratings.get(mid, 3.5)

# test_rate_on_train_data(rate_movie_average, 1000)
# test_rate_on_solution(rate_movie_average, -1)
#generate_submission(rate_movie_average)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10 µs


### 3.3 User average-rating

In [12]:
%%time
# Here we simply return the average rating a user has given in the past
def rate_user_average(uid, mid):
    """Return the user's average training rating.

    Falls back to 3.5 for a user without training ratings (the same default
    rate_movie_average uses for unknown movies) instead of raising KeyError.
    """
    return avg_rat_per_user.get(uid, 3.5)

# test_rate_on_train_data(rate_user_average, 1000)
# test_rate_on_solution(rate_user_average, -1)
# generate_submission(rate_user_average)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.78 µs


### 3.4 Average between 3.2 and 3.3

In [13]:
%%time
# Predict the midpoint between the movie's average rating and the user's average rating
def rate_average_movie_and_user(uid, mid):
    """Return the mean of rate_movie_average and rate_user_average for (uid, mid)."""
    movie_part = rate_movie_average(uid, mid)
    user_part = rate_user_average(uid, mid)
    return (movie_part + user_part) / 2

#test_rate_on_train_data(rate_average_movie_and_user, 1000)
# test_rate_on_solution(rate_average_movie_and_user, -1)
# generate_submission(rate_average_movie_and_user)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 34.8 µs


### 3.5 User-based Collaborative Filtering

In [14]:
%%time
def rate_user_based(uid, mid):
    """Predict a rating with user-based collaborative filtering.

    prediction = avg(uid) + sum(sim * (rating(u2) - avg(u2))) / sum(|sim|)
    summed over all users u2 that rated the movie in the training data, where
    sim is the stored similarity (pcc_data) between uid and u2.

    Fallbacks: a user without training ratings is treated as having an average
    of 3.5, and a missing similarity entry counts as 0. If the weights sum to
    0, only the user's average is returned. That happens when nobody rated the
    movie in the training data or when every stored similarity to the raters
    is 0 (pcc returns 0 for users whose ratings have no variance).
    """
    avg = avg_rat_per_user.get(uid, 3.5)
    sims = pcc_data.get(uid, {})

    nom, denom = 0, 0
    for u2, rating in ratings_per_mid_uid.get(mid, {}).items():
        sim = sims.get(u2, 0)
        nom += sim * (rating - avg_rat_per_user[u2])
        denom += abs(sim)

    if denom == 0:
        return avg
    return avg + nom / denom

# test_rate_on_train_data(rate_user_based, 1000)
# test_rate_on_solution(rate_user_based, -1)
# generate_submission(rate_user_based)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 11 µs


### 3.6 Item-based Collaborative Filtering

In [15]:
%%time
def rate_item_based(uid, mid):
    """Predict a rating with item-based (content) collaborative filtering.

    The prediction is the average of the user's own training ratings, weighted
    by the similarity (cosine_sims) between the rated movie and `mid`.

    Returns 3.5 when no prediction is possible: the movie or the user is
    unknown, or all similarities are 0. The latter happens e.g. if the scraped
    data didn't contain an overview for the film with id "mid".
    """
    if mid not in mid_list or uid not in user_rating_db:
        return 3.5

    # the row of the target movie is the same for every rated movie: look it up once
    sims_to_mid = cosine_sims[mid_list[mid]]

    nom = 0
    denom = 0
    for mid2, rating in user_rating_db[uid].items():
        idx2 = mid_list.get(mid2)
        if idx2 is None:
            continue  # rated movie without a similarity row: it cannot contribute
        sim = sims_to_mid[idx2]
        nom += sim * rating
        denom += abs(sim)

    if denom == 0:
        return 3.5
    return nom / denom

# test_rate_on_train_data(rate_item_based, 1000)
# test_rate_on_solution(rate_item_based, -1)
# generate_submission(rate_item_based)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 10.3 µs


### 3.7 Average Item-based User-based Collaborative Filtering

In [16]:
%%time
def rate_item_user_based(uid, mid):
    """Return the mean of the user-based and the item-based prediction for (uid, mid)."""
    user_based = rate_user_based(uid, mid)
    item_based = rate_item_based(uid, mid)
    return (user_based + item_based) / 2

# test_rate_on_train_data(rate_item_user_based, 1000)
# test_rate_on_solution(rate_item_user_based, -1)
generate_submission(rate_item_user_based)

CPU times: user 19.6 s, sys: 4.21 ms, total: 19.6 s
Wall time: 19.6 s


### 3.8 Average over all previous methods

In [17]:
%%time
def rate_all_average(uid, mid):
    """Return the unweighted mean of the four predictors for (uid, mid).

    Averaged are: movie average, user average, user-based and item-based
    collaborative filtering.
    """
    total = rate_movie_average(uid, mid) + rate_user_average(uid, mid)
    total = total + rate_user_based(uid, mid)
    total = total + rate_item_based(uid, mid)
    return total / 4.

# test_rate_on_train_data(rate_all_average, 1000)
# test_rate_on_solution(rate_all_average, -1)
# generate_submission(rate_all_average)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.54 µs


### 3.9 Neural Network using Tensorflow
#### 3.9.1 Generating Inputs and Outputs

In [18]:
"""
%%time
# Neural Net Inputs and Outputs
def getGenreBitmap(mid):
    bitmap = []
    genres = ['Action', 'Adventure', 'Animation', "Children's Comedy", 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', '(no genres listed)']
    for genre in genres:
        bitmap.append((0,1)[genre in movie_data[mid]['genres']])
    return bitmap

def getUserBitmap(uid):
    bitmap = len(users) * [0]
    bitmap[users.index(uid)] = 1
    return bitmap

def getMovieBitmap(mid):
    bitmap = len(movies) * [0]
    bitmap[movies.index(mid)] = 1
    return bitmap

def getData(uid, mid):
    mBitmap = getGenreBitmap(mid)
    uBitmap = getUserBitmap(uid)
    return mBitmap + uBitmap

nn_training = {'in': [], 'out': []}
for uid, mid, rating, _ in train_data:
    nn_training['in'].append(getData(uid, mid))
    nn_training['out'].append(rating)

nn_testing = {'in': [], 'out': []}
for uid, mid, rating, _ in solution_data:
    nn_testing['in'].append(getData(uid, mid))
    nn_testing['out'].append(rating)

X_train, X_test, y_train, y_test = train_test_split(np.array(nn_training['in']), np.array(nn_training['out']))
"""

'\n%%time\n# Neural Net Inputs and Outputs\ndef getGenreBitmap(mid):\n    bitmap = []\n    genres = [\'Action\', \'Adventure\', \'Animation\', "Children\'s Comedy", \'Crime\', \'Documentary\', \'Drama\', \'Fantasy\', \'Film-Noir\', \'Horror\', \'Musical\', \'Mystery\', \'Romance\', \'Sci-Fi\', \'Thriller\', \'War\', \'Western\', \'(no genres listed)\']\n    for genre in genres:\n        bitmap.append((0,1)[genre in movie_data[mid][\'genres\']])\n    return bitmap\n\ndef getUserBitmap(uid):\n    bitmap = len(users) * [0]\n    bitmap[users.index(uid)] = 1\n    return bitmap\n\ndef getMovieBitmap(mid):\n    bitmap = len(movies) * [0]\n    bitmap[movies.index(mid)] = 1\n    return bitmap\n\ndef getData(uid, mid):\n    mBitmap = getGenreBitmap(mid)\n    uBitmap = getUserBitmap(uid)\n    return mBitmap + uBitmap\n\nnn_training = {\'in\': [], \'out\': []}\nfor uid, mid, rating, _ in train_data:\n    nn_training[\'in\'].append(getData(uid, mid))\n    nn_training[\'out\'].append(rating)\n\nnn_t

#### 3.9.2 Neural Network Definition

In [19]:
"""
%%time
model = tf.keras.Sequential([
    tf.keras.layers.Dense(300, activation='relu', input_shape=(len(nn_training['in'][0]),)),
    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(1, activation='linear')
])

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

model.summary()
"""

"\n%%time\nmodel = tf.keras.Sequential([\n    tf.keras.layers.Dense(300, activation='relu', input_shape=(len(nn_training['in'][0]),)),\n    tf.keras.layers.Dense(200, activation='relu'),\n    tf.keras.layers.Dense(50, activation='relu'),\n    tf.keras.layers.Dense(1, activation='linear')\n])\n\nmodel.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])\n\nmodel.summary()\n"

#### 3.9.3 Train Neural Network

In [20]:
"""
%%time
# Neural Net Training
# TODO: do this in preprocessing and save the resulting network in a file
num_epochs = 5
model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test))
loss, mae = model.evaluate(X_test, y_test)
print(f'Mean Absolute Error: {mae}')
"""

"\n%%time\n# Neural Net Training\n# TODO: do this in preprocessing and save the resulting network in a file\nnum_epochs = 5\nmodel.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test))\nloss, mae = model.evaluate(X_test, y_test)\nprint(f'Mean Absolute Error: {mae}')\n"

#### 3.9.4 Use the Neural Network for predictions

In [21]:
"""%%time
# Test using sample solution
def nn_predict(data):
    predictions_raw = model.predict(np.array(data))
    calculated = []
    for pred in predictions_raw:
        calculated.append(half_star(pred[0]))
    return calculated

calculated = nn_predict(nn_testing['in'])
actual = nn_testing['out']
rsme = RSME(calculated, actual)
print(f"RSME: {rsme}")
"""

'%%time\n# Test using sample solution\ndef nn_predict(data):\n    predictions_raw = model.predict(np.array(data))\n    calculated = []\n    for pred in predictions_raw:\n        calculated.append(half_star(pred[0]))\n    return calculated\n\ncalculated = nn_predict(nn_testing[\'in\'])\nactual = nn_testing[\'out\']\nrsme = RSME(calculated, actual)\nprint(f"RSME: {rsme}")\n'

#### 3.9.5 Generate submission based on Neural Net predictions

In [22]:
# Generate the submission
def generate_nn_submission(predictions=None):
    """Write the given predictions to the submission file.

    predictions: iterable of ratings, one per submission line. When omitted,
                 the module-level list `calculated` is used; a NameError with
                 an explanatory message is raised if it does not exist.

    The file at SUBMISSION gets the header "Id,rating" followed by one
    "<position>,<rating>" line per prediction.
    NOTE(review): the Id column is the 0-based position of the prediction, not
    the Id column of test_data as in generate_submission — confirm they match.
    """
    if predictions is None:
        if 'calculated' not in globals():
            raise NameError(
                "name 'calculated' is not defined: run the prediction cell first "
                "or pass the predictions explicitly"
            )
        predictions = calculated

    # the with-block closes the file even if writing fails midway
    with open(SUBMISSION, 'w') as f:
        f.write("Id,rating\n")
        for i, rating in enumerate(predictions):
            f.write(f"{i},{rating}\n")

# generate_nn_submission()