In [1]:
from IPython.display import Markdown, display # for markdown text
import json # for json methods
import pprint # to print human readable dictionary
import pandas as pd # for visualizations
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer # for TF-IDF

In [2]:
def printmd(string):
    display(Markdown(string))

# FETCHING THE DATA

## All the topics in our database

In [3]:
topics = json.load(open('topics.txt'))
pprint.pprint(topics)

{'16': 'openhardware',
 '18': 'Data Science',
 '19': 'Big Data',
 '20': 'Artificial Intelligence',
 '21': 'Business Intelligence',
 '31': 'arduino',
 '32': 'raspberry pi',
 '33': '3d printer',
 '36': 'Deep Learning',
 '37': 'IoT',
 '38': '3d printing',
 '39': 'open hardware',
 '56': 'Wearable',
 '57': 'Sustainable finance',
 '59': 'Sustainable Finance',
 '60': 'Climate Finance',
 '61': 'Green Bonds',
 '62': 'Green Economy'}


## The parameters in our scenario
We are observing an audience, which is defined by two constraints: a topic and a location. Our example is the audience in Italy interested in the topic: Arduino.

In [4]:
TOPIC_ID=31 # topic = arduino
LOCATION = 'italy'
SIGNAL_STRENGTH = 0 # this value indicates the min number of influencers the retrieved audience members follow within the topic
LIMIT = 40 # number of audience members to consider
TESTING_SET_SIZE=10
HOW_MANY_TWEETS = 50 # amount of most recent tweets (including retweets) to be retrieved to consider in our recommendation engine
INCLUDE_RETWEETS = True

In [5]:
rated_audience_dict = json.load(open('rated_audience.txt'))
unrated_audience_dict = json.load(open('unrated_audience.txt'))

In [6]:
printmd("## An example Twitter profile with all the data fields at this point.")
printmd("### Topic: " + topics[str(TOPIC_ID)])
pprint.pprint(next (iter (rated_audience_dict.values())))

## An example Twitter profile with all the data fields at this point.

### Topic: arduino

{'description': 'Chief Innovation Officer at NTT DATA Italia , proud father of '
                'two beautiful girls, a professor, a technology evangelist, an '
                'holistic thinker and a gentleman.',
 'ground_truth_rating': 1.0,
 'hashtags': 'Milano CheTempoFa Milano CheTempoFa foi13 Cefriel fvw2013 '
             'fvw2013 fvw2013 storytelling Vajont Milano CheTempoFa '
             'StartupWeekend GrandC4Picasso makerfairerome MakerFaireRome '
             'GrandC4Picasso GrandC4Picasso',
 'influencers': '266400754 84094835 767285',
 'location': 'Milan, Italy',
 'screen_name': 'funkysurfer',
 'tweets': 'at Cascina Matiot Disorders of Con Edi Touch is easier than yours '
           'via Startup weekend 3 days at Milan Decoded Milan capital of the '
           'street October 2013 Rain Massima Minima The new science is an open '
           'narration thanks to This project was born via Milan handmade with '
           'Craft Camp of a Pisa ItCup A Roncade in Veneto via Oc

In [7]:
# Computes the TF-IDF values for the given corpus.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 0, stop_words = 'english');
def get_tfidf(corpus):
    return tf.fit_transform(corpus.copy()).todense();

## Separate the data into different arrays

In [8]:
screen_names = [aud['screen_name'] for aud in rated_audience_dict.values()]
influencers_corpus = [aud['influencers'] for aud in rated_audience_dict.values()]
tweets_corpus = [aud['tweets'] for aud in rated_audience_dict.values()]
hashtags_corpus = [aud['hashtags'] for aud in rated_audience_dict.values()]
description_corpus = [aud['description'] for aud in rated_audience_dict.values()]
ground_truth_ratings = np.array([2*aud['ground_truth_rating'] for aud in rated_audience_dict.values()])
#print(tweets_corpus)

In [115]:
# INFLUENCER MATRICES
tfidf_influencer_matrix = np.array(get_tfidf(influencers_corpus)).T;
binary_influencer_matrix = tfidf_influencer_matrix.copy()
binary_influencer_matrix[binary_influencer_matrix>0]=1
# TWEET MATRICES
tfidf_tweets_matrix=np.array(get_tfidf(tweets_corpus)).T;
binary_tweets_matrix = tfidf_tweets_matrix.copy()
binary_tweets_matrix[binary_tweets_matrix>0]=1
# DESCRIPTION MATRICES
tfidf_description_matrix = np.array(get_tfidf(description_corpus)).T;
binary_description_matrix = tfidf_description_matrix.copy()
binary_description_matrix[binary_description_matrix>0]=1
# HASHTAGS MATRICES
tfidf_hashtags_matrix = np.array(get_tfidf(hashtags_corpus)).T;
binary_hashtags_matrix = tfidf_hashtags_matrix.copy()
binary_hashtags_matrix[binary_hashtags_matrix>0]=1
printmd("#### Matrix dimensions")
printmd("Influencer matrix: " + str(tfidf_influencer_matrix.shape))
printmd("Tweets matrix: " +str(tfidf_tweets_matrix.shape))
printmd("Description matrix: " +str(tfidf_description_matrix.shape))
printmd("Hashtags matrix: " +str(tfidf_hashtags_matrix.shape))

#### Matrix dimensions

Influencer matrix: (21, 40)

Tweets matrix: (4182, 40)

Description matrix: (250, 40)

Hashtags matrix: (974, 40)

# METHODS FOR RECOMMENDATION

# Method A
## Logistic Regression using the similarities coming from user profiling (fixed parameters)

In [10]:
from tfidf_vectorizer import TwitterAccountSimilarityFinder
import mord as md
from sklearn import datasets, linear_model

In [11]:
from scipy import sparse
def get_user_profile(tfidf_matrix, ratings):
    return np.multiply(tfidf_matrix.T,ratings[:, np.newaxis]).sum(axis=0).reshape(1,tfidf_matrix.shape[0])

In [12]:
def cos_sim(tfidf_matrix, user_profile):
    norm = np.linalg.norm(user_profile);
    return 1.0* tfidf_matrix.T.dot(user_profile.T)/ norm
    

In [13]:
SUBSAMPLING_COUNT = 100 # we will subsample this many times and take the average error for evaluation
rated_audience_dict_ids = list(rated_audience_dict.keys())

In [390]:
mean_square_errors = []
prints_enabled = False
for iteration in range(SUBSAMPLING_COUNT):
    predictions=[]
    TESTING_SET_IDS = np.random.choice(rated_audience_dict_ids, size=TESTING_SET_SIZE, replace=False)
    if (prints_enabled): print(TESTING_SET_IDS)
    ratings = np.array([0 if id in TESTING_SET_IDS else int(2*aud['ground_truth_rating']) for id, aud in rated_audience_dict.items()])
    if (prints_enabled): display(ratings)
        
    # FIND USER PROFILES
    influencer_user_profile = get_user_profile(tfidf_influencer_matrix, ratings)
    tweets_user_profile = get_user_profile(tfidf_tweets_matrix, ratings)
    description_user_profile = get_user_profile(tfidf_description_matrix, ratings)
    hashtags_user_profile = get_user_profile(tfidf_hashtags_matrix, ratings)

    # FIND THE COSINE SIMILARITIES
    # THESE WILL THEN BE USED AS FEATURES IN REGRESSION
    influencerSimilarities = cos_sim(tfidf_influencer_matrix, influencer_user_profile)
    tweetSimilarities = cos_sim(tfidf_tweets_matrix, tweets_user_profile)
    descriptionSimilarities = cos_sim(tfidf_description_matrix, description_user_profile)
    hashtagSimilarities = cos_sim(tfidf_hashtags_matrix, hashtags_user_profile)
    
    profile_count = len(rated_audience_dict)
    avgTweetSim = np.mean([e for e in tweetSimilarities if e!=0]) # if we cannot fetch tweets of a profile, we assign his tweets an average similarity score
    avgDescriptionSim = np.mean([e for e in descriptionSimilarities if e!=0]) # if the description of a profile is empty, we assign his descripion an average similarity score
    avgHashtagSim = np.mean([e for e in hashtagSimilarities if e!=0]) # if we cannot fetch tweets of a profile, we assign his hashtags an average similarity score
    
    profiles =[]
    training_indices=[]
    for i in range(profile_count):
        if tweetSimilarities[i]==0: tweetSimilarities[i]= avgTweetSim
        if descriptionSimilarities[i]==0: descriptionSimilarities[i]= avgDescriptionSim
        if hashtagSimilarities[i]==0: hashtagSimilarities[i] = avgHashtagSim
        if ratings[i]!=0: training_indices.append(i)
        profile = {
        'index': i,
        'screen_name':screen_names[i],
        'set': "testing" if ratings[i]==0 else "training",
        'ground_truth':ground_truth_ratings[i],
        'infSim': influencerSimilarities[i].item(0),
        'tweetSim': tweetSimilarities[i].item(0),
        'descSim': descriptionSimilarities[i].item(0),
        'hashtagSim': hashtagSimilarities[i].item(0),
        'score': 0
        }
        count=0
        for key in profile.keys():
            if 'Sim' in key: 
                profile['score']+=profile[key]
                count+=1
        profile['score']/=count*1.0
        profiles.append(profile)

    profiles=pd.DataFrame(profiles)
    if (prints_enabled):
        display(profiles[['screen_name','set','ground_truth','score','infSim','tweetSim','descSim','hashtagSim']].sort_values(by='score',ascending=False).round(2))
        printmd("### Naive approach\n Score is the average of similarities.")
        profiles.plot.scatter('score','ground_truth') # score is the average of the similarities
    
    # Combine the similarities and use them as features to feed to a logistic regressor.
    # uncomment to add the similarity into regression

    featureVectors = influencerSimilarities
    featureVectors = np.column_stack((featureVectors,tweetSimilarities))
    #featureVectors = np.column_stack((featureVectors, descriptionSimilarities))
    #featureVectors = np.column_stack((featureVectors, hashtagSimilarities))

    X = np.array([featureVectors[i] for i in training_indices])
    X_binary = X.copy()
    X_binary[X_binary>0]=1
    
    Y = np.array([ratings[i] for i in training_indices])
    Y_binary = [round(e/10) for e in Y]
    
    # Ordinal Regression
    #classifier = linear_model.LinearRegression()
    # Logistic Regression
    #classifier = md.LogisticIT() #Default parameters: alpha=1.0, verbose=0, maxiter=10000
    
    classifier = linear_model.LogisticRegression(C=1e5)
    #print(X)
    #print(Y)
    classifier.fit(X, Y)

    predictions = classifier.predict(featureVectors)
    profiles['predicted_rating']=predictions
    profiles["squared_error"]=(profiles["ground_truth"]-profiles["predicted_rating"])**2
    if (prints_enabled):
        display(profiles[['screen_name','set','ground_truth', 'predicted_rating','squared_error','score','infSim','tweetSim','descSim','hashtagSim']].sort_values(by='score',ascending=False).round(2))

    evaluation = pd.DataFrame()
    evaluation['mean_squared_error']=profiles.groupby(by='set')['squared_error'].mean()
    mean_square_error = {
        'training':evaluation.filter(like='training', axis=0)['mean_squared_error'].iloc[0],
        'testing':evaluation.filter(like='testing', axis=0)['mean_squared_error'].iloc[0],
        }
    mean_square_errors.append(mean_square_error)
    
Avg_MSE_training = np.mean([e['training'] for e in mean_square_errors])
Avg_MSE_testing = np.mean([e['testing'] for e in mean_square_errors])
print("Avg Training Mean Squared Error: " + str(Avg_MSE_training))
print("Avg Testing Mean Squared Error: " + str(Avg_MSE_testing))

Avg Training Mean Squared Error: 2.2795
Avg Testing Mean Squared Error: 14.597


# Method B
## Logistic Regression using the similarities coming from user profiling (relaxed parameters)

# MATRIX METHODS

In [14]:
import math

In [15]:
# Matrix Factorization via multiplicative update rule
# Original Author: Ali Taylan Cemgil from Bogazici University
def nmf_kl_multiplicative(D, M, W, H, EPOCH=5000):
    MD = D.copy()
    MD[M==0] = 0
    for e in range(EPOCH):
        Xhat = W.dot(H)
        W=W*np.array(((MD/Xhat).dot(H.T)/np.dot(M, H.T)))
        Xhat = W.dot(H)
        H = H*np.array((W.T.dot(MD/Xhat)/np.dot(W.T, M)))
        #print(np.sum(np.abs(MD - M*Xhat))/np.sum(M))
    return W, H

In [16]:
# Regularized Matrix Factorization 
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        if(i==0):
                            P[i][k] = P[i][k] + alpha * (4 * eij * Q[k][j] - beta * P[i][k])
                        else:         
                            P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        if(k==0):
                            Q[k][j] = Q[k][j] + alpha * (4 * eij * P[i][k] - beta * Q[k][j])
                        else:
                            Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    return P, Q.T

In [116]:
tfidf_data_matrix =[]
tfidf_data_matrix = tfidf_influencer_matrix # first add influencers
tfidf_data_matrix = np.append(tfidf_data_matrix,tfidf_tweets_matrix,axis=0) # then tweets
tfidf_data_matrix = np.append(tfidf_data_matrix,tfidf_description_matrix,axis=0) # then descriptions
tfidf_data_matrix = np.append(tfidf_data_matrix,tfidf_hashtags_matrix,axis=0) # then hashtags

binary_data_matrix =[]
binary_data_matrix = binary_influencer_matrix # first add influencers
binary_data_matrix = np.append(binary_data_matrix,binary_tweets_matrix,axis=0) # then tweets
binary_data_matrix = np.append(binary_data_matrix,binary_description_matrix,axis=0) # then descriptions
binary_data_matrix = np.append(binary_data_matrix,binary_hashtags_matrix,axis=0) # then hashtags

#display(pd.DataFrame(binary_data_matrix))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Method C 
## Non-negative Matrix Factorization (NMF) without the ratings row = Document Clustering

In [99]:
mean_square_errors = []
prints_enabled = False
#Rank
R = 10
# Data
Nr = tfidf_data_matrix.shape[0]
Nc = tfidf_data_matrix.shape[1]

for iteration in range(SUBSAMPLING_COUNT):
    predictions=[]
    TESTING_SET_IDS = np.random.choice(rated_audience_dict_ids, size=TESTING_SET_SIZE, replace=False)
    #if (prints_enabled): print(TESTING_SET_IDS)
    ratings = np.array([0 if id in TESTING_SET_IDS else int(2*aud['ground_truth_rating']) for id, aud in rated_audience_dict.items()])
    if (prints_enabled): 
        printmd("Ratings:")
        display(ratings)

    # Initialize W and H with random numbers
    W = np.random.rand(Nr, R)*100
    H = np.random.rand(R, Nc)*100

    Mask = np.ones_like(tfidf_data_matrix)
    Mask[np.isnan(tfidf_data_matrix)] = 0

    W,H = nmf_kl_multiplicative(tfidf_data_matrix, Mask, W, H, EPOCH=1)

    # Cluster numbers holds which cluster each user is assigned to
    cluster_numbers = np.argmax(H,axis=0)
    # Ratings each cluster is assigned to (initialized to zeros)
    cluster_ratings = np.zeros(R)

    for i in range(len(cluster_numbers)):
        cluster_no = cluster_numbers[i]
        cluster_ratings[cluster_no]+=1.0*ratings[i] 

    for i in range(R):
        cluster_size = float(len(cluster_numbers[cluster_numbers==i]))
        if cluster_size!=0:
            cluster_ratings[i]/=cluster_size
   
    predictions = [np.around(cluster_ratings[cluster_no]) for cluster_no in cluster_numbers]    
    # Found the cluster ratings
    cluster_ratings = [{'cluster_no':i, 'rating':cluster_ratings[i]} for i in range(R)]
    cluster_ratings = pd.DataFrame(cluster_ratings).round(2)
    if (prints_enabled): 
        printmd("Cluster numbers:")
        display(cluster_numbers)
        printmd("Cluster ratings:")
        display(cluster_ratings)
    
    results = pd.DataFrame()
    results['set'] = ['training' if ratings[i]!=0 else 'testing' for i in range(len(ratings)) ]
    results['ground_truth']=ground_truth_ratings
    results['predicted_rating']=predictions
    results["squared_error"]=(results["ground_truth"]-results["predicted_rating"])**2
    
    if (prints_enabled):
        display(results[['set','ground_truth','predicted_rating','squared_error']].round(2))

    evaluation = pd.DataFrame()
    evaluation['mean_squared_error']=results.groupby(by='set')['squared_error'].mean()
    mean_square_error = {
        'training':evaluation.filter(like='training', axis=0)['mean_squared_error'].iloc[0],
        'testing':evaluation.filter(like='testing', axis=0)['mean_squared_error'].iloc[0],
        }
    mean_square_errors.append(mean_square_error)
    
Avg_MSE_training = np.mean([e['training'] for e in mean_square_errors])
Avg_MSE_testing = np.mean([e['testing'] for e in mean_square_errors])
print("Avg Training Mean Squared Error: " + str(Avg_MSE_training))
print("Avg Testing Mean Squared Error: " + str(Avg_MSE_testing))
    

Avg Training Mean Squared Error: 6.98333333333
Avg Testing Mean Squared Error: 12.642


# Method D 
## Non-negative Matrix Factorization (NMF) with the ratings row

In [126]:
mean_square_errors = []
prints_enabled = False
#Rank
R = 10
# Data
Nr = tfidf_data_matrix.shape[0]+1
Nc = tfidf_data_matrix.shape[1]

for iteration in range(SUBSAMPLING_COUNT):
    predictions=[]
    TESTING_SET_IDS = np.random.choice(rated_audience_dict_ids, size=TESTING_SET_SIZE, replace=False)
    #if (prints_enabled): print(TESTING_SET_IDS)
    ratings = np.array([0 if id in TESTING_SET_IDS else int(2*aud['ground_truth_rating']) for id, aud in rated_audience_dict.items()])
    if (prints_enabled): 
        printmd("Ratings:")
        display(ratings)

    tfidf_data_matrix_with_ratings = ratings.reshape(1,len(ratings))
    tfidf_data_matrix_with_ratings = np.append(tfidf_data_matrix_with_ratings,tfidf_data_matrix,axis=0)

    # Initialize W and H with random numbers
    W = np.random.rand(Nr, R)*100
    H = np.random.rand(R, Nc)*100

    Mask = np.ones_like(tfidf_data_matrix_with_ratings)
    Mask[np.isnan(tfidf_data_matrix_with_ratings)] = 0

    W,H = nmf_kl_multiplicative(tfidf_data_matrix_with_ratings, Mask, W, H, EPOCH=1)
    Xhat = W.dot(H)
    predictions = Xhat[0]
    predictions = np.around(10.0*predictions/max(predictions))
    
    results = pd.DataFrame()
    results['set'] = ['training' if ratings[i]!=0 else 'testing' for i in range(len(ratings)) ]
    results['ground_truth']=ground_truth_ratings
    results['predicted_rating']=predictions
    results["squared_error"]=(results["ground_truth"]-results["predicted_rating"])**2
    
    if (prints_enabled):
        display(results[['set','ground_truth','predicted_rating','squared_error']].round(2))

    evaluation = pd.DataFrame()
    evaluation['mean_squared_error']=results.groupby(by='set')['squared_error'].mean()
    mean_square_error = {
        'training':evaluation.filter(like='training', axis=0)['mean_squared_error'].iloc[0],
        'testing':evaluation.filter(like='testing', axis=0)['mean_squared_error'].iloc[0],
        }
    mean_square_errors.append(mean_square_error)
    
Avg_MSE_training = np.mean([e['training'] for e in mean_square_errors])
Avg_MSE_testing = np.mean([e['testing'] for e in mean_square_errors])
print("Avg Training Mean Squared Error: " + str(Avg_MSE_training))
print("Avg Testing Mean Squared Error: " + str(Avg_MSE_testing))

Avg Training Mean Squared Error: 9.347
Avg Testing Mean Squared Error: 9.592


In [152]:
mean_square_errors = []
prints_enabled = False
#Rank
R = 10
# Data
Nr = tfidf_data_matrix.shape[0]+1
Nc = tfidf_data_matrix.shape[1]
for T in np.arange(1,10,0.1):
    for iteration in range(1):
        predictions=[]
        TESTING_SET_IDS = np.random.choice(rated_audience_dict_ids, size=TESTING_SET_SIZE, replace=False)
        #if (prints_enabled): print(TESTING_SET_IDS)
        ratings = np.array([0 if id in TESTING_SET_IDS else int(2*aud['ground_truth_rating']) for id, aud in rated_audience_dict.items()])
        if (prints_enabled): 
            printmd("Ratings:")
            display(ratings)

        tfidf_data_matrix_with_ratings = ratings.reshape(1,len(ratings))
        tfidf_data_matrix_with_ratings = np.append(tfidf_data_matrix_with_ratings,tfidf_data_matrix,axis=0)

        # Initialize W and H with random numbers
        W = np.random.rand(Nr, R)*100
        H = np.random.rand(R, Nc)*100

        Mask = np.ones_like(tfidf_data_matrix_with_ratings)
        Mask[np.isnan(tfidf_data_matrix_with_ratings)] = 0

        W,H = nmf_kl_multiplicative(tfidf_data_matrix_with_ratings, Mask, W, H, EPOCH=1)
        Xhat = W.dot(H)
        predictions = Xhat[0]
        predictions = np.around(10.0*predictions/max(predictions))


        results = pd.DataFrame()
        results['set'] = ['training' if ratings[i]!=0 else 'testing' for i in range(len(ratings)) ]
        results['ground_truth']=[0 if r<=T else 1 for r in ground_truth_ratings] # binary ground truth
        results['predicted_rating']=[0 if r<=T else 1 for r in predictions] # binary predictions
        results["squared_error"]=(results["ground_truth"]-results["predicted_rating"])**2

        if (prints_enabled):
            display(results[['set','ground_truth','predicted_rating','squared_error']].round(2))

        evaluation = pd.DataFrame()
        evaluation['mean_squared_error']=results.groupby(by='set')['squared_error'].mean()
        mean_square_error = {
            'training':evaluation.filter(like='training', axis=0)['mean_squared_error'].iloc[0],
            'testing':evaluation.filter(like='testing', axis=0)['mean_squared_error'].iloc[0],
            }
        mean_square_errors.append(mean_square_error)
        
    #Avg_MSE_training = np.mean([e['training'] for e in mean_square_errors])
    Avg_MSE_testing = np.mean([e['testing'] for e in mean_square_errors])
    #print("Avg Training Mean Squared Error: " + str(Avg_MSE_training))
    print("Avg Testing Mean Squared Error: " + str(Avg_MSE_testing) + " Threshold: " + str(T))
    


Avg Testing Mean Squared Error: 0.1 Threshold: 1.0
Avg Testing Mean Squared Error: 0.15 Threshold: 1.1
Avg Testing Mean Squared Error: 0.133333333333 Threshold: 1.2
Avg Testing Mean Squared Error: 0.1 Threshold: 1.3
Avg Testing Mean Squared Error: 0.1 Threshold: 1.4
Avg Testing Mean Squared Error: 0.1 Threshold: 1.5
Avg Testing Mean Squared Error: 0.1 Threshold: 1.6
Avg Testing Mean Squared Error: 0.0875 Threshold: 1.7
Avg Testing Mean Squared Error: 0.0777777777778 Threshold: 1.8
Avg Testing Mean Squared Error: 0.07 Threshold: 1.9
Avg Testing Mean Squared Error: 0.0909090909091 Threshold: 2.0
Avg Testing Mean Squared Error: 0.125 Threshold: 2.1
Avg Testing Mean Squared Error: 0.146153846154 Threshold: 2.2
Avg Testing Mean Squared Error: 0.171428571429 Threshold: 2.3
Avg Testing Mean Squared Error: 0.186666666667 Threshold: 2.4
Avg Testing Mean Squared Error: 0.1875 Threshold: 2.5
Avg Testing Mean Squared Error: 0.194117647059 Threshold: 2.6
Avg Testing Mean Squared Error: 0.2055555555