In [89]:
from IPython.display import Markdown, display # for markdown text
import json # for json methods
import pprint # to print human readable dictionary
import pandas as pd # for visualizations
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer # for TF-IDF

In [90]:
def printmd(string):
    display(Markdown(string))

In [91]:
# Computes the TF-IDF values for the given corpus.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 0, stop_words = 'english');
def get_tfidf(corpus):
    return tf.fit_transform(corpus.copy()).todense();

# DATA PREPARATION

## All the topics in our database

In [92]:
topics = json.load(open('topics.txt'))
pprint.pprint(topics)

{'16': 'openhardware',
 '18': 'Data Science',
 '19': 'Big Data',
 '20': 'Artificial Intelligence',
 '21': 'Business Intelligence',
 '31': 'arduino',
 '32': 'raspberry pi',
 '33': '3d printer',
 '36': 'Deep Learning',
 '37': 'IoT',
 '38': '3d printing',
 '39': 'open hardware',
 '56': 'Wearable',
 '57': 'Sustainable finance',
 '59': 'Sustainable Finance',
 '60': 'Climate Finance',
 '61': 'Green Bonds',
 '62': 'Green Economy'}


## The parameters in our scenario
We are observing an audience, which is defined by two constraints: a topic and a location. Our example is the audience in Italy interested in the topic: Arduino.

In [93]:
TOPIC_ID=31 # topic = arduino
LOCATION = 'italy'
SIGNAL_STRENGTH = 0 # this value indicates the min number of influencers the retrieved audience members follow within the topic
LIMIT = 20 # number of audience members to consider
TESTING_SET_SIZE=5 
HOW_MANY_TWEETS = 50 # amount of most recent tweets (including retweets) to be retrieved to consider in our recommendation engine
INCLUDE_RETWEETS = True

In [94]:
rated_audience_dict = json.load(open('rated_audience.txt'))
unrated_audience_dict = json.load(open('unrated_audience.txt'))

In [95]:
printmd("## An example Twitter profile with all the data fields at this point.")
printmd("### Topic: " + topics[str(TOPIC_ID)])
pprint.pprint(next (iter (rated_audience_dict.values())))

## An example Twitter profile with all the data fields at this point.

### Topic: arduino

{'description': 'Chief Innovation Officer at NTT DATA Italia , proud father of '
                'two beautiful girls, a professor, a technology evangelist, an '
                'holistic thinker and a gentleman.',
 'ground_truth_rating': 1.0,
 'hashtags': 'Milano CheTempoFa Milano CheTempoFa foi13 Cefriel fvw2013 '
             'fvw2013 fvw2013 storytelling Vajont Milano CheTempoFa '
             'StartupWeekend GrandC4Picasso makerfairerome MakerFaireRome '
             'GrandC4Picasso GrandC4Picasso',
 'influencers': '266400754 84094835 767285',
 'location': 'Milan, Italy',
 'screen_name': 'funkysurfer',
 'tweets': 'at Cascina Matiot Disturbi di Con Edi Touch è più facile il tuo '
           'quello via Startup weekend 3 giorni al A Milano Decoded Milano '
           'capitale della via October 2013 Rain Massima Minima La nuova '
           'scienza è una narrazione aperta grazie ad Questo progetto nasce '
           'per via Milano handmade col Craft Camp di a Pisa ItCup A Roncade 

## Separate the data into different arrays

In [96]:
screen_names = [aud['screen_name'] for aud in rated_audience_dict.values()]
influencers_corpus = [aud['influencers'] for aud in rated_audience_dict.values()]
tweets_corpus = [aud['tweets'] for aud in rated_audience_dict.values()]
hashtags_corpus = [aud['hashtags'] for aud in rated_audience_dict.values()]
description_corpus = [aud['description'] for aud in rated_audience_dict.values()]
ground_truth_ratings = np.array(2*[aud['ground_truth_rating'] for aud in rated_audience_dict.values()])
#print(tweets_corpus)

In [97]:
# INFLUENCER MATRICES
tfidf_influencer_matrix = np.array(get_tfidf(influencers_corpus)).T;
binary_influencer_matrix = tfidf_influencer_matrix.copy()
binary_influencer_matrix[binary_influencer_matrix>0]=1

# TWEET MATRICES
tfidf_tweet_matrix=np.array(get_tfidf(tweets_corpus)).T;
binary_tweet_matrix = tfidf_tweet_matrix.copy()
binary_tweet_matrix[binary_tweet_matrix>0]=1

# DESCRIPTION MATRICES
tfidf_description_matrix = np.array(get_tfidf(description_corpus)).T;
binary_description_matrix = tfidf_description_matrix.copy()
binary_description_matrix[binary_description_matrix>0]=1

# HASHTAGS MATRICES
tfidf_hashtags_matrix = np.array(get_tfidf(hashtags_corpus)).T;
binary_hashtags_matrix = tfidf_hashtags_matrix.copy()
binary_hashtags_matrix[binary_hashtags_matrix>0]=1

In [98]:
# Matrix Factorization via multiplicative update rule
def nmf_kl_multiplicative(D, M, W, H, EPOCH=5000):
    MD = D.copy()
    MD[M==0] = 0
    for e in range(EPOCH):
        Xhat = W.dot(H)
        W=W*np.array(((MD/Xhat).dot(H.T)/np.dot(M, H.T)))
        Xhat = W.dot(H)
        H = H*np.array((W.T.dot(MD/Xhat)/np.dot(W.T, M)))
        #print(np.sum(np.abs(MD - M*Xhat))/np.sum(M))
    return W, H

In [99]:
# Regularized Matrix Factorization 
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        if(i==0):
                            P[i][k] = P[i][k] + alpha * (4 * eij * Q[k][j] - beta * P[i][k])
                        else:         
                            P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        if(k==0):
                            Q[k][j] = Q[k][j] + alpha * (4 * eij * P[i][k] - beta * Q[k][j])
                        else:
                            Q[k][j] = Q[k][j] + alpha * (4 * eij * P[i][k] - beta * Q[k][j])

        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    return P, Q.T

# 

# Method A
## Logistic Regression using the similarities coming from user profiling (fixed parameters)

# Method B
## Logistic Regression using the similarities coming from user profiling (relaxed parameters)

In [104]:
##Example Ordinal Regression
import mord as md

c = md.LogisticIT() #Default parameters: alpha=1.0, verbose=0, maxiter=10000
c.fit(np.array([[1,0,0,1],[0,1,0,0],[1,0,0,0]]), np.array([1,2,3]))
c.predict(np.array([0,0,0,1]))
c.predict(np.array([0,1,0,0]))
c.predict(np.array([1,0,0,0]))

print(c.predict(np.array([0,1,0,0])))



[2]


# Method C 
## Non-negative Matrix Factorization (NMF) without the ratings row = Document Clustering

In [100]:
import math
#Rank
R = 10
# Data
Nr = tfidf_tweet_matrix.shape[0]
Nc = tfidf_tweet_matrix.shape[1]

# Initialize W and H with random numbers
W = np.random.rand(Nr, R)*100
H = np.random.rand(R, Nc)*100

Mask = np.ones_like(tfidf_tweet_matrix)
Mask[np.isnan(tfidf_tweet_matrix)] = 0

W,H = nmf_kl_multiplicative(tfidf_tweet_matrix, Mask, W, H, EPOCH=1)

cluster_numbers = np.argmax(H,axis=0)
cluster_ratings = np.array([0,0,0,0,0,0,0,0,0,0])

print(cluster_numbers)
print(ground_truth_ratings)
for i in range(len(cluster_numbers)):
    cluster_no = cluster_numbers[i]
    cluster_ratings[cluster_no]+=1.0*ground_truth_ratings[i] 
    

for i in range(R):
    cluster_ratings[i]/=float(len(cluster_numbers[cluster_numbers==i]))
    
print(cluster_ratings)



[3 2 6 8 0 9 3 1 5 7 2 1 0 7 8 7 0 8 5 9]
[ 1.   0.5  3.   2.   2.5  5.   5.   5.   1.5  2.5  3.   4.   3.5  1.5  1.
  1.   2.5  1.   1.   2.5  1.   0.5  3.   2.   2.5  5.   5.   5.   1.5  2.5
  3.   4.   3.5  1.5  1.   1.   2.5  1.   1.   2.5]




ValueError: cannot convert float NaN to integer

# Method D 
## Non-negative Matrix Factorization (NMF) with the ratings row (more weight on errors caused by row 0)


In [101]:
# Regularized Matrix Factorization


data_matrix = np.append(ground_truth_ratings,tfidf_tweet_matrix,axis=0)
data_matrix = np.append(data_matrix,tfidf_influencer_matrix,axis=0)
data_matrix = np.append(data_matrix,tfidf_description_matrix,axis=0)

R = data_matrix
 
N = len(R)
M = len(R[0])
K = 2
 
P = np.random.rand(N,K)
Q = np.random.rand(M,K)
 
nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)

predictions =np.round(nR[0].reshape(1,len(nR[0])))

print("Ground Truth:")
print(ground_truth)
print("Predictions:")
print(np.around(nR[0]))


ShowMatrix(ground_truth,0,5,'original')
ShowMatrix(predictions,0,5,'original')


#ShowMatrix(tweet_matrix,np.min(tweet_matrix[1:]),np.max(tweet_matrix[1:]),'original')
#ShowMatrix(nR,np.min(tweet_matrix[1:]),np.max(tweet_matrix[1:]),'estimate')


ValueError: all the input arrays must have same number of dimensions

# Method E 
## Non-negative Matrix Factorization (NMF) with the ratings row

In [102]:
# Matrix Factorization using multiplicative update


print(tfidf_tweet_matrix.shape[1])
print(ground_truth_ratings)

data_matrix = np.append(ground_truth_ratings,tfidf_tweet_matrix,axis=0)
data_matrix = np.append(data_matrix,tfidf_influencer_matrix,axis=0)
data_matrix = np.append(data_matrix,tfidf_description_matrix,axis=0)

#Rank
R = 1

# Data
Nr =  data_matrix.shape[0]
Nc =  data_matrix.shape[1]

# Initialize W and H with random numbers
W = np.random.rand(Nr, R)*100
H = np.random.rand(R, Nc)*100

Mask = np.ones_like(data_matrix)
Mask[np.isnan(data_matrix)] = 0

W,H = nmf_kl_multiplicative(data_matrix, Mask, W, H, EPOCH=10)
Xhat = W.dot(H)

predictions = Xhat[0]
print(predictions) 

predictions = np.array(predictions)

print("Ground Truth:")
print(ground_truth_ratings)

print(predictions)

#ShowMatrix(tweet_matrix,np.min(tweet_matrix[1:]),np.max(tweet_matrix[1:]),'original')
#ShowMatrix(Xhat,np.min(tweet_matrix[1:]),np.max(tweet_matrix[1:]),'estimate')


20
[ 1.   0.5  3.   2.   2.5  5.   5.   5.   1.5  2.5  3.   4.   3.5  1.5  1.
  1.   2.5  1.   1.   2.5  1.   0.5  3.   2.   2.5  5.   5.   5.   1.5  2.5
  3.   4.   3.5  1.5  1.   1.   2.5  1.   1.   2.5]


ValueError: all the input arrays must have same number of dimensions

# Method F
## Non-negative Matrix Factorization (NMF) with the ratings row (Binary Version)

In [103]:
# Matrix Factorization using multiplicative update

def sigmoid(x):
  return 1 / (1 + math.exp(-x))

ground_truth_binary_ratings = ground_truth_ratings.copy()
ground_truth_binary_ratings[ground_truth_binary_ratings<2]=0
ground_truth_binary_ratings[ground_truth_binary_ratings>=2.5]

data_matrix = np.append(ground_truth_binary_ratings,binary_tweet_matrix,axis=0)
data_matrix = np.append(data_matrix,binary_influencer_matrix,axis=0)
data_matrix = np.append(data_matrix,binary_description_matrix,axis=0)

#Rank
R = 1

# Data
Nr =  data_matrix.shape[0]
Nc =  data_matrix.shape[1]

# Initialize W and H with random numbers
W = np.random.rand(Nr, R)*100
H = np.random.rand(R, Nc)*100

Mask = np.ones_like(data_matrix)
Mask[np.isnan(data_matrix)] = 0

W,H = nmf_kl_multiplicative(data_matrix, Mask, W, H, EPOCH=10)
Xhat = W.dot(H)

predictions = Xhat[0]
predictions = [sigmoid(prediction) for prediction in predictions]
predictions = np.array(predictions)
print(predictions) 

predictions = np.array(predictions)

print("Ground Truth:")
print(ground_truth_ratings)

print(predictions)

#ShowMatrix(tweet_matrix,np.min(tweet_matrix[1:]),np.max(tweet_matrix[1:]),'original')
#ShowMatrix(Xhat,np.min(tweet_matrix[1:]),np.max(tweet_matrix[1:]),'estimate')

ValueError: all the input arrays must have same number of dimensions