In [60]:
from IPython.display import Markdown, display # for markdown text
import json # for json methods
import pprint # to print human readable dictionary
import pandas as pd # for visualizations
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer # for TF-IDF

In [61]:
def printmd(string):
    """Render ``string`` as Markdown in the notebook output area.

    Args:
        string: Markdown source text to render.
    """
    rendered = Markdown(string)
    display(rendered)

In [62]:
# Shared word-level (unigram) TF-IDF vectorizer with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document. The previous
# integer value 0 selected exactly the same vocabulary, but newer scikit-learn
# releases validate the parameter and reject an integer min_df below 1.
# NOTE: the vectorizer is re-fitted on every get_tfidf call, so its fitted
# state always reflects the most recently processed corpus only.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english')


def get_tfidf(corpus):
    """Compute the TF-IDF values for the given corpus.

    Args:
        corpus: list of strings, one string per document. A shallow copy is
            handed to the vectorizer, so the argument must provide ``.copy()``.

    Returns:
        Dense matrix (the result of ``.todense()``) with one row per document
        and one column per vocabulary term.
    """
    return tf.fit_transform(corpus.copy()).todense()

# DATA PREPARATION

## All the topics in our database

In [69]:
# Load the topic-id -> topic-name mapping. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open until garbage
# collection.
with open('topics.txt') as topics_file:
    topics = json.load(topics_file)
pprint.pprint(topics)

{'16': 'openhardware',
 '18': 'Data Science',
 '19': 'Big Data',
 '20': 'Artificial Intelligence',
 '21': 'Business Intelligence',
 '31': 'arduino',
 '32': 'raspberry pi',
 '33': '3d printer',
 '36': 'Deep Learning',
 '37': 'IoT',
 '38': '3d printing',
 '39': 'open hardware',
 '56': 'Wearable',
 '57': 'Sustainable finance',
 '59': 'Sustainable Finance',
 '60': 'Climate Finance',
 '61': 'Green Bonds',
 '62': 'Green Economy'}


## The parameters in our scenario
We observe an audience defined by two constraints: a topic and a location. In this example, the audience consists of users located in Italy who are interested in the topic Arduino.

In [70]:
# Topic under observation (31 = arduino).
TOPIC_ID = 31
# Location constraint that defines the audience.
LOCATION = 'italy'
# Minimum number of influencers, within the topic, that a retrieved audience member follows.
SIGNAL_STRENGTH = 0
# Number of audience members to consider.
LIMIT = 20
# presumably the number of audience members held out for testing; confirm against later cells
TESTING_SET_SIZE = 5
# Number of most recent tweets (including retweets) retrieved for the recommendation engine.
HOW_MANY_TWEETS = 50
INCLUDE_RETWEETS = True

In [71]:
# Load the audience profiles from their JSON files. Each file is opened in a
# context manager so its handle is closed right after parsing instead of
# staying open for the lifetime of the kernel.
with open('rated_audience.txt') as rated_file:
    rated_audience_dict = json.load(rated_file)
with open('unrated_audience.txt') as unrated_file:
    unrated_audience_dict = json.load(unrated_file)

In [72]:
# Show a heading, the current topic name, and the first rated profile as a sample record.
printmd("## An example Twitter profile with all the data fields at this point.")
printmd(f"### Topic: {topics[str(TOPIC_ID)]}")  # topics is keyed by the id as a string
pprint.pprint(next(iter(rated_audience_dict.values())))

## An example Twitter profile with all the data fields at this point.

### Topic: arduino

{'description': 'Chief Innovation Officer at NTT DATA Italia , proud father of '
                'two beautiful girls, a professor, a technology evangelist, an '
                'holistic thinker and a gentleman.',
 'ground_truth_rating': 1.0,
 'hashtags': 'Milano CheTempoFa Milano CheTempoFa foi13 Cefriel fvw2013 '
             'fvw2013 fvw2013 storytelling Vajont Milano CheTempoFa '
             'StartupWeekend GrandC4Picasso makerfairerome MakerFaireRome '
             'GrandC4Picasso GrandC4Picasso',
 'influencers': '266400754 84094835 767285',
 'location': 'Milan, Italy',
 'screen_name': 'funkysurfer',
 'tweets': 'at Cascina Matiot Disturbi di Con Edi Touch è più facile il tuo '
           'quello via Startup weekend 3 giorni al A Milano Decoded Milano '
           'capitale della via October 2013 Rain Massima Minima La nuova '
           'scienza è una narrazione aperta grazie ad Questo progetto nasce '
           'per via Milano handmade col Craft Camp di a Pisa ItCup A Roncade 

## Separate the data into different arrays

In [73]:
# Materialise the rated profiles once, then pull each field into its own list.
# Every list follows the same profile order, so index i refers to the same member.
rated_profiles = list(rated_audience_dict.values())
screen_names = [profile['screen_name'] for profile in rated_profiles]
influencers_corpus = [profile['influencers'] for profile in rated_profiles]
tweets_corpus = [profile['tweets'] for profile in rated_profiles]
hashtags_corpus = [profile['hashtags'] for profile in rated_profiles]
description_corpus = [profile['description'] for profile in rated_profiles]

# NOTE(review): the ratings list is repeated back-to-back, so the array has twice
# as many entries as there are profiles; this does NOT multiply the ratings by 2.
# Presumably intentional (e.g. to line up with two stacked matrix variants) - confirm.
member_ratings = [profile['ground_truth_rating'] for profile in rated_profiles]
ground_truth_ratings = np.array(member_ratings + member_ratings)

In [74]:
def _tfidf_and_binary(corpus):
    """Build the transposed TF-IDF matrix for ``corpus`` and its binary twin.

    Args:
        corpus: list of strings, one per audience member.

    Returns:
        Tuple ``(tfidf, binary)`` of 2-D ndarrays. ``tfidf`` is the transpose of
        the ``get_tfidf`` output (terms along the rows, documents along the
        columns). ``binary`` is a copy of ``tfidf`` in which every positive
        entry is replaced by 1.
    """
    tfidf = np.array(get_tfidf(corpus)).T
    binary = tfidf.copy()  # copy first so the TF-IDF weights stay untouched
    binary[binary > 0] = 1
    return tfidf, binary


# INFLUENCER MATRICES
tfidf_influencer_matrix, binary_influencer_matrix = _tfidf_and_binary(influencers_corpus)

# TWEET MATRICES
tfidf_tweet_matrix, binary_tweet_matrix = _tfidf_and_binary(tweets_corpus)

# DESCRIPTION MATRICES
tfidf_description_matrix, binary_description_matrix = _tfidf_and_binary(description_corpus)

# HASHTAGS MATRICES
tfidf_hashtags_matrix, binary_hashtags_matrix = _tfidf_and_binary(hashtags_corpus)

# METHODS

# Method A
## Logistic Regression using the similarities coming from user profiling (fixed parameters)

# Method B
## Logistic Regression using the similarities coming from user profiling (relaxed parameters)

# Method C 
## Non-negative Matrix Factorization (NMF) without the ratings row = Document Clustering

# Method D 
## Non-negative Matrix Factorization (NMF) with the ratings row (more weight on errors caused by row 0)

# Method E 
## Non-negative Matrix Factorization (NMF) with the ratings row

In [None]:
# Method 1
## Logistic Regression using the similarities coming from user profiling