In [7]:
import pymongo
import json
import pprint 

# Set Up a MongoDB for Tweets

In [42]:
def get_db(db_name, host='localhost:27017'):
    '''
    Connect to a MongoDB server and return one of its databases.

    Args:
        db_name (str): Name of the database to select.
        host (str): Address of the MongoDB server, passed unchanged to
        MongoClient. Defaults to 'localhost:27017'.

    Returns:
        The database object obtained as ``client[db_name]``.
    '''
    # Function-level import: pymongo is only needed once the function is
    # actually called.
    from pymongo import MongoClient
    client = MongoClient(host)
    return client[db_name]

def insert_many_data(data, db, collection):
    '''
    Store several documents in one collection with a single call.

    Args:
        data: The documents, handed unchanged to ``insert_many``.
        db: Database object that is indexed by collection name.
        collection (str): Name of the collection that receives the
        documents.

    Returns:
        None
    '''
    target = db[collection]
    target.insert_many(data)

def insert_one_data(data, db, collection):
    '''
    Store a single document in one collection.

    Args:
        data: The document, handed unchanged to ``insert_one``.
        db: Database object that is indexed by collection name.
        collection (str): Name of the collection that receives the
        document.

    Returns:
        None
    '''
    target = db[collection]
    target.insert_one(data)

def aggregate(db, collection, pipeline):
    '''
    Run an aggregation pipeline on one collection and collect its output.

    Args:
        db: Database object that is indexed by collection name.
        collection (str): Name of the collection to aggregate over.
        pipeline: The pipeline, handed unchanged to ``aggregate``.

    Returns:
        list: Every document yielded by the aggregation, in the order it
        was yielded.
    '''
    cursor = db[collection].aggregate(pipeline)
    return list(cursor)

In [39]:
# Connect to the database that stores the tweets.
db = get_db('examples')

# Empty the collection first so that re-running this cell does not duplicate
# the tweets.
db.tweets.delete_many({})

# Every line of output.txt is parsed as one JSON document. The encoding is
# given explicitly so the result does not depend on the platform default.
with open('output.txt', encoding='utf-8') as f:
    # Skip the last row since it is not a complete tweet.
    lines = f.readlines()[:-1]

# Blank lines cannot be decoded as JSON, so they are ignored.
tweets = [json.loads(line) for line in lines if line.strip()]

# One bulk insert instead of one round-trip per tweet. insert_many rejects an
# empty list, hence the guard.
if tweets:
    insert_many_data(tweets, db, 'tweets')

In [41]:
# Number of tweets stored in the collection. The empty filter matches every
# document. Collection.count() is deprecated since PyMongo 3.7 and was removed
# in PyMongo 4.0; count_documents() is its replacement.
num_tweets = db.tweets.count_documents({})
print('number of tweets:', num_tweets)

number of tweets: 22462


# Text Analysis
## Tweet Sentiment

In [62]:
def get_afinn_scores(afinnfile):
    '''
    Read a tab-delimited file of English word scores into a dict.

    Each non-blank line is expected to have the form ``term<TAB>score``.

    Args:
        afinnfile (str): Path of the file that contains the English word
        scores; the file is tab-delimited.

    Returns:
        dict: The dict that contains pairs of English word and the related
        integer score.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
        tab-separated fields, or if its score is not an integer.
    '''
    scores = {}
    # Explicit encoding so that the terms decode the same way on every
    # platform instead of depending on the locale default.
    with open(afinnfile, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (for example an extra newline at the end of the
            # file) carries no entry and would break the unpacking below.
            if not line.strip():
                continue
            # The file is tab-delimited; int() tolerates the trailing newline.
            term, score = line.split("\t")
            scores[term] = int(score)
    return scores

def get_sentiment(tweet_text, scores):
    '''
    Add up the scores of the words of a tweet.

    Args:
        tweet_text (str or iterable of str): The text of the tweet, either
        as one string or already split into words. A string is split on
        whitespace first.
        scores (dict): A dict of English word scores.

    Returns:
        The sum of the scores of the words; a word that is not in
        ``scores`` counts as zero. The result is an int when every score is
        an int, and 0 when there are no words.
    '''
    # Iterating a bare string yields single characters, which would score
    # every letter as if it were a word, so a string is split into words.
    if isinstance(tweet_text, str):
        words = tweet_text.split()
    else:
        words = tweet_text

    # Zero if word is not in the scores
    return sum(scores.get(word, 0) for word in words)

In [65]:
# Load the word scores from the AFINN file.
scores = get_afinn_scores('AFINN-111.txt')

# Aggregation stages that prepare the text of the stored tweets.
pipeline = [
    # keep only the tweets whose text is not null
    {'$match': {'text': {'$ne': None}}},
    # replace the text by the list of words obtained by splitting on ' '
    {'$project': {'text': {'$split': ['$text', ' ']}}},
]
results = aggregate(db, 'tweets', pipeline)

# One sentiment per aggregated tweet, in the same order as `results`.
sentiments = [
    get_sentiment(doc['text'], scores)
    for doc in results
]