In [4]:
#import the necessary libraries
import nltk
import json
from pprint import pprint

#import custom libraries
from MongoConnector import MongoConnector

In [5]:
# Load the config dictionary object from the db_config.json file
with open("../db_config.json",'r') as f:
    config = json.load(f)
# print the config file
pprint(config)

{'MONGO_COLL': 'historical_tweets2',
 'MONGO_DB': 'tweetCorpus',
 'MONGO_HOST': 'localhost',
 'MONGO_PORT': 27017}


In [6]:
# Create a cursor to connect to MongoDB 
cursor = MongoConnector(config).__connect__()

In [7]:
# Get a document from the mongodb collection
documents = cursor.find({}).limit(1)
pprint(list(documents))

[{'_id': ObjectId('5a34041fbe437712b8efd3ba'),
  'contributors': None,
  'coordinates': None,
  'created_at': 'Fri Dec 15 02:38:42 +0000 2017',
  'entities': {'hashtags': [],
               'symbols': [],
               'urls': [],
               'user_mentions': [{'id': 15290441,
                                  'id_str': '15290441',
                                  'indices': [0, 12],
                                  'name': 'WWE Universe',
                                  'screen_name': 'WWEUniverse'},
                                 {'id': 17861062,
                                  'id_str': '17861062',
                                  'indices': [13, 29],
                                  'name': 'GUNNA',
                                  'screen_name': 'machinegunkelly'}]},
  'favorite_count': 0,
  'favorited': False,
  'geo': None,
  'id': 941497715590160384,
  'id_str': '941497715590160384',
  'in_reply_to_screen_name': 'WWEUniverse',
  'in_reply_to_status_id': 941496869

In [8]:
# Get the unique users list from the collection specified by the attribute 'user.id_str'
unique_users_list = cursor.distinct('user.id_str')
# List any 5 unique users
pprint(list(unique_users_list)[:6])

['324987904',
 '890212929064366080',
 '750108724136701953',
 '2374706366',
 '70316660',
 '955382688']


In [9]:
# print the total number of unique users
pprint(len(unique_users_list))

2207


In [10]:
# Save the unique_users_list in the file - unique_users.txt
OUTPUT_DIRECTORY = '../output'
import os
import codecs
if not os.path.exists(OUTPUT_DIRECTORY):
    os.makedirs(OUTPUT_DIRECTORY)
with codecs.open(OUTPUT_DIRECTORY+"/unique_users.txt", 'w','utf-8') as outfile:
    for user in unique_users_list:
        outfile.write(str(user+'\n'))
        

In [16]:
# Load the unique users from the file into a list given by unique_users_list
unique_users_list = []
with open("../output/unique_users.txt",'r', encoding='utf-8') as outfile:
        unique_users_list = outfile.read().splitlines()
pprint(len(unique_users_list))

2207


In [116]:
# Collect the tweets of the unique users and save them in a dictionary
# Store the user info (id_str, screen_name) , tweet info (tweet_id, tweet, tweet_status, truncated),
# user mention info(mentions, id_str, names, screen-names), favorite info (favorited, favorite_count),
# retweet info (retweeted, retweet_count), reply (replyreply_id, reply_count), quote (quote_status, quote_list)

# Create new mongo collection and cursor object to store the unprocessed raw feature corpus
config1 = {  'MONGO_COLL': 'raw1Corpus',
             'MONGO_DB': 'tweetCorpus',
             'MONGO_HOST': 'localhost',
             'MONGO_PORT': 27017}
cursor2 = MongoConnector(config1).__connect__()

import sys
MAX_TWEET_LIMIT = 3000
# Create the structure of the dictionary
tweet_dict = dict()
user_visible_list = ['user.id_str',
                     'user.name', 
                     'user.screen_name',
                     'user.created_at',
                     'user.description',
                     'user.derived',
                     'user.protected',
                     'user.verified',
                     'user.followers_count',
                     'user.friends_count',
                     'user.listed_count',
                     'user.favourites_count',
                     'user.statuses_count',
                     'user.contributors_enabled']
tweet_visible_list = [  'id_str',
                        'created_at',
                        'text',
                        'tweet_status',
                        'truncated',
                        'entities.user_mentions',
                        'favorited',
                        'favorite_count',
                        'retweeted',
                        'retweet_count',
                        'in_reply_to_screen_name',
                        'in_reply_to_status_id_str',
                        'in_reply_to_user_id',
                        'reply_count',
                        'is_quote_status',
                        'quote_count']
user_projection = {attribute : 1 for attribute in user_visible_list}
tweet_projection = {attribute: 1 for attribute in tweet_visible_list}

discarded_users_list = list()
counter = 0

for user in unique_users_list[0:1]:
    tweet_list = list()
    query = {'user.id_str' : user,  'lang': 'en'}
    user_docs = cursor.find_one(query, user_projection)
    tweet_docs = cursor.find(query, tweet_projection).limit(MAX_TWEET_LIMIT)
    
    pprint(len(list(tweet_docs)))
    # Discard users whose tweets < 10
    if len(list(tweet_docs)) < 10:
        discarded_users_list.append(user)
        print('discarding userid : %s...' %(user))
        continue
        
    tweet_dict['doc'] = {
                           'user_info': user_docs['user'],
                           'tweets': list(tweet_docs)
                        }

    pprint(list(len(tweet_docs)))
    cursor2.insert_many(tweet_dict)
    tweet_dict.clear()        
    counter+=1
    print('Storing done for %d user with id: %s' %(counter, user))
    

2714
discarding userid : 324987904...


In [106]:
# Create Indexes for our mongodb data base for the fields - user_indo.id_str and tweets.id_str
import pymongo
cursor2.create_index([("doc.user_info.id_str",pymongo.ASCENDING)], unique=True)


'doc.user_info.id_str_1'

In [101]:
pprint(len(list(set(unique_users_list) - set(discarded_users_list))))

2167


In [102]:
pprint(len(list(set(unique_users_list) - set(discarded_users_list))))


2167


In [105]:
# Write the userid of the unique users with tweets > 10
with codecs.open(OUTPUT_DIRECTORY+"/final_unique_users.txt", 'w','utf-8') as outfile:
    for user in list(set(unique_users_list) - set(discarded_users_list)):
        outfile.write(str(user+'\n'))

# Write the user id of the discarded users ie. users whose tweets<=10
with codecs.open(OUTPUT_DIRECTORY+"/discarded_unique_users.txt", 'w','utf-8') as outfile:
    for user in discarded_users_list:
        outfile.write(str(user+'\n'))