In [1]:
#import the necessary libraries
from string import punctuation
import timeit
import re
import logging
import os
import codecs
# import necessary NLTK packages
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
#import custom libraries
from MongoConnector import MongoConnector
from PyContract import PyContract

In [2]:
OUTPUT_DIRECTORY = '../output'
TV_SHOW = 'NCISNOLA'

In [5]:
config = {  'MONGO_COLL': 'NCISNOLA',
            'MONGO_DB': 'swati_dataset',
            'MONGO_HOST': 'localhost',
            'MONGO_PORT': 27017}

In [3]:
customWords = ['bc', 'http', 'https', 'co', 'com','rt', 'one', 'us', 'new', 
              'lol', 'may', 'get', 'want', 'like', 'love', 'no', 'thank', 'would', 'thanks',
              'good', 'much', 'low', 'roger', 'im']
alphabets = list(map(chr, range(97, 123)))
myStopWords = set(stopwords.words('english') + list(punctuation) + customWords + alphabets)

In [6]:
# Initialize dbconnector, contracters, tokenizers, lemmatizers
dbconnector = MongoConnector(config)
contracter = PyContract()
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
lemmatizer = WordNetLemmatizer()

In [21]:
def get_Tweets(user:str) -> list:
    '''
        This function takes the config file and connects to MongoDB collection.
        Retrieves the tweet list from the user id and returns a dict object

        Output: {'user_id' : [tweet_list]}
    '''
    # Create new mongo collection and cursor object to store the unprocessed raw feature corpus
    cursor = dbconnector.__connect__()
    # Collect all the user tweets as one document and store it in a list
    que = cursor.find({'user.id_str':user, 'lang':'en'}, {'_id':0, 'text':1})
    if que.count() < 10:
        return None
    tweet_list = list()
    for tweet in que:
        tweet_list.append(contracter.__translate__(tweet['text']))
    return tweet_list

In [8]:
def preprocess_Tweets(tweet_list:list) -> list:
    # Pre-process step 1 - Word Tokenization
    
    # 1. Word Tokenization
    words = list(tokenizer.tokenize(tweets) for tweets in tweet_list)
    print(words[:5])

    # 2. Remove the stop words from the document
    words_steps2 = list()
    for tweet in words: 
        sents = list(re.sub(r'\W+', '', word) for word in tweet)
        sents = filter(lambda s: not str(s).lstrip('-').isdigit(), sents)   
        sents = list(word for word in sents if word not in myStopWords and word!='' and 
                                                                    not word.startswith('http'))
        if sents!= None:
            words_steps2.append(sents)
    print(words_steps2[:5])

    # Pre-process step3 - Lemmatization
    pre_processed_list = list()
    for tweet in words_steps2:
        words_step4 = list()
        words_step3 = pos_tag(tweet)
        for token in words_step3:
            pos = get_wordnet_pos(token[1])
            # if verb, noun, adj or adverb include them after lemmatization
            if pos is not None and len(token[0]) > 3:
                try:
                    tok = lemmatizer.lemmatize(token[0], pos)
                    words_step4.append(tok)              
                except UnicodeDecodeError:
                    pass
        if(words_step4 != [] and words_step4!='\n'): 
            pre_processed_list.append(" ".join(words_step4))
        else:
            continue
    print(pre_processed_list[:5])
    return pre_processed_list

In [9]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v'
    elif treebank_tag.startswith('N'):
        return 'n'
    elif treebank_tag.startswith('R'):
        return 'r'
    else:
        return None

In [10]:
def get_users():
    cursor = dbconnector.__connect__()
    # Collect all the user tweets as one document and store it in a list
    que = cursor.distinct('user.id_str')
    unique_users_list = list(que)
    return unique_users_list
    
    
def ensure_directory():
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)

In [22]:
if __name__ == '__main__':

    # Make sure directory is present. If not create the directory - 'output'
    with codecs.open(OUTPUT_DIRECTORY+'/'+TV_SHOW+"_raw_tweets_with_userid.csv", 'w','utf-8') as raw_u_file, \
           codecs.open(OUTPUT_DIRECTORY+'/'+TV_SHOW+"_preprocessed_tweets_corpora.csv", 'w','utf-8') as preproc_file, \
             codecs.open(OUTPUT_DIRECTORY+'/'+TV_SHOW+"_preprocessed_tweets_with_userid.csv", 'w','utf-8') as preproc_u_file, \
               codecs.open(OUTPUT_DIRECTORY+'/'+TV_SHOW+"_preprocessed_tweets_1by1__with_userid.csv", 'w','utf-8') as preproc_line_file, \
                 codecs.open(OUTPUT_DIRECTORY+'/'+TV_SHOW+"_discarded_users_list.csv", 'w','utf-8') as discarded_users_file:

        # Load the unique users from the file into a list given by unique_users_list
        print("Importing the unique users list.....")
        unique_users_list = get_users()
        discarded_users_list = list()
        print("Successfully imported {0} unique users....\n".format(len(unique_users_list)))
     
        # Get tweets for each unique user
        counter = 1
        total_tweets = 0
        total_pre_processed_tweets = 0
        for user in unique_users_list:
            user_start_time = timeit.default_timer()
    
            tweet_list = get_Tweets(user)
            print(tweet_list[:5])
            if tweet_list is None:
                print.debug("Discarded userid : {}\n".format(user))
                discarded_users_list.append(user)
                counter+=1
                continue
            processed_tweet_list = preprocess_Tweets(tweet_list)
            print("----> Pre-processing complete...")
            

            for tweet in processed_tweet_list:
                print("{0}|{1}".format(user, tweet))
                print("{0}".format(tweet))      
            user_end_time = timeit.default_timer()
 
            total_tweets += len(tweet_list)
            total_pre_processed_tweets += len(processed_tweet_list)
            print('{0}. Pre-processed tweets for userid: {1}'.format(counter, user))
            counter+=1
            break

###########     MAIN  PROGRAM ENDS HERE    ##########ß

Importing the unique users list.....
Successfully imported 6325 unique users....

['rt @ncisneworleans: .@thelucasblack explains why you need to watch #ncisnola tonight!\nhttps://t.co/bzegf4zpmy', 'rt @ncisneworleans: look who stopped by the #ncisnola truck... @hwinkler4real!! do not miss the series premiere tonight after @ncis_cbs! htt…', 'rt @thetalk_cbs: video: @scottbakula discusses the premiere of his new show, @ncisneworleans, with the ladies! http://t.co/vvzfbtffpq #thet…', 'rt @ncisneworleans: the series premiere of #ncisnola is so close! use this translator while watching the show. http://t.co/ps7hbgkngn', 'rt @ncisneworleans: the series premiere of #ncisnola starts in 10 minutes on cbs! time to get to work. sneak peek: http://t.co/x66168lxam']
[['rt', ':', '.', 'explains', 'why', 'you', 'need', 'to', 'watch', '#ncisnola', 'tonight', '!', 'https://t.co/bzegf4zpmy'], ['rt', ':', 'look', 'who', 'stopped', 'by', 'the', '#ncisnola', 'truck', '...', '!', '!', 'do', 'not', 'miss', '

  if sys.path[0] == '':
