# Twitter Scraper

### Package imports

In [1]:
import json, re, os, collections, string, tweepy, pymongo, nltk
from nltk import word_tokenize, sent_tokenize, pos_tag, ne_chunk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import treebank
from nltk.tree import Tree
from pymongo import MongoClient
%run secrets.py

### NLTK Downloads

In [2]:
# NLTK data packages fetched for this notebook
nltk_packages = [
    'punkt',
    'averaged_perceptron_tagger',
    'stopwords',
    'wordnet',
    'maxent_ne_chunker',
    'words',
]
nltk.download(nltk_packages)

[nltk_data] Downloading package punkt to /Users/tomewing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tomewing/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomewing/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tomewing/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/tomewing/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/tomewing/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Parameters

In [3]:
# High level parameters

user = 'realDonaldTrump'     # Twitter screen name whose timeline is scraped
number_of_tweets = 3000      # How many tweets to retrieve

# Data cleaning & structure parameters

# Stopwords to drop (e.g. if, then, a, and etc.)
remove_words = stopwords.words('english')

# Tokenizer that keeps only runs of word characters, which strips punctuation
# (note this also strips @ and #)
remove_punc = RegexpTokenizer(r'\w+')

# Translation table that deletes the digits 0-9
remove_numbers = str.maketrans('', '', '0123456789')

# Compiled pattern matching runs of emoji characters; bound to both names
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
remove_emojis = emoji_pattern

# NER corpus

ner_corpus_root = "ner_corpus/gmb-2.2.0"

## Functions

### twitter_authenticate function

In [4]:
def twitter_authenticate():
    '''
    Builds an OAuth handler from the consumer key/secret and the access token/secret
    and returns a tweepy API object that uses it.

    The four credential names are not defined in this cell - presumably they come
    from the secrets.py script run alongside the imports (confirm).
    '''
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_token_secret)
    return tweepy.API(handler)

### extract_data function

In [5]:
def extract_data(user, number_of_tweets):
    '''
    Extracts up to the specified number of tweets from the specified user and returns two list objects:
        raw_tweets_output: One dict per tweet with the keys id, text, created, favorite_count,
                           retweet_count, retweeted and entities
        raw_tweets_text: Just the text from each tweet, in the same order as raw_tweets_output

    Fewer tweets than requested are returned when the timeline does not provide that many.
    '''

    # Nothing requested: skip the API entirely. This also avoids handing a zero limit
    # to the cursor - NOTE(review): some tweepy versions appear to treat 0 as "no limit"; confirm.
    if number_of_tweets <= 0:
        return [], []

    # Create the API object
    api = twitter_authenticate()

    # Retrieve user tweets. A single user_timeline request returns at most 200 tweets
    # (see the Twitter statuses/user_timeline reference), so page through the timeline
    # with a cursor instead of asking for everything in one call.
    page_size = min(number_of_tweets, 200)
    raw_tweets_input = tweepy.Cursor(
        api.user_timeline,
        screen_name=user,
        tweet_mode='extended',
        count=page_size,
    ).items(number_of_tweets)

    # Extract relevant data from the tweets & save to a data structure.
    # Maps each output key to the field of the raw tweet JSON it is copied from.
    field_map = {
        'id': 'id_str',
        'text': 'full_text',
        'created': 'created_at',
        'favorite_count': 'favorite_count',
        'retweet_count': 'retweet_count',
        'retweeted': 'retweeted',
        'entities': 'entities',
    }

    raw_tweets_output = []

    for tweet in raw_tweets_input:
        payload = tweet._json
        raw_tweets_output.append(
            {output_key: payload[source_key] for output_key, source_key in field_map.items()}
        )

    # Create a data structure for the text only

    raw_tweets_text = [tweet['text'] for tweet in raw_tweets_output]

    return raw_tweets_output, raw_tweets_text

### clean_data function

In [6]:
def clean_data():
    '''
    Pulls the raw tweet text via extract_data and cleans each tweet by removing hyperlinks, emojis,
    numbers, stopwords and punctuation, then tokenizes the result into separate data structures
    for tweets, sentences and words.

    Case is preserved and no lemmatization is applied, since both were found to interfere with
    POS tagging.

    Returns three list objects (one entry per tweet) as follows:

        clean_words: Cleaned tweets tokenized into words.
        clean_sentences: Cleaned tweets tokenized into sentences, each sentence a list of words.
        clean_tweets: Cleaned tweets as single strings.

    '''

    # Only the tweet text is needed here; the metadata is discarded

    _, raw_tweets_text = extract_data(user, number_of_tweets)

    # Set lookup avoids scanning the whole stopword list for every token

    stop_words = set(remove_words)

    # Building the output data structures

    clean_words = []
    clean_sentences = []
    clean_tweets = []

    # Iterate through the tweets, clean and append data to output data structures

    for tweet in raw_tweets_text:
        tweet = re.sub(r"http\S+", "", tweet)                    # Remove hyperlinks
        tweet = emoji_pattern.sub(r'', tweet)                    # Remove emojis

        # Removing numbers, splitting into words and dropping stopwords
        # NOTE(review): the match is case-sensitive and the text is not lower-cased, so
        # capitalised stopwords (e.g. 'All', 'It') appear to pass through - confirm intended.
        # TODO: lower-casing and lemmatization need to happen after POS tagging, not here.

        tokens = word_tokenize(tweet.translate(remove_numbers))
        words = [token for token in tokens if token not in stop_words]

        # Rebuilding the clean tweet and tokenising into sentences and words

        tweet = ' '.join(words)                                  # Joins the words back into a tweet
        sentences = sent_tokenize(tweet)                         # Splits the tweet into sentences

        # Appending the clean data (punctuation removed) to the output data structures

        clean_words.append(remove_punc.tokenize(tweet))
        clean_sentences.append(remove_punc.tokenize_sents(sentences))
        clean_tweets.append(re.sub(r'\W+', ' ', tweet))

    return clean_words, clean_sentences, clean_tweets

### extract_entities function

In [7]:
def extract_entities():
    '''
    Runs every cleaned tweet through POS tagging, named-entity chunking and IOB conversion.

    Returns a list with one entry per tweet; each entry is the IOB tagged form of that tweet,
    i.e. a list of (word, POS tag, IOB tag) tuples.
    '''
    # Only the cleaned tweet strings are needed; word and sentence lists are ignored

    _, _, tweets = clean_data()

    iob_tagged_trees = []

    for tweet in tweets:
        tagged = pos_tag(word_tokenize(tweet))            # POS tagging
        chunked = ne_chunk(tagged)                        # Chunking
        iob_tagged_trees.append(tree2conlltags(chunked))  # IOB Tagging

    return iob_tagged_trees

### NER corpus tag counts

In [100]:
# Count how often each primary NER category occurs across the corpus' .tags files
ner_tags = collections.Counter()

for root, dirs, files in os.walk(ner_corpus_root):
    for filename in files:
        if not filename.endswith(".tags"):
            continue

        with open(os.path.join(root, filename), 'rb') as file_handle:
            file_content = file_handle.read().decode('utf-8').strip()

        # Sentences are separated by blank lines, tokens by single newlines
        for annotated_sentence in file_content.split('\n\n'):
            for annotated_token in annotated_sentence.split('\n'):
                if not annotated_token:
                    continue

                # Columns are tab-separated; the NER tag is read from the fourth column
                ner = annotated_token.split('\t')[3]

                # Keep only the primary category (the part before the first dash);
                # 'O' contains no dash, so it is counted unchanged
                ner_tags[ner.split('-')[0]] += 1

### Execution

In [8]:
# Run the whole pipeline (fetch, clean, tag) and show the IOB tagged tweets as the cell output
extract_entities()

[[('All', 'DT', 'O'),
  ('across', 'IN', 'O'),
  ('nation', 'NN', 'O'),
  ('pray', 'NN', 'O'),
  ('country', 'NN', 'O'),
  ('THANK', 'NNP', 'B-ORGANIZATION'),
  ('GOD', 'NNP', 'O'),
  ('United', 'NNP', 'B-GPE'),
  ('States', 'NNPS', 'I-GPE'),
  ('Marines', 'NNP', 'B-PERSON'),
  ('Thank', 'NNP', 'I-PERSON'),
  ('God', 'NNP', 'I-PERSON'),
  ('Bless', 'NNP', 'O'),
  ('You', 'PRP', 'O'),
  ('And', 'CC', 'O'),
  ('God', 'NNP', 'B-PERSON'),
  ('Bless', 'NNP', 'I-PERSON'),
  ('America', 'NNP', 'I-PERSON')],
 [('It', 'PRP', 'O'),
  ('great', 'JJ', 'O'),
  ('honor', 'JJ', 'O'),
  ('deliver', 'NN', 'O'),
  ('message', 'NN', 'O'),
  ('Marine', 'NNP', 'B-PERSON'),
  ('Corps', 'NNP', 'I-PERSON'),
  ('Air', 'NNP', 'I-PERSON'),
  ('Station', 'NNP', 'O'),
  ('Miramar', 'NNP', 'B-PERSON'),
  ('GREAT', 'NNP', 'I-PERSON'),
  ('U', 'NNP', 'O'),
  ('S', 'NNP', 'O'),
  ('Military', 'NNP', 'O'),
  ('straight', 'VBD', 'O'),
  ('heart', 'NN', 'O'),
  ('American', 'JJ', 'B-ORGANIZATION'),
  ('People', 'NNP', 'O