In [48]:
# Import Dependencies
import re
import spacy
from spacy.attrs import ORTH, LEMMA, NORM, TAG
import pandas as pd
import collections
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [49]:
def replace_emoticons(string):
    """Replace emoticons with positive or negative words"""

    
    # Define emoticons to be replaced
    emoticons ={'Good': [':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)',\
                         ':}', ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D',\
                         '=D', '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P',\
                         ':P', 'X-P','x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)',\
                         '>;)', '>:-)', '<3'],
                'Bad': [':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',\
                        ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',\
                        ':c', ':{', '>:\\', ';(']}
    
    # If a string in a tweet is an emoticon, replace that emoticon with positive/negative word
    for emoticon_key, emoticon_val in emoticons.items():
        if string in emoticon_val:
            string = emoticon_key
            break
        
    return(string)

In [50]:
def clean_text(string):
    """Cleans given string from tweet to prepare for using in machine learning model"""
    

    # Replace emoticons
    string = replace_emoticons(string)
    # Replace emojis
    string = re.sub(r'[^\x00-\x7F]+','', string)
    # Remove HTML special entities
    string = re.sub(r"\&\w*;"," ", string)
    # Remove hyperlinks
    string = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))\S+",\
                    "", string)
    # Remove twitter usernames
    string = re.sub(r"@[^\s]+","", string)
    # Remove numbers
    string = re.sub("\d+", "", string)
    # Remove special characters
    string = re.sub(r"[^\w\s]", " ", string)
    string = re.sub(r"\_", " ", string)
    # Remove 1 letter words
    string = re.sub(r"\W*\b\w\b", "", string)
    # Remove leftover whitespace
    if string:
        string = " ".join(string.split())
    # Make lowercase
    string = string.lower()
    
    return(string)

In [64]:
def clean_tweet(tweet):
    """Lemmatizes tweet and replaces stop words"""
    
    # Declare global variable 
    global wordcount
    
    # Add customized stop words
    nlp.Defaults.stop_words |= {"-PRON-","joe", "biden", "bernie","sanders", "elizabeth", "warren", \
                                "kamala", "harris", "s", "ve", "twitter", "tweet", "come", "year", "know"}
    
    # Clean tweet
    tweet = clean_text(tweet)
    
    # Create empty list for cleaned text
    text = []
    
    # Lemmatize tweet
    doc = nlp(tweet)
    for token in doc:
        string = token.lemma_
        # Verify not a stop word
        if string not in nlp.Defaults.stop_words:
            text.append(string)
            if string not in wordcount:
                wordcount[string] = 1
            else:
                wordcount[string] += 1

    # If no text is left, return null; otherwise, return cleaned tweet as single string
    if not text:
        return(None)
    else:
        return(' '.join(text))

In [76]:
username = "@JoeBiden"

In [78]:
twitter_df = pd.read_csv(f"data/{username}.csv", error_bad_lines=False)
twitter_df = twitter_df[:1000]
twitter_df["tweet"] = twitter_df["tweet"].map(lambda x: clean_tweet(x))

In [52]:
# Import spacy nlp library
nlp = spacy.load('en_core_web_sm',parser=False, entity=False) 

In [67]:
# Import training dataset to test cleaning script on, and select random sample
df = pd.read_csv("Resources/sent_analysis_dataset.csv", error_bad_lines=False)
df = df[:1000]

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [68]:
wordcount = {}
df["clean"] = df["SentimentText"].map(lambda x: clean_tweet(x))

In [69]:
df.to_csv("Test1.csv")

In [79]:
# Print 50 most common words
word_counter = collections.Counter(wordcount)
for word, count in word_counter.most_common(100):
    print(word, ": ", count)

trump :  161
like :  119
want :  118
know :  101
good :  93
day :  91
think :  87
people :  81
tonight :  80
need :  79
time :  78
win :  74
don :  71
night :  69
debate :  68
vote :  67
candidate :  67
come :  66
love :  61
work :  59
right :  56
feel :  51
year :  51
bad :  48
democrat :  48
tell :  47
plan :  45
president :  45
great :  43
let :  43
miss :  42
thank :  42
lose :  41
look :  41
dem :  41
talk :  40
lol :  39
state :  39
way :  38
hope :  37
pay :  36
today :  35
oh :  35
black :  35
new :  34
thing :  34
try :  33
country :  33
racist :  32
demdebate :  32
mean :  31
away :  31
support :  31
beat :  30
big :  30
american :  30
election :  30
tomorrow :  29
stop :  29
america :  29
obama :  29
leave :  28
home :  28
wait :  28
run :  28
sad :  27
man :  27
start :  27
guy :  27
sure :  26
find :  26
help :  26
believe :  26
ya :  26
lot :  26
care :  26
policy :  26
healthcare :  26
head :  25
money :  25
speak :  25
friend :  24
follow :  24
doesn :  24
maybe :  24
u