In [1]:
# Import dependencis
import pandas as pd
import os,glob
import csv

# Hides warning
import warnings
warnings.filterwarnings('ignore') 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)

# Text Processing libraries
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet,stopwords

In [2]:
# File paths
tweet_data_file = os.path.join("..","Resources","inputData")
cleaned_tweet_file=os.path.join("..","..","Resources","outputData","tweetCleandata.csv")
extension = 'csv'

In [3]:
# Change directory to read files
os.chdir(tweet_data_file)

In [4]:
# Read all csv files
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

In [5]:
# List of files bing processed
all_filenames

['tweetAPISearch_10July.csv',
 'tweetAPISearch_11July.csv',
 'tweetAPISearch_12July.csv',
 'tweetAPISearch_13July.csv',
 'tweetAPISearch_14July.csv',
 'tweetAPISearch_4July.csv',
 'tweetAPISearch_5July.csv',
 'tweetAPISearch_6July.csv',
 'tweetAPISearch_7July.csv',
 'tweetAPISearch_8July.csv',
 'tweetAPISearch_9July.csv']

### Prepare Data
- Social media data is unstructured -it’s raw, noisy, and needs to be cleaned before we can start working on our sentiment analysis model. 
- Preprocessing a Twitter dataset involves a series of tasks like removing all types of irrelevant information like emojis, special characters, and extra blank spaces. It can also involve making format improvements, delete duplicate tweets, or tweets that are shorter than three characters.

In [6]:
#combine all files in the list
tweet_df = pd.concat([pd.read_csv(f) for f in all_filenames ])

In [7]:
# Check df info
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110146 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Tweet             110100 non-null  object 
 1   Matched Keywords  110100 non-null  object 
 2   Date              110100 non-null  object 
 3   User              110100 non-null  object 
 4   Source            110091 non-null  object 
 5   Tweet ID          110100 non-null  float64
 6   Tweet URL         110100 non-null  object 
 7   Followers         110100 non-null  float64
 8   Friends           110100 non-null  float64
 9   Favorite          110100 non-null  float64
dtypes: float64(4), object(6)
memory usage: 9.2+ MB


In [8]:
# Get value count for each keyword
tweet_df['Matched Keywords'].value_counts()

Trump    85486
Biden    24614
Name: Matched Keywords, dtype: int64

In [9]:
# Get Tweet ID
tweet_df['TweetID'] = tweet_df['Tweet URL'].apply(str).apply(lambda x: x.split("/")[4] if (len(x)>29) else 0 )

In [10]:
# Display data
tweet_df.head()

Unnamed: 0,Tweet,Matched Keywords,Date,User,Source,Tweet ID,Tweet URL,Followers,Friends,Favorite,TweetID
0,They're even right that there's a certain kind...,Trump,8/10/2020 12:11,MenshevikM,Twitter Web App,1.29e+18,https://twitter.com/statuses/1292795662485131264,5882.0,320.0,0.0,1292795662485131264
1,"@jonathanchait Naw, that is the press, people ...",Trump,8/10/2020 12:11,balling_it,Twitter Web App,1.29e+18,https://twitter.com/statuses/1292795661809852417,33.0,156.0,0.0,1292795661809852417
2,@sarahcpr Trump signs an executive order to th...,Trump,8/10/2020 12:11,laurie71,Twitter for iPhone,1.29e+18,https://twitter.com/statuses/1292795659704242181,85.0,141.0,0.0,1292795659704242181
3,@glennkirschner2 Sorry I want to be on a real ...,Biden,8/10/2020 12:11,bluewave4peace,Twitter for iPhone,1.29e+18,https://twitter.com/statuses/1292795658747944960,528.0,745.0,0.0,1292795658747944960
4,Yeah. Until Ben Sasse cowers and yelps befor j...,Trump,8/10/2020 12:11,OGOPer,Twitter for iPhone,1.29e+18,https://twitter.com/statuses/1292795658550812672,2839.0,2655.0,0.0,1292795658550812672


In [11]:
# Keep only required columns
df_pred = tweet_df[['Date','TweetID','Tweet','Matched Keywords','User','Source','Followers','Friends','Favorite']]

In [12]:
# Keep copy of orginal tweet
df_pred['OrgTweet']=df_pred['Tweet']
df_pred['Tweet']= df_pred['Tweet'].apply(str)

In [13]:
# Display df
df_pred.head()

Unnamed: 0,Date,TweetID,Tweet,Matched Keywords,User,Source,Followers,Friends,Favorite,OrgTweet
0,8/10/2020 12:11,1292795662485131264,They're even right that there's a certain kind...,Trump,MenshevikM,Twitter Web App,5882.0,320.0,0.0,They're even right that there's a certain kind...
1,8/10/2020 12:11,1292795661809852417,"@jonathanchait Naw, that is the press, people ...",Trump,balling_it,Twitter Web App,33.0,156.0,0.0,"@jonathanchait Naw, that is the press, people ..."
2,8/10/2020 12:11,1292795659704242181,@sarahcpr Trump signs an executive order to th...,Trump,laurie71,Twitter for iPhone,85.0,141.0,0.0,@sarahcpr Trump signs an executive order to th...
3,8/10/2020 12:11,1292795658747944960,@glennkirschner2 Sorry I want to be on a real ...,Biden,bluewave4peace,Twitter for iPhone,528.0,745.0,0.0,@glennkirschner2 Sorry I want to be on a real ...
4,8/10/2020 12:11,1292795658550812672,Yeah. Until Ben Sasse cowers and yelps befor j...,Trump,OGOPer,Twitter for iPhone,2839.0,2655.0,0.0,Yeah. Until Ben Sasse cowers and yelps befor j...


### Tweet Normalization

In [14]:
# Create a function to clean the tweets
def cleanTxt(text):
    text = re.sub('@[A-Za-z0–9]+', '', text) #Removing @mentions
    text = re.sub('#', '', text) # Removing '#' hash tag
    text = re.sub('RT[\s]+', '', text) # Removing RT
    text = re.sub('https?:\/\/\S+', '', text) # Removing hyperlink
 
    return text

In [15]:
# Preprocess text in tweets by removing links, @UserNames, blank spaces, etc.
def preprocessing_text(table):
    #put everythin in lowercase
    table['Tweet'] = table['Tweet'].str.lower()
    #Replace rt indicating that was a retweet
    table['Tweet'] = table['Tweet'].replace('rt', '')
    #Replace occurences of mentioning @UserNames
    table['Tweet'] = table['Tweet'].replace(r'@\w+', '', regex=True)
    #Replace links contained in the tweet
    table['Tweet'] = table['Tweet'].replace(r'http\S+', '', regex=True)
    table['Tweet'] = table['Tweet'].replace(r'www.[^ ]+', '', regex=True)
    #remove numbers
    table['Tweet'] = table['Tweet'].replace(r'[0-9]+', '', regex=True)
    #replace special characters and puntuation marks
    table['Tweet'] = table['Tweet'].replace(r'[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~]', '', regex=True)
    return table    

In [16]:
# Replace elongated words by identifying those repeated characters and then remove them and compare the new word with the english lexicon
def in_dict(word):
    if wordnet.synsets(word):
        #if the word is in the dictionary, we'll return True
        return True

def replace_elongated_word(word):
    regex = r'(\w*)(\w+)\2(\w*)'
    repl = r'\1\2\3'    
    if in_dict(word):
        return word
    new_word = re.sub(regex, repl, word)
    if new_word != word:
        return replace_elongated_word(new_word)
    else:
        return new_word

def detect_elongated_words(row):
    regexrep = r'(\w*)(\w+)(\2)(\w*)'
    words = [''.join(i) for i in re.findall(regexrep, row)]
    for word in words:
        if not in_dict(word):
            row = re.sub(word, replace_elongated_word(word), row)
    return row 

In [17]:
# Remove the stop words
def stop_words(table):
    stop_words_list = stopwords.words('english')
    table['Tweet'] = table['Tweet'].str.lower()
    table['Tweet'] = table['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words_list)]))
    return table

In [18]:
# Get all the lemma for the word
def replace_antonyms(word):
    for syn in wordnet.synsets(word): 
        for lemma in syn.lemmas(): 
            #if the lemma is an antonyms of the word
            if lemma.antonyms(): 
                #we return the antonym
                return lemma.antonyms()[0].name()
    return word

#Tokenize the row
def handling_negation(row):
    
    words = word_tokenize(row)
    speach_tags = ['JJ', 'JJR', 'JJS', 'NN', 'VB', 'VBD', 'VBG', 'VBN', 'VBP']
    
    # Obtain the type of words that we have in the text, we use the pos_tag function
    tags = nltk.pos_tag(words)
    
    # Negation in the words
    tags_2 = ''
    if "n't" in words and "not" in words:
        tags_2 = tags[min(words.index("n't"), words.index("not")):]
        words_2 = words[min(words.index("n't"), words.index("not")):]
        words = words[:(min(words.index("n't"), words.index("not")))+1]
    elif "n't" in words:
        tags_2 = tags[words.index("n't"):]
        words_2 = words[words.index("n't"):] 
        words = words[:words.index("n't")+1]
    elif "not" in words:
        tags_2 = tags[words.index("not"):]
        words_2 = words[words.index("not"):]
        words = words[:words.index("not")+1] 
        
    for index, word_tag in enumerate(tags_2):
        if word_tag[1] in speach_tags:
            words = words+[replace_antonyms(word_tag[0])]+words_2[index+2:]
            break
            
    return ' '.join(words)    

In [19]:
# This function will process all the required cleaning for the text in our tweets
def cleaning_table(table):
    table = preprocessing_text(table)
    table['Tweet'] = table['Tweet'].apply(lambda x: detect_elongated_words(x))
    table['Tweet'] = table['Tweet'].apply(lambda x: handling_negation(x))
    table = stop_words(table)
    return table

In [20]:
# Clean Tweets
df_pred['Tweet']=df_pred['Tweet'].apply(cleanTxt)

In [21]:
# Removing Punctuations, Numbers, and Special Characters
df_pred['Tweet'] = df_pred['Tweet'].str.replace("[^a-zA-Z#]", " ")

In [22]:
# Removing Short Words
df_pred['Tweet'] = df_pred['Tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [23]:
# Display Sample data
df_pred.head()

Unnamed: 0,Date,TweetID,Tweet,Matched Keywords,User,Source,Followers,Friends,Favorite,OrgTweet
0,8/10/2020 12:11,1292795662485131264,They even right that there certain kind libera...,Trump,MenshevikM,Twitter Web App,5882.0,320.0,0.0,They're even right that there's a certain kind...
1,8/10/2020 12:11,1292795661809852417,that press people encouraged voters vote Trump...,Trump,balling_it,Twitter Web App,33.0,156.0,0.0,"@jonathanchait Naw, that is the press, people ..."
2,8/10/2020 12:11,1292795659704242181,Trump signs executive order throw rotted scrap...,Trump,laurie71,Twitter for iPhone,85.0,141.0,0.0,@sarahcpr Trump signs an executive order to th...
3,8/10/2020 12:11,1292795658747944960,Sorry want real team truthful team justice tea...,Biden,bluewave4peace,Twitter for iPhone,528.0,745.0,0.0,@glennkirschner2 Sorry I want to be on a real ...
4,8/10/2020 12:11,1292795658550812672,Yeah Until Sasse cowers yelps befor jumping in...,Trump,OGOPer,Twitter for iPhone,2839.0,2655.0,0.0,Yeah. Until Ben Sasse cowers and yelps befor j...


In [24]:
# Clean the tweet
tweet_table = cleaning_table(df_pred)

In [25]:
# Display results
tweet_table.head()

Unnamed: 0,Date,TweetID,Tweet,Matched Keywords,User,Source,Followers,Friends,Favorite,OrgTweet
0,8/10/2020 12:11,1292795662485131264,even right certain kind liberal deeply wants g...,Trump,MenshevikM,Twitter Web App,5882.0,320.0,0.0,They're even right that there's a certain kind...
1,8/10/2020 12:11,1292795661809852417,press people encouraged voters vote trump like...,Trump,balling_it,Twitter Web App,33.0,156.0,0.0,"@jonathanchait Naw, that is the press, people ..."
2,8/10/2020 12:11,1292795659704242181,trump signs executive order throw rotted scrap...,Trump,laurie71,Twitter for iPhone,85.0,141.0,0.0,@sarahcpr Trump signs an executive order to th...
3,8/10/2020 12:11,1292795658747944960,sorry want real team truthful team justice tea...,Biden,bluewave4peace,Twitter for iPhone,528.0,745.0,0.0,@glennkirschner2 Sorry I want to be on a real ...
4,8/10/2020 12:11,1292795658550812672,yeah sase cowers yelps befor jumping embarrass...,Trump,OGOPer,Twitter for iPhone,2839.0,2655.0,0.0,Yeah. Until Ben Sasse cowers and yelps befor j...


### Sentiment Prediction 

In [26]:
# import SentimentIntensityAnalyzer class from vaderSentiment.vaderSentiment module. 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
  
# function to print sentiments 
# of the sentence. 
def getSentiment(sentence): 
  
    # Create a SentimentIntensityAnalyzer object. 
    sid_obj = SentimentIntensityAnalyzer() 
  
    # polarity_scores method of SentimentIntensityAnalyzer oject gives a sentiment dictionary which contains pos, neg, neu, and compound scores. 
    sentiment_dict = sid_obj.polarity_scores(sentence) 
  
    # decide sentiment as positive, negative and neutral 
    if sentiment_dict['pos'] > sentiment_dict['neg'] : 
        return "Positive"
  
    elif sentiment_dict['neg'] > sentiment_dict['pos']  : 
        return "Negative"
  
    else : 
        return "Neutral"

In [27]:
#Get sentiment
tweet_table["Sentiment"] = tweet_table['Tweet'].apply(lambda x: getSentiment(x))

In [31]:
# Change date column format and export only required columns for further processing
tweet_table['Date'] = pd.to_datetime(tweet_table['Date'])
tweet_table['Date'] = tweet_table['Date'].dt.strftime('%m/%d/%Y')

In [32]:
# Display results
tweet_table.head()

Unnamed: 0,Date,TweetID,Tweet,Matched Keywords,User,Source,Followers,Friends,Favorite,OrgTweet,Sentiment
0,08/10/2020,1292795662485131264,even right certain kind liberal deeply wants g...,Trump,MenshevikM,Twitter Web App,5882.0,320.0,0.0,They're even right that there's a certain kind...,Positive
1,08/10/2020,1292795661809852417,press people encouraged voters vote trump like...,Trump,balling_it,Twitter Web App,33.0,156.0,0.0,"@jonathanchait Naw, that is the press, people ...",Positive
2,08/10/2020,1292795659704242181,trump signs executive order throw rotted scrap...,Trump,laurie71,Twitter for iPhone,85.0,141.0,0.0,@sarahcpr Trump signs an executive order to th...,Negative
3,08/10/2020,1292795658747944960,sorry want real team truthful team justice tea...,Biden,bluewave4peace,Twitter for iPhone,528.0,745.0,0.0,@glennkirschner2 Sorry I want to be on a real ...,Positive
4,08/10/2020,1292795658550812672,yeah sase cowers yelps befor jumping embarrass...,Trump,OGOPer,Twitter for iPhone,2839.0,2655.0,0.0,Yeah. Until Ben Sasse cowers and yelps befor j...,Negative


In [33]:
# Save tweet data table for charting and modelling
tweet_table.to_csv(cleaned_tweet_file,index=False)