In [1]:
#pip install textblob
#pip install tweepy
#pip install afinn
#pip install autocorrect
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('sentiwordnet')
#pip install spacy vaderSentiment

In [2]:
# Import dependencis
import pandas as pd
import itertools  
from textblob import TextBlob
import sys, tweepy
from requests.exceptions import Timeout, ConnectionError
from requests.packages.urllib3.exceptions import ReadTimeoutError
import matplotlib.pyplot as plt
import csv
import ssl
import time
import warnings
warnings.filterwarnings('ignore') # Hides warning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)
import nltk
from nltk.corpus import stopwords
from techniques import *
from autocorrect import Speller
import os
import glob

In [3]:
# File paths
tweet_data_file = os.path.join("..","Resources","inputData")
cleaned_tweet_file=os.path.join("..","..","Resources","outputData","tweetCleandata.csv")
extension = 'csv'

In [4]:
# Change directory to read files
os.chdir(tweet_data_file)

In [5]:
# Read all csv files
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

### Prepare Data
- Social media data is unstructured -it’s raw, noisy, and needs to be cleaned before we can start working on our sentiment analysis model. 
- Preprocessing a Twitter dataset involves a series of tasks like removing all types of irrelevant information like emojis, special characters, and extra blank spaces. It can also involve making format improvements, delete duplicate tweets, or tweets that are shorter than three characters.

In [6]:
#combine all files in the list
tweet_df = pd.concat([pd.read_csv(f) for f in all_filenames ])

In [7]:
# List of files bing processed
all_filenames

['tweetAPISearch_4July.csv',
 'tweetAPISearch_5July.csv',
 'tweetAPISearch_6July.csv',
 'tweetAPISearch_7July.csv',
 'tweetAPISearch_8July.csv']

In [8]:
# Check df info
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68583 entries, 0 to 9564
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Tweet             56886 non-null  object 
 1   Matched Keywords  56886 non-null  object 
 2   Date              56886 non-null  object 
 3   User              56886 non-null  object 
 4   Source            56879 non-null  object 
 5   Tweet ID          56886 non-null  float64
 6   Tweet URL         56886 non-null  object 
 7   Followers         56886 non-null  float64
 8   Friends           56886 non-null  float64
 9   Favorite          56886 non-null  float64
dtypes: float64(4), object(6)
memory usage: 5.8+ MB


In [9]:
# Get value count for each keyword
tweet_df['Matched Keywords'].value_counts()

Trump    47262
Biden     9624
Name: Matched Keywords, dtype: int64

In [10]:
#Lets copy column 
tweet_df['CleanedTweet'] = tweet_df['Tweet'].apply(str)

In [11]:
# Get Tweet ID from URL
tweet_df['TweetID'] = tweet_df['Tweet URL'].apply(str).apply(lambda x: x.split("/")[4] if (len(x)>29) else 0 )

In [12]:
# Change data type of TweetID
tweet_df['TweetID'] =tweet_df['TweetID'].apply(int)

In [13]:
# Lets clean the tweet data

In [14]:
# 1. Remove unicode
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: removeUnicode(x))

In [15]:
tweet_df[['Tweet','CleanedTweet']]

Unnamed: 0,Tweet,CleanedTweet
0,Trump: TikTok Must Sell Its American Operation...,Trump: TikTok Must Sell Its American Operation...
1,@richardmarx I need whatever the reporter is t...,@richardmarx I need whatever the reporter is t...
2,@GOPChairwoman President Trump & the RNC keep ...,@GOPChairwoman President Trump & the RNC keep ...
3,"@realDonaldTrump \n""Donald Trump dumped $400 m...","@realDonaldTrump \n""Donald Trump dumped $400 m..."
4,This is how every single journalist should be ...,This is how every single journalist should be ...
...,...,...
9560,@JoeBiden We know Joe Biden love The Chinese C...,@JoeBiden We know Joe Biden love The Chinese C...
9561,@KayaJones I like Trump but this was already i...,@KayaJones I like Trump but this was already i...
9562,@JoeBiden Is there a person of sound mind in t...,@JoeBiden Is there a person of sound mind in t...
9563,Opinion | We are only beginning to suffer the ...,Opinion | We are only beginning to suffer the ...


In [16]:
#2. Replace URL, hashTag etc.
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: replaceURL(x))
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: replaceAtUser(x))
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: removeHashtagInFrontOfWord(x))

In [17]:
#3. Replace slang words and abbreviations with their equivalents
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: replaceSlang(x))

In [18]:
#4. Replace contractions to their equivalents
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: replaceContraction(x))

In [19]:
#5. Remove integers from text
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: removeNumbers(x))

In [20]:
#6. Remove emoticons from text   
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: removeEmoticons(x))

In [21]:
#7. Replace repetitions of exlamation marks,question marks and stop marks
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: replaceMultiExclamationMark(x))
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: replaceMultiQuestionMark(x))
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: replaceMultiStopMark(x))

In [22]:
tweet_df[['Tweet','CleanedTweet']]

Unnamed: 0,Tweet,CleanedTweet
0,Trump: TikTok Must Sell Its American Operation...,Trump TikTok Must Sell Its American Operations...
1,@richardmarx I need whatever the reporter is t...,atUser I need whatever the reporter is taking ...
2,@GOPChairwoman President Trump & the RNC keep ...,atUser President Trump and the RNC keep sendin...
3,"@realDonaldTrump \n""Donald Trump dumped $400 m...","atUser \n""Donald Trump dumped $ million into h..."
4,This is how every single journalist should be ...,This is how every single journalist should be ...
...,...,...
9560,@JoeBiden We know Joe Biden love The Chinese C...,atUser We know Joe Biden love The Chinese Comm...
9561,@KayaJones I like Trump but this was already i...,atUser I like Trump but this was already in Ob...
9562,@JoeBiden Is there a person of sound mind in t...,atUser Is there a person of sound mind in the ...
9563,Opinion | We are only beginning to suffer the ...,Opinion | We are only beginning to suffer the ...


In [23]:
def rmPunctuation(text):
    return re.sub(r'[^\w\s]', '', text) 

In [24]:
# 8: Remove punctuation
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: rmPunctuation(x))

In [25]:
# Make predTweet Column to have a minnigful sentence for predictor
tweet_df['predTweet']=tweet_df['CleanedTweet']

In [26]:
# Display results from earlier steps
tweet_df[['predTweet','CleanedTweet']]

Unnamed: 0,predTweet,CleanedTweet
0,Trump TikTok Must Sell Its American Operations...,Trump TikTok Must Sell Its American Operations...
1,atUser I need whatever the reporter is taking ...,atUser I need whatever the reporter is taking ...
2,atUser President Trump and the RNC keep sendin...,atUser President Trump and the RNC keep sendin...
3,atUser \nDonald Trump dumped million into his...,atUser \nDonald Trump dumped million into his...
4,This is how every single journalist should be ...,This is how every single journalist should be ...
...,...,...
9560,atUser We know Joe Biden love The Chinese Comm...,atUser We know Joe Biden love The Chinese Comm...
9561,atUser I like Trump but this was already in Ob...,atUser I like Trump but this was already in Ob...
9562,atUser Is there a person of sound mind in the ...,atUser Is there a person of sound mind in the ...
9563,Opinion We are only beginning to suffer the c...,Opinion We are only beginning to suffer the c...


In [27]:
# This is just to remove converted charactersets from a tweet that that will be used for prediction
def filterStopWords(text):
    filterwords = "multiexclamation multiquestion multistop url atuser am pm" 
    finalTokens = [] # all tokens
    tokens = text.split()
    for w in tokens:
        #print(w)
        w=w.lower()
        # 9. Remove some stopwords
        if (w not in filterwords):
            #print(w)
            final_word = w.lower()
            finalTokens.append(final_word)
    return " ".join(finalTokens)

In [28]:
# Filter stop words
tweet_df['predTweet']=tweet_df['predTweet'].apply(lambda x: filterStopWords(x))

In [29]:
# Display results from earlier steps
tweet_df[['predTweet','CleanedTweet']]

Unnamed: 0,predTweet,CleanedTweet
0,trump tiktok must sell its american operations...,Trump TikTok Must Sell Its American Operations...
1,need whatever the reporter taking keep his coo...,atUser I need whatever the reporter is taking ...
2,president trump and the rnc keep sending mail ...,atUser President Trump and the RNC keep sendin...
3,donald trump dumped million into his clubs in ...,atUser \nDonald Trump dumped million into his...
4,this how every single journalist should be tal...,This is how every single journalist should be ...
...,...,...
9560,we know joe biden love the chinese communities...,atUser We know Joe Biden love The Chinese Comm...
9561,like trump but this was already in obama care,atUser I like Trump but this was already in Ob...
9562,there person of sound mind in the usa democrat...,atUser Is there a person of sound mind in the ...
9563,opinion we are only beginning suffer the conse...,Opinion We are only beginning to suffer the c...


In [30]:
# Tokenizes a text to its words, removes and replaces some of them 
stoplist = stopwords.words('english')
my_stopwords = "multiexclamation multiquestion multistop gave url atuser st rd nd th am pm" # my extra stopwords
stoplist = stoplist + my_stopwords.split()
allowedWordTypes = ["J","R","V","N"] #  J is Adject, R is Adverb, V is Verb, N is Noun. These are used for POS Tagging
lemmatizer = WordNetLemmatizer() # set lemmatizer
stemmer = PorterStemmer() # set stemmer
spell = Speller(fast=True)

In [31]:
# Function to lower case words and remove stop words
def exCleanup(text):
    finalTokens = [] # all tokens
    # Get tokens
    tokens = nltk.word_tokenize(text)
    
    # 8. Finds "not" and antonym for the next word and if found, replaces not and the next word with the antonym
    #tokens = replaceNegations(text) 
        
    for w in tokens:
        w= w.lower()
        # 9. Remove stopwords
        if (w not in stoplist):
            # 10. lowercases all characters
            final_word = w.lower()
            finalTokens.append(final_word)

  
    return finalTokens

In [32]:
# Function for text processing - spell check, elongated words, lamatizr and stemming of data
def exSpeechTag(text):    
    finalTokens = [] # all tokens
    tokens = text #as data is alrady tokenized
    
    for w in tokens:

        if (w not in stoplist and len(w)>4):
            
            # 11. Finds a word with at least 3 characters capitalized and adds the tag ALL_CAPS_
            final_word = addCapTag(w)
           
            # 12. Replaces an elongated word with its basic form, unless the word exists in the lexicon
            final_word = replaceElongated(final_word)
          
            if len(final_word)>1:
                # 13. Correction of spelling errors
                final_word = spell(final_word)
                #print(final_word)
            # 14. lemmatizes words   
            final_word = lemmatizer.lemmatize(final_word)
            # 15. Apply stemming to words
            final_word = stemmer.stem(final_word)
                          
            finalTokens.append(final_word)
    return finalTokens

In [33]:
# Tokenize tweet
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: exCleanup(x))

In [34]:
# Spell check,replace elongated, lammatize and stemming
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(lambda x: exSpeechTag(x))

In [35]:
# Display results from earlier steps
tweet_df[['Tweet','CleanedTweet']]

Unnamed: 0,Tweet,CleanedTweet
0,Trump: TikTok Must Sell Its American Operation...,"[trump, tiktok, american, oper]"
1,@richardmarx I need whatever the reporter is t...,"[whatev, report, take, exercis, patienc, trump..."
2,@GOPChairwoman President Trump & the RNC keep ...,"[presid, trump, send, deceas, husband, tri]"
3,"@realDonaldTrump \n""Donald Trump dumped $400 m...","[donald, trump, dump, million, club, aberdeen,..."
4,This is how every single journalist should be ...,"[everi, singl, journalist, talk, everi, trump,..."
...,...,...
9560,@JoeBiden We know Joe Biden love The Chinese C...,"[bien, chines, commun, parti]"
9561,@KayaJones I like Trump but this was already i...,"[trump, alreadi, obama]"
9562,@JoeBiden Is there a person of sound mind in t...,"[person, sound, democrat, republican, think, b..."
9563,Opinion | We are only beginning to suffer the ...,"[opinion, begin, suffer, consequ, trump, failu..."


In [36]:
# Change date column format and export only required columns for further processing
tweet_df['Date'] = pd.to_datetime(tweet_df['Date'])
tweet_df['Date'] = tweet_df['Date'].dt.strftime('%m/%d/%Y')

In [37]:
#Let's clean orginal tweet for spell check 
tweet_df['Tweet']=tweet_df['Tweet'].apply(str).apply(lambda x: spell(x))

In [38]:
#Let's clean pred tweet for spell check, removing 
tweet_df['predTweet']=tweet_df['predTweet'].apply(str).apply(lambda x: spell(x))

In [39]:
#Let's clean Cleaned Tweet for spell check, removing 
tweet_df['CleanedTweet']=tweet_df['CleanedTweet'].apply(str).apply(lambda x: spell(x))

In [40]:
# Select only required columns
tweet_df=tweet_df[['TweetID','Date','Matched Keywords','User','Source','Followers','Friends','Favorite','Tweet','predTweet','CleanedTweet']]

In [41]:
# Display results
tweet_df.head()

Unnamed: 0,TweetID,Date,Matched Keywords,User,Source,Followers,Friends,Favorite,Tweet,predTweet,CleanedTweet
0,1290598653770575872,08/04/2020,Trump,genadamedia,GenadaMedia,1685.0,1642.0,0.0,Trump: TikTok Must Sell Its American Operation...,trump tiktok must sell its american operations...,"['trump', 'tiktok', 'american', 'over']"
1,1290598652847816706,08/04/2020,Trump,Carolin64234118,Twitter for Android,0.0,41.0,0.0,@richardmarx I need whatever the reporter is t...,need whatever the reporter taking keep his coo...,"['whaten', 'report', 'take', 'exercise', 'pati..."
2,1290598651966951424,08/04/2020,Trump,soulb4time,Twitter Web App,42.0,117.0,0.0,@GOPChairwoman President Trump & the RN keep s...,president trump and the rec keep sending mail ...,"['preside', 'trump', 'send', 'decease', 'husba..."
3,1290598649626599424,08/04/2020,Trump,Jan714,Twitter Web App,39.0,120.0,0.0,"@realDonaldTrump \n""Donald Trump dumped $400 m...",donald trump dumped million into his clubs in ...,"['donald', 'trump', 'dump', 'million', 'club',..."
4,1290598646740848640,08/04/2020,Trump,jocfanaccount,Twitter for iPhone,659.0,730.0,0.0,This is how every single journalist should be ...,this how every single journalist should be tal...,"['every', 'single', 'journalist', 'talk', 'eve..."


In [42]:
# Save Clean tweet data for charting and modelling
tweet_df.to_csv(cleaned_tweet_file,index=False)