In this notebook I will experiment with various preprocessing techniques and test their effect on SVM's performance.



In [8]:
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import spacy
spacy.load('en_core_web_sm')

import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

import numpy as np

import re



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olija\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
sentiment140Df = pd.read_csv("sentiment140.training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None, usecols=[0,5], names=["sentimentScore", "tweet"])
print("dataframe shape: " + str(sentiment140Df.shape) )

#  Sentiment column (0 = negative, 4 = positive)
print(sentiment140Df.columns.values)
# replace 4 with 1 (0 = negative, 1 = positive)
sentiment140Df['sentimentScore'] = sentiment140Df['sentimentScore'].replace(4,1)
def labelTrainingSentiment(score):
    if score == 0:
        return 'negative'
    if score == 1:
        return 'positive'


sentiment140Df['label'] = sentiment140Df['sentimentScore'].apply(lambda x: labelTrainingSentiment(x))
sentiment140Df.head(5)


dataframe shape: (1600000, 2)
['sentimentScore' 'tweet']


Unnamed: 0,sentimentScore,tweet,label
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,0,is upset that he can't update his Facebook by ...,negative
2,0,@Kenichan I dived many times for the ball. Man...,negative
3,0,my whole body feels itchy and like its on fire,negative
4,0,"@nationwideclass no, it's not behaving at all....",negative


In [7]:
# This emoji's dicitonary was taken from:
# https://www.kaggle.com/code/stoicstatic/twitter-sentiment-analysis-for-beginners?scriptVersionId=68181216

emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

htmlSymbols = {"&amp;": "and", "&quot;": '"', "&apos;": "'", "&cent;": "cent", "&pound;": "pound", "&yen;": "yen", "&euro;": "euro",
            "&copy;": "copyright", "&reg;": "registered trademark"}

# Abbreviation dictionary adapted from:
# https://www.socialmediatoday.com/social-networks/sarah-snow/2015-07-08/get-out-your-twittonary-twitter-abbreviations-you-must-know

abbreviationsDict = {'cc': 'carbon-copy', 'cx': 'correction', 'ct': 'cuttweet', 'dm': 'direct message', 'ht': 'heard through', 'mt': 'modified tweet', 
            'prt': 'please retweet', 'rt': 'retweet', 'sp': 'sponsored', 'em': 'email marketing', 'ezine': 'electronic magazine', 'fb': 'facebook', 
            'li': 'linkedin', 'seo': 'search engine optimization', 'sm': 'social media', 'smm': 'social media marketing', 'smo': 'social media optimization', 
            'sn': 'social network', 'sroi': 'social return on investment', 'ugc': 'user generated content', 'ux': 'user experience', 'yt': 'youtube', 
            'abt': 'about', 'afaik': 'as far as i know', 'ayfkmwts': 'are you fucking kidding me with this shit', 'b4': 'before', 'bfn': 'bye for now', 
            'bgd': 'background', 'bh': 'blockhead', 'br': 'best regards', 'btw': 'by the way', 'cd9': 'code 9', 'chk': 'check', 'cul8r': 'see you later', 
            'dam': 'donâ€™t annoy me', 'dd': 'dear daughter', 'df': 'dear fiance', 'ds': 'dear son', 'dyk': 'did you know', 'eml': 'email', 'ema': 'email address', 
            'f2f': 'face to face', 'ftf': 'face to face', 'ff': 'follow friday', 'ffs': "for fuck's sake", 'fml': 'fuck my life', 'fotd': 'find of the day', 
            'ftw': 'for the win', 'fubar': 'fucked up beyond all repair', 'fwiw': "for what it's worth", 'gmafb': 'give me a fucking break', 'gr8': 'great', 
            'gtfooh': 'get the fuck out of here', 'gts': 'guess the song', 'hagn': 'have a good night', 'hand': 'have a nice day', 'hotd': 'headline of the day', 
            'hth': 'hope that helps', 'ic': 'i see', 'icymi': 'in case you missed it', 'idk': "i don't know", 'iirc': 'if i remember correctly', 
            'imho': 'in my humble opinion', 'irl': 'in real life', 'iwsn': 'i want sex now', 'jk': 'joke', 'js': 'just saying', 'jsyk': 'just so you know', 
            'jv': 'joint venture', 'kk': 'okay', 'kyso': 'knock your socks off', 'lbs': 'laughing but serious', 'lhh': 'laugh hella hard', 'lmao': 'laughing my ass off', 
            'lmk': 'let me know', 'lo': 'little one', 'lol': 'laugh out loud', 'mm': 'music monday', 'mirl': 'meet in real life', 'mrjn': 'marijuana', 
            'msm': 'main stream media', 'mtf': 'more to follow', 'nbd': 'no big deal', 'nct': 'nobody cares though', 'nfi': 'no further information or not fucking interested', 
            'nfw': 'no fucking way', 'njoy': 'enjoy', 'nsfw': 'not safe for work', 'nts': 'note to self', 'oh': 'overheard', 'omfg': 'oh my fucking god', 
            'omg': 'oh my god', 'oomf': 'one of my friends', 'orly': 'oh really', 'plmk': 'please let me know', 'pnp': 'party and play', 'poidh': "pictures or it didn't happen", 
            'qotd': 'quote of the day', 're': 'in reply to', 'rlrt': 'real life retweet', 'rtfm': 'read the fucking manual', 'rtq': 'read the question', 'sfw': 'safe for work', 
            'smdh': 'shaking my damn head', 'smh': 'shaking my head', 'snafu': 'situation normal all fucked up', 'srs': 'serious', 'stfu': 'shut the fuck up', 
            'stfw': 'search the fucking web', 'tftf': 'thanks for the follow', 'tftt': 'thanks for this tweet', 'tj': 'tweetjack', 'tl': 'timeline', 'tldr': "too long didn't read", 
            'tmb': 'tweet me back', 'tt': 'trending topic', 'ty': 'thank you', 'tyia': 'thank you in advance', 'tyt': 'take your time', 'tyvm': 'thank you very much', 
            'w': 'with', 'wtv': 'whatever', 'ygtr': 'you got that right', 'ykwim': 'you know what i mean', 'ykyat': "you know you're addicted to", 
            'ymmv': 'your mileage may vary', 'yolo': 'you only live once', 'yoyo': "you're on your own", 'yw': "you're welcome"}


stops = stopwords.words('english')

def appendStopsNoApostrophes(wordList):
    newList = wordList
    for word in wordList:
        if word.find("'") != -1:
            newList.append(word.replace("'", ""))
            newList.remove(word)
    return newList

stops = appendStopsNoApostrophes(stops)

STOPWORDS = set(stops)

Working on the assumption that URLs user mentions add no meaning, remove these and any extra whitespace from the raw tweet data. Replace HTML symbols and emojis with english words.

In [88]:
def cleanTweet(tweet):
     # to lower case
    cleanTweet = tweet.lower()
    # remove user mentions
    cleanTweet = re.sub("(@[A-Za-z0-9_]+)","", cleanTweet)
    # remove urls
    cleanTweet = re.sub("https?://[A-Za-z0-9./]*","", cleanTweet)
    # replace common html symbols
    for key in htmlSymbols.keys():
        cleanTweet = cleanTweet.replace(key, htmlSymbols[key])
    #replace emojis with text translation
    for key in emojis.keys():
        cleanTweet = cleanTweet.replace(key, emojis[key])
     # remove non alpha characters
    cleanTweet = re.sub("[^a-zA-Z ]","", cleanTweet)
    #remove leading and trailing whitespace
    cleanTweet = cleanTweet.strip()
    # remove whitespaces
    cleanTweet = ' '.join(cleanTweet.split())
    return cleanTweet

def removeStops(tweet):
    return " ".join([word for word in str(tweet).split() if word not in STOPWORDS])


nlp = spacy.load("en_core_web_sm",  disable=['parser', 'ner'])

def lemmatize(tweet):
    # lemmatizing words
    doc = nlp(tweet)
    cleanTweet = " ".join([token.lemma_ for token in doc])
    return cleanTweet

def removeShortWords(tweet):
    return " ".join([word for word in str(tweet).split() if len(word) > 3])

# expand abbreviations / acronyms - translate common abbreviations to their translation
def translateAbbreviations(tweet):
    cleanTweet = tweet
    for key in abbreviationsDict.keys():
        cleanTweet = cleanTweet.replace(key, abbreviationsDict[key])


# method to replace negation can't --> can not

# method for repeated letter normalization = goooooooood --> good (no english word has mroe than two consecutive letters the same)


In [15]:
sentiment140Df['cleanTweet'] = sentiment140Df['tweet'].apply(lambda x:cleanTweet(x))
sentiment140Df.head(100)

Unnamed: 0,sentimentScore,tweet,label,cleanTweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,awww thats a bummer you shoulda got david car...
1,0,is upset that he can't update his Facebook by ...,negative,is upset that he cant update his facebook by t...
2,0,@Kenichan I dived many times for the ball. Man...,negative,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,negative,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",negative,no its not behaving at all im mad why am i her...
...,...,...,...,...
95,0,Strider is a sick little puppy http://apps.fa...,negative,strider is a sick little puppy
96,0,"so rylee,grace...wana go steve's party or not?...",negative,so ryleegracewana go steves party or not sadly...
97,0,"hey, I actually won one of my bracket pools! T...",negative,hey i actually won one of my bracket pools too...
98,0,"@stark YOU don't follow me, either and i work...",negative,you dont follow me either and i work for you
