In [4]:
import string
import copy

import pandas as pd
import numpy as np
import nltk
import re

import IPython
from IPython.display import clear_output

# Emoji library (for demojization)
import emoji
emojis = list(emoji.EMOJI_DATA.keys())

# Language Detection
import spacy
import spacy_fastlang # is used
from spacy_langdetect import LanguageDetector

# Stopwords to remove
from nltk.corpus import stopwords as sw
stopwords = sw.words('English')
stopwords.remove('not')

from sklearn import feature_extraction, model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes, ensemble, tree

import xgboost
import warnings
warnings.filterwarnings('ignore')

In [5]:
def handle_emojis(lst):
    res = []
    for sentence in lst:
        words = sentence.split(' ')
        new_words = []
        for word in words:
            for char in word:
                if char in emojis:
                    word.replace(char, "")
            new_words.append(word)
        sentence = ' '.join(new_words)
        res.append(sentence)
    return res
        

def clean_sentences(lst):
    ''' Cleans up a list of tweets.
    Removes: Links, tags, retweets, emojis'''
    res = []
    for sentence in lst:
        sentence = sentence.lower()     # lower-cases sentence
        
        words = sentence.split(' ')
        new_words = copy.deepcopy(words)
        
        for word in words: # Iterates through each word in the tweet
            if len(word) == 0:
                new_words.remove(word)
            elif word[:4] == "http":    # Removes links
                new_words.remove(word)
            elif word[0] == "@":        # Removes tags
                new_words.remove(word)
            elif word in stopwords:     # Removes stopwords
                new_words.remove(word)
            elif word[:2] == "rt":      # Removes retweets
                new_words.remove(word)
            elif word[:2] == "\n":      # Removes line breaks
                new_words.remove(word)
                                        
        sentence = " ".join(new_words)
        # sentence = re.sub(string.punctuation, '', sentence) # Removes punctuation
        for chr in string.punctuation:
            sentence = sentence.replace(chr, "")
        
        res.append(sentence)
    return res

def clean_words(lst):
    ''' Removes: empty spaces'''
    res = []
    for sentence in lst:
        words = sentence.split(' ')
        new_words = []
        
        for word in words:
            for char in word: # iterates through each character in the tweet
                if char == " ":
                    word.replace(char, "")
            new_words.append(word)
            
        sentence = " ".join(new_words)
        res.append(sentence) 
    return res
    
def remove_non_english(lst):
    ''' Removes all content that is NOT english from a list of tweets'''
    langs = []
    res = copy.deepcopy(lst)
    
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("language_detector")
    
    for item in lst: # iterates through each sentence
        doc = nlp(item)
        lang = doc._.language
        if lang != 'en':
            res.remove(item)        # removes non-english sentences
        langs.append(lang)
    # lang_labels['language'] = langs # secondary side-effect
    return res

def stem(lst):
    '''Stems words to use basic stem word (e.g turn instead of turning)'''
    ps = nltk.stem.PorterStemmer()
    res = []
    for sentence in lst:
        new_sentence = []
        for word in sentence.split():
            new_word = ps.stem(word)
            new_sentence.append(new_word)
        new_sentence = " ".join(new_sentence)
        res.append(new_sentence)
    return res

In [6]:
raw = pd.read_csv('data/labeled/sample.csv', header=0)
data = copy.deepcopy(list(raw['text']))

In [7]:
lang_labels = pd.DataFrame(data, columns=['tweet content'])

data = handle_emojis(data)
data = clean_sentences(data)
# data = remove_non_english(data)
data = stem(data)

In [8]:
raw['text'] = data

## Basic Model

In [9]:
FORECAST_MODE = False

In [10]:
cv = feature_extraction.text.CountVectorizer()

X = cv.fit_transform(raw['text']).toarray()
y = raw['sentiment']

In [11]:
dict_classifiers = {
    'xgb': xgboost.XGBClassifier(),
    'gaussian_nb': naive_bayes.GaussianNB(),
    'multi_nb': naive_bayes.MultinomialNB(),
    'rforest': ensemble.RandomForestClassifier(),
    'entropy_rforest': ensemble.RandomForestClassifier(criterion='entropy'),
    'decision_tree': tree.DecisionTreeClassifier()
}

choice = 'gaussian_nb'

classifier = dict_classifiers[choice]

In [12]:
if not FORECAST_MODE:
    for name in dict_classifiers:
        clf = dict_classifiers[name]
        scores = scores = model_selection.cross_val_score(clf, X, y, cv=5)
        score = np.average(scores)
        print(f"Score of {name}: {score}")
else:
    # Export stuff
    pass

Score of xgb: 0.9811738648947952
Score of gaussian_nb: 0.8638981173864895
Score of multi_nb: 0.9337763012181617
Score of rforest: 0.9811738648947952
Score of entropy_rforest: 0.9811738648947952
Score of decision_tree: 0.9765227021040974
