In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
import re
import textblob
import nltk

# 1 - Data-Set

In [4]:
FILE_PATH = './TextMinning/Tweets_EN_sentiment.json'
DF = pd.read_json(FILE_PATH, orient = 'records', lines = True, encoding='utf8')

DF['label'] = np.where(DF['class']=='pos', 1, -1)
DF = DF.drop(['tweet', 'class'], axis = 1)
DF['text_len'] = [len(x.split()) for x in DF['text']]
DF = DF[DF['text_len'] > 2] 

# 2 - Pre-Processing 

The code for ours PPTs is made with variations of the Cell underneath.

In [6]:
#CODE FROM:
#https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90
from bs4 import BeautifulSoup

#stemmer = nltk.stem.PorterStemmer()
lemmer = nltk.stem.WordNetLemmatizer()
snowball_stemmer = nltk.stem.SnowballStemmer('english')

tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)

pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

def tweet_cleaner(text):
    #text = text.lower()
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    #stripped = re.sub(combined_pat, '', souped)
    #try:
    #    clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    #except:
    #   clean = stripped
    #letters_only = re.sub("[^a-zA-Z]", " ", clean)
    #letters_only = re.sub("[^a-zA-Z]", " ", souped)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tokenizer.tokenize(souped)
    #stemmed = [stemmer.stem(word) for word in words]
    lemmed = [lemmer.lemmatize(word) for word in words]
    #snowball = [snowball_stemmer.stem(word) for word in words]
    return lemmed
   
DF['text_cleaned'] = DF['text'].apply(lambda x: tweet_cleaner(x))

In [None]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    print(s)
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    print(tokens)
    return tokens

In [None]:
stop_words = set(nltk.corpus.stopwords.words('english'))
DF['text_cleaned'] = DF['text_cleaned'].apply(lambda x: [word for word in x if word not in stop_words])

# 3 - TextBlob over whole DS

In [11]:
DF['label_TB'] = DF['text'].apply(lambda x: textblob.TextBlob(x).sentiment.polarity)
DF['label_TB'] = np.where(DF['label_TB']>=0, 1, -1)

In [None]:
def acc_mesure(LABELS, TESTS):
    #print(LABELS.value_counts())
    ACC = LABELS - TESTS
    TOTAL = len(LABELS)
    print('\nACC from - (', LABELS.name, '/', TESTS.name, ')- total-', TOTAL)
    REAL_POS = len(LABELS[LABELS == 1])
    print(LABELS.name, '- positive -', REAL_POS, '- negative -', TOTAL - REAL_POS)
    TESTS_POS = len(TESTS[TESTS == 1])
    print(TESTS.name,'- positive -', TESTS_POS, '- negative -', TOTAL - TESTS_POS)
    HITS = len(ACC[ACC == 0])  
    print('Acurracy % of', round((HITS / TOTAL), 3)*100)

In [12]:
#acc_mesure(DF['label'], DF['label_TB'])
import sklearn.metrics as metrics
print("Accuracy: ", metrics.accuracy_score(DF['label'], DF['label_TB']))

Accuracy:  0.7968168752113629


# 4 - Lexicon Benchmark

In [13]:
LEXICON_DF = pd.read_csv('./TextMinning/NCR-lexicon.csv', delimiter=';')
LEXICON_DF['Value'] = ((LEXICON_DF['Negative'] * -1) + LEXICON_DF['Positive'])
LEXICON = LEXICON_DF[['English', 'Value']].set_index('English')['Value'].to_dict()

In [14]:
print(len(LEXICON))

counts = []
counts_lex = []

def apply_lex(tokens):
    lexSum = 0
    #tokens = nltk.bigrams(tokens)
    for word in tokens:
        counts.append(word)
        if word in LEXICON.keys():
            lexSum += LEXICON.get(word)
            counts_lex.append(word)
    return lexSum

14182


In [15]:
DF['lex_count'] = DF['text_cleaned'].apply(lambda row: apply_lex(row))
DF['label_lex'] = np.where(DF['lex_count'] >= 0, 1, -1)

import sklearn.metrics as metrics
print("Accuracy: ", metrics.accuracy_score(DF['label'], DF['label_lex']))

Accuracy:  0.7824230639161313


In [None]:
print(len(counts), len(counts_lex))
freqCounts = nltk.FreqDist(counts)
freqCountLex = nltk.FreqDist(counts_lex)
print(len(freqCounts), len(freqCountLex))

# 5 - Vectorize - With CountVector()

In [7]:
import collections
def count_words(tweets):
    WORDS_COUNT = {}
    for tweet in tweets:
        #HERE DEFINE IF BIGRAMS OR NOT!!!!
        #tweet = nltk.bigrams(tweet)
        for word in tweet:
            if word in WORDS_COUNT.keys():
                WORDS_COUNT[word] += 1
            else:
                WORDS_COUNT[word] = 1
    return collections.Counter(WORDS_COUNT)

def vectorize(tweet, freqs):
    valores = np.zeros([len(freqs)])
    for word in tweet:
        if word in freqs:
            #valores[freqs.get(word)] = 1 #SE VECTOR DE 1 OU 0
            valores[freqs.get(word)] = valores[freqs.get(word)] + 1 
    return valores

COUNT_VECT_DICT = count_words(DF['text_cleaned'])

# 6 - Experience

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import sklearn.metrics as metrics
ME_RES = []
SVD_RES = []
NB_RES = []

for value in [500, 1000, 2500, 5000, 7500, 10000, 12500, 15000, 17500, 20000]:
        
        print('Experience with', value, 'most used words.')
        EXP_DICT = COUNT_VECT_DICT.most_common(value)
        INDEX_DICT = {}
        
        print('Vectorizing Tweets!!!')
        for i, word in enumerate(EXP_DICT):
            INDEX_DICT[EXP_DICT[i][0]] = i 
        DF['vector'] = DF['text_cleaned'].apply(lambda row: vectorize(row, INDEX_DICT))

        print('Train/Test Splinting')
        X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = sk.model_selection.train_test_split(
                list(DF.vector),
                list(DF.label),
                train_size=0.80,
                test_size=0.20,
                random_state=1234)

        lr = LogisticRegression()
        model = lr.fit(X_TRAIN, Y_TRAIN)
        y_pred = model.predict(X_TEST)
        print('LogisticRegression - Accuracy:',  metrics.accuracy_score(Y_TEST, y_pred))
        ME_RES.append(metrics.accuracy_score(Y_TEST, y_pred))

        svmc = LinearSVC(max_iter=1000)
        model = svmc.fit(X_TRAIN, Y_TRAIN)
        y_pred = model.predict(X_TEST)
        print('Linear SVD - Accuracy: ', metrics.accuracy_score(Y_TEST, y_pred))
        SVD_RES.append(metrics.accuracy_score(Y_TEST, y_pred))

        nb = MultinomialNB()
        model = nb.fit(X_TRAIN, Y_TRAIN)
        y_pred = model.predict(X_TEST)
        print('Multinomial Naive Bayes - Accuracy: ', metrics.accuracy_score(Y_TEST, y_pred))
        NB_RES.append(metrics.accuracy_score(Y_TEST, y_pred))
        
        print('\n')

Experience with 500 most used words.
Vectorizing Tweets!!!
Train/Test Splinting
LogisticRegression - Accuracy: 0.8451865159040474
Linear SVD - Accuracy:  0.8444467927718482
Multinomial Naive Bayes - Accuracy:  0.839374405579626


Experience with 1000 most used words.
Vectorizing Tweets!!!
Train/Test Splinting
LogisticRegression - Accuracy: 0.8501532283630984
Linear SVD - Accuracy:  0.8494135052308993
Multinomial Naive Bayes - Accuracy:  0.840114128711825


Experience with 2500 most used words.
Vectorizing Tweets!!!
Train/Test Splinting
LogisticRegression - Accuracy: 0.8513156504279826
Linear SVD - Accuracy:  0.8505759272957836
Multinomial Naive Bayes - Accuracy:  0.8455035401035612


Experience with 5000 most used words.
Vectorizing Tweets!!!
Train/Test Splinting
LogisticRegression - Accuracy: 0.8530064461587235
Linear SVD - Accuracy:  0.8420162739089084
Multinomial Naive Bayes - Accuracy:  0.8496248546972419


Experience with 7500 most used words.
Vectorizing Tweets!!!
Train/Test Spli

# SkLearn Models

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
#tokenize = TweetTokenizer(strip_handles=True, reduce_len=True)
#tokenize = lambda doc: doc.lower().split(" ")

#sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
sklearn_tfidf = TfidfVectorizer()
sklearn_representation = sklearn_tfidf.fit_transform(list(DF['text']))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
tokenize = lambda doc: doc.lower().split(" ")

sklearn_count = CountVectorizer(analyzer='word', binary=False, encoding='utf-8', 
                                input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
sklearn_representation = sklearn_count.fit_transform(list(DF['text']))

In [17]:
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = sk.model_selection.train_test_split(
        sklearn_representation,
        list(DF.label),
        train_size=0.80,
        test_size=0.20,
        random_state=1234)

In [18]:
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

lr = LogisticRegression()
model = lr.fit(X_TRAIN, Y_TRAIN)
y_pred = model.predict(X_TEST)
print("Accuracy: ", metrics.accuracy_score(Y_TEST, y_pred))

Accuracy:  0.8572334354855754


In [19]:
from sklearn.svm import LinearSVC
import sklearn.metrics as metrics

svmc = LinearSVC(max_iter=1000)
model = svmc.fit(X_TRAIN, Y_TRAIN)
y_pred = model.predict(X_TEST)
print("Accuracy: ", metrics.accuracy_score(Y_TEST, y_pred))

Accuracy:  0.858501532283631


In [20]:
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics

nb = MultinomialNB()
model = nb.fit(X_TRAIN, Y_TRAIN)
y_pred = model.predict(X_TEST)
print("Accuracy: ", metrics.accuracy_score(Y_TEST, y_pred))

Accuracy:  0.8329282468561767


# TF-IDF

In [None]:
import collections
#docs = []

def tf_idf(tweet):
    doc = collections.Counter() 
    for word in tweet:
         doc[word] += 1  
    return(doc)

TDIDF_COUNT = DF['text_cleaned'].apply(lambda row: tf_idf(row))

def tf_idf2(docs):
    tf = collections.Counter()
    df = collections.Counter()
    for d in docs:
        #print(type(d))
        for w in d:
            tf[w] += d[w]
            df[w] += 1
    idfs = {}
    for w in tf:
        if tf[w] > 2:
            idfs[w] = np.log(len(DF)/df[w])
    #print(idfs)
    return sorted(idfs, key=idfs.get, reverse=True)

DICT = tf_idf2(list(DF['tfidf']))

In [None]:
def vectorize(tweet, freqs):
    print(freqs)
    valores = np.zeros([len(freqs)])
    for word in tweet:
        if word in freqs:
            valores[freqs.get(word)] = 1 #SE VECTOR DE 1 OU 0
            #valores[freqs.get(word)] = valores[freqs.get(word)] + 1 
    return valores

INDEX_DICT

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import sklearn.metrics as metrics

ME_RES = []
SVD_RES = []
NB_RES = []

for value in [500, 1000, 2500, 5000, 7500, 10000, 12500, 15000, 17500, 20000]:
    
        print('Experience with', value, 'most used words.')
        #EXP_DICT = DICT.most_common(value)
        EXP_DICT = tf_idf2(list(DF['tfidf']))
        EXP_DICT = EXP_DICT[:value]
        INDEX_DICT = {}
        
        print('Vectorizing Tweets!!!')
        for i, word in enumerate(EXP_DICT):
            INDEX_DICT[EXP_DICT[i][0]] = i 
        DF['vector'] = DF['text_cleaned'].apply(lambda row: vectorize(row, INDEX_DICT))

        print('Train/Test Splinting')
        X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = sk.model_selection.train_test_split(
                list(DF.vector),
                list(DF.label),
                train_size=0.80,
                test_size=0.20,
                random_state=1234)

        lr = LogisticRegression()
        model = lr.fit(X_TRAIN, Y_TRAIN)
        y_pred = model.predict(X_TEST)
        print('LogisticRegression - Accuracy:',  metrics.accuracy_score(Y_TEST, y_pred))
        ME_RES.append(metrics.accuracy_score(Y_TEST, y_pred))

        svmc = LinearSVC(max_iter=1000)
        model = svmc.fit(X_TRAIN, Y_TRAIN)
        y_pred = model.predict(X_TEST)
        print('Linear SVD - Accuracy: ', metrics.accuracy_score(Y_TEST, y_pred))
        SVD_RES.append(metrics.accuracy_score(Y_TEST, y_pred))

        nb = MultinomialNB()
        model = nb.fit(X_TRAIN, Y_TRAIN)
        y_pred = model.predict(X_TEST)
        print('Multinomial Naive Bayes - Accuracy: ', metrics.accuracy_score(Y_TEST, y_pred))
        NB_RES.append(metrics.accuracy_score(Y_TEST, y_pred))
        
        print('\n')