In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import math
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
train = pd.read_csv('processed_tweets_stemm/train.csv')
validation = pd.read_csv('processed_tweets_stemm/validation.csv')
test = pd.read_csv('processed_tweets_stemm/test.csv')

In [4]:
train.rename(columns={'texto_procesado': 'texto_normalizado'}, inplace=True)
validation.rename(columns={'texto_procesado': 'texto_normalizado'}, inplace=True)
test.rename(columns={'texto_procesado': 'texto_normalizado'}, inplace=True)

In [5]:
train.dropna(subset=['texto_normalizado'], inplace=True)
validation.dropna(subset=['texto_normalizado'], inplace=True)
test.dropna(subset=['texto_normalizado'], inplace=True)

In [6]:
def conteo_palabras(tweets):
    conteo_palabras = defaultdict(int)

    for tweet in tweets:
        tokens = word_tokenize(tweet)

        for token in tokens:
            conteo_palabras[token] += 1
    
    return conteo_palabras

conteo_palabras_positivas = conteo_palabras(train[train['sentimiento'] == 2]['texto_normalizado'])

conteo_palabras_negativas = conteo_palabras(train[train['sentimiento'] == 1]['texto_normalizado'])

conteo_palabras_neutrales = conteo_palabras(train[train['sentimiento'] == 0]['texto_normalizado'])

In [7]:
palabras_set = set(conteo_palabras_positivas.keys()).union(set(conteo_palabras_negativas.keys()))
palabras_set = palabras_set.union(set(conteo_palabras_neutrales.keys()))
palabras_set

{"'virgen",
 "'vad",
 "'suscripcion",
 "'creador",
 "'liderazg",
 "'chul",
 "'aranjuez",
 "'membret",
 "'boletin",
 "'exprim",
 "'ucid",
 "'enrer",
 "'to2",
 "'nomequeriadespert",
 "'pais.y",
 "'desus",
 "'antoni",
 "'reviv",
 "'ultim",
 "'beats",
 "'garch",
 "'publicit",
 "'empiez",
 "'division",
 "'ascazonivelmaxim",
 "'ahahahahah",
 "'merend",
 "'descoloc",
 "'virtud",
 "'baratit",
 "'sism",
 "'delvpt",
 "'buchisap",
 "'roqy",
 "'determin",
 "'conflict",
 "'sobrecog",
 "'estrell",
 "'20h.act",
 "'resum",
 "'literal",
 "'gordeon",
 "'taz",
 "'nervi",
 "'soseurop",
 "'anteproyect",
 "'extend",
 "'lissavetzky",
 "'llor",
 "'fueg",
 "'nell",
 "'cameron",
 "'supend",
 "'silmarillion",
 "'norz",
 '0-0',
 "'29m",
 "'acos",
 "'marisc",
 "'chel",
 "'barc",
 "'krat",
 "'curi",
 "'chever",
 "'oracul",
 "'robert",
 "'amor",
 "'city",
 "'trimestr",
 "'enhorabuena",
 "'materializ",
 "'mucha",
 "'mu",
 "'moller",
 "'precios",
 "'taan",
 "'norteamerican",
 "'hormiguer",
 "'sinvergüenz",
 "'vison",


In [8]:
def count_tweets(tweets, ys):
    result = {}

    for y, tweet in zip(ys, tweets):
        for word in word_tokenize(tweet):
            pair = (word,y)

            if pair in result:
                result[pair] += 1

            else:
                result[pair] = 1

    return result

In [9]:
def lookup(freqs, word, label):
    n = 0  

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [10]:
def train_naive_bayes1(freqs, train_x, train_y):
    loglikelihood_pos = {}
    loglikelihood_neg = {}
    loglikelihood_neu = {}

    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    N_pos = N_neg = N_neu = 0
    for pair in freqs.keys():
        if pair[1] == 2:
            N_pos += freqs[pair]
        elif pair[1] == 1:
            N_neg += freqs[pair]
        else:
            N_neu += freqs[pair]
    
    logprior_pos = np.log(N_pos / (N_pos + N_neg + N_neu))
    logprior_neg = np.log(N_neg / (N_pos + N_neg + N_neu))
    logprior_neu = np.log(N_neu / (N_pos + N_neg + N_neu))

    for word in vocab:
        freq_pos =lookup(freqs,word,2)
        freq_neg =lookup(freqs,word,1)
        freq_neu =lookup(freqs,word,0)

        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        p_w_neu = (freq_neu + 1) / (N_neu + V)

        loglikelihood_pos[word] = np.log(p_w_pos) - np.log(p_w_neg + p_w_neu)
        loglikelihood_neg[word] = np.log(p_w_neg) - np.log(p_w_pos + p_w_neu)
        loglikelihood_neu[word] = np.log(p_w_neu) - np.log(p_w_pos + p_w_neg)


    return loglikelihood_pos, loglikelihood_neg, loglikelihood_neu, logprior_pos, logprior_neg, logprior_neu

In [11]:
def naive_bayes_predict1(tweet, logprior_pos, logprior_neg, logprior_neu, loglikelihood_pos,loglikelihood_neg, loglikelihood_neu):
    word_l = word_tokenize(tweet)

    p_pos = logprior_pos
    p_neg = logprior_neg
    p_neu = logprior_neu

    for word in word_l:
        if word in loglikelihood_pos:
            p_pos += loglikelihood_pos[word]
        if word in loglikelihood_neg:
            p_neg += loglikelihood_neg[word]
        if word in loglikelihood_neu:
            p_neu += loglikelihood_neu[word]
            
    return {2: p_pos, 1: p_neg, 0: p_neu}


In [12]:
freqs1 = count_tweets(train.texto_normalizado, train.sentimiento)
loglikelihood_pos, loglikelihood_neg, loglikelihood_neu, logprior_pos, logprior_neg, logprior_neu = train_naive_bayes1(freqs1, train.texto_normalizado, train.sentimiento)

In [13]:
validation['prediccion'] = validation['texto_normalizado'].apply(lambda x: max(naive_bayes_predict1(x,logprior_pos, logprior_neg, logprior_neu, loglikelihood_pos,loglikelihood_neg, loglikelihood_neu), key=naive_bayes_predict1(x, logprior_pos, logprior_neg, logprior_neu, loglikelihood_pos,loglikelihood_neg, loglikelihood_neu).get))

In [14]:
validation

Unnamed: 0.1,Unnamed: 0,texto,sentimiento,texto_normalizado,prediccion
0,7450,@marianorajoy Estamos muy satisfechos,2,['satisfech'],0
1,1332,Nada mejor que pasar la navidad con la familia...,2,"['mejor', 'pas', 'navid', 'famili', 'amig', 'u...",2
2,6436,Planeta creativo: nuevo placer otra interesant...,2,"['planet', 'creativ', 'nuev', 'plac', 'interes...",2
3,8973,Necesito ir a mi hogar y dormir con mis dos hi...,1,"['necesit', 'ir', 'hog', 'dorm', 'dos', 'hij',...",1
4,803,En la XII edición Premios Culturas de Extremad...,2,"['xii', 'edicion', 'premi', 'cultur', 'extrema...",2
...,...,...,...,...,...
3409,15367,Bajando a Calahorra ya no queda nada para el c...,2,"['baj', 'calahorr', 'qued', 'cambi', 'junt', '...",0
3410,10591,"Es que estos dolores, son bárbaros...",1,"['dolor', 'barbar', '...']",1
3411,2111,"#ChacónenLaSER : ""Me siento con capacidad y co...",2,"['chaconenlas', 'sient', 'capac', 'equip', 'su...",1
3412,14924,La Prima de Riesgo es 1 estafa de los Mercados...,0,"['prim', 'riesg', '1', 'estaf', 'merc', 'bast'...",1


In [15]:
validation[validation['sentimiento'] == validation['prediccion']].prediccion.value_counts()

prediccion
1    808
2    759
0    397
Name: count, dtype: int64

In [16]:
validation[validation['sentimiento'] != validation['prediccion']].sentimiento.value_counts()

sentimiento
0    704
1    374
2    372
Name: count, dtype: int64

In [17]:

# Matriz de confusión
cm = confusion_matrix(validation['sentimiento'], validation['prediccion'])

precision = precision_score(validation['sentimiento'], validation['prediccion'], average=None)
recall = recall_score(validation['sentimiento'], validation['prediccion'], average=None)
accuracy = accuracy_score(validation['sentimiento'], validation['prediccion'])

f1 = f1_score(validation['sentimiento'], validation['prediccion'], average=None)

print("Matriz de confusión:")
print(cm)
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)


Confusion Matrix:
[[397 398 306]
 [219 808 155]
 [179 193 759]]
Precision: [0.49937107 0.5775554  0.62213115]
Recall: [0.36058129 0.68358714 0.67108753]
Accuracy: 0.5752782659636789
F1 Score: [0.41877637 0.62611391 0.64568269]


## Modelo sin neutrales

In [18]:
train_nb = train[train['sentimiento'] != 0]
validation_nb = validation[validation['sentimiento'] != 0]

In [19]:
train_nb['sentimiento'] = train_nb['sentimiento'].apply(lambda x: 1 if x == 2 else 0)
validation_nb['sentimiento'] = validation_nb['sentimiento'].apply(lambda x: 1 if x == 2 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_nb['sentimiento'] = train_nb['sentimiento'].apply(lambda x: 1 if x == 2 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_nb['sentimiento'] = validation_nb['sentimiento'].apply(lambda x: 1 if x == 2 else 0)


In [20]:
train_nb

Unnamed: 0.1,Unnamed: 0,texto,sentimiento,texto_normalizado
0,10221,"@Manuellflorod Bienvenida (triste) realidad, a...",0,"['bienven', 'trist', 'realid', 'andam', 'mism']"
1,9565,Estar en los brazos de mi novio es lo único qu...,1,"['braz', 'novi', 'unic', 'quier', 'necesit']"
3,7713,@ahorapodemos @ierrejon Tambien que las empres...,1,"['tambi', 'empres', 'independient', 'contact',..."
5,1038,El tono duro y sin concesiones de la réplica d...,0,"['ton', 'dur', 'concesion', 'replic', 'rajoy',..."
7,10,Bdías. EM no se ira de puente. Si vosotros os ...,1,"['bdi', 'em', 'ira', 'puent', 'si', 'vais', 'd..."
...,...,...,...,...
10236,11285,"No, solo quiero bailarte la medusa loca en pri...",1,"['sol', 'quier', 'bailart', 'medus', 'loc', 'p..."
10237,11965,@cuervotinelli lo único que te puse fue que qu...,0,"['unic', 'pus', 'quer', 'jug', 'lol', 'tortug'..."
10238,5390,La Audiencia Nacional condena a 20 años de pri...,0,"['audienci', 'nacional', 'conden', '20', 'años..."
10239,860,RT @eP_Titulares: #Sociedad Muere apuñalada la...,0,"['socied', 'muer', 'apuñal', 'propietari', 'za..."


In [21]:
def lookup(freqs, word, label):
    n = 0  

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [22]:
def train_naive_bayes(freqs, train_x, train_y):
    loglikelihood = {}
    logprior = 0
    vocab = palabras_set
    V = len(vocab)
    N_pos = N_neg = 0

    for pair in freqs.keys():
        if pair[1] > 0:
            N_pos += freqs[pair]

        else:
            N_neg += freqs[pair]

    D = len(train_y)
    D_pos =np.sum(train_y)
    D_neg = D-D_pos
    logprior =np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos =lookup(freqs,word,1)
        freq_neg =lookup(freqs,word,0)

        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos/p_w_neg)


    return logprior, loglikelihood

In [23]:
freqs = count_tweets(train_nb.texto_normalizado, train_nb.sentimiento)

In [24]:
logprior, loglikelihood = train_naive_bayes(freqs, train_nb.texto_normalizado, train_nb.sentimiento)

In [25]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = word_tokenize(tweet)
    p = 0
    p += logprior

    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]
    return p

In [26]:
my_tweet = 'pesimo horrible bueno hermoso'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)

In [27]:
p

-0.01163480256770022

In [28]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    accuracy = 0 
    y_hats = []
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0

        y_hats.append(y_hat_i)

    error = np.mean(np.absolute(y_hats-test_y))
    accuracy = 1-error

    return accuracy

In [29]:
test_naive_bayes(validation_nb.texto_normalizado, validation_nb.sentimiento, logprior, loglikelihood)

0.7929096411586684

In [30]:
validation_nb['prediccion'] = validation_nb.texto_normalizado.apply(lambda x: 1 if naive_bayes_predict(x, logprior, loglikelihood) > 0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_nb['prediccion'] = validation_nb.texto_normalizado.apply(lambda x: 1 if naive_bayes_predict(x, logprior, loglikelihood) > 0 else 0)


In [31]:
validation_nb['puntaje'] = validation_nb.texto_normalizado.apply(lambda x: naive_bayes_predict(x, logprior, loglikelihood))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_nb['puntaje'] = validation_nb.texto_normalizado.apply(lambda x: naive_bayes_predict(x, logprior, loglikelihood))


In [32]:
validation_nb

Unnamed: 0.1,Unnamed: 0,texto,sentimiento,texto_normalizado,prediccion,puntaje
0,7450,@marianorajoy Estamos muy satisfechos,1,['satisfech'],1,0.800715
1,1332,Nada mejor que pasar la navidad con la familia...,1,"['mejor', 'pas', 'navid', 'famili', 'amig', 'u...",1,3.152403
2,6436,Planeta creativo: nuevo placer otra interesant...,1,"['planet', 'creativ', 'nuev', 'plac', 'interes...",1,6.140234
3,8973,Necesito ir a mi hogar y dormir con mis dos hi...,0,"['necesit', 'ir', 'hog', 'dorm', 'dos', 'hij',...",0,-1.191930
4,803,En la XII edición Premios Culturas de Extremad...,1,"['xii', 'edicion', 'premi', 'cultur', 'extrema...",1,5.416284
...,...,...,...,...,...,...
3408,8116,@nataliaprzc Con Liberación se vive mejor,1,"['liber', 'viv', 'mejor']",0,-0.339169
3409,15367,Bajando a Calahorra ya no queda nada para el c...,1,"['baj', 'calahorr', 'qued', 'cambi', 'junt', '...",1,1.845549
3410,10591,"Es que estos dolores, son bárbaros...",0,"['dolor', 'barbar', '...']",0,-3.061629
3411,2111,"#ChacónenLaSER : ""Me siento con capacidad y co...",1,"['chaconenlas', 'sient', 'capac', 'equip', 'su...",0,-0.282432


In [33]:
len(validation_nb[validation_nb['sentimiento'] == validation_nb['prediccion']])

1834

In [35]:
#Matriz de confusión
cm = confusion_matrix(validation_nb['sentimiento'], validation_nb['prediccion'])

precision = precision_score(validation_nb['sentimiento'], validation_nb['prediccion'], average=None)
recall = recall_score(validation_nb['sentimiento'], validation_nb['prediccion'], average=None)
accuracy = accuracy_score(validation_nb['sentimiento'], validation_nb['prediccion'])

f1 = f1_score(validation_nb['sentimiento'], validation_nb['prediccion'], average=None)

print("Matriz de confusión:")
print(cm)
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)


Confusion Matrix:
[[966 216]
 [263 868]]
Precision: [0.78600488 0.80073801]
Recall: [0.81725888 0.76746242]
Accuracy: 0.7929096411586684
F1 Score: [0.80132725 0.78374718]
