In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import math
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
train = pd.read_csv('processed_tweets_lemma/train.csv')
validation = pd.read_csv('processed_tweets_lemma/validation.csv')
test = pd.read_csv('processed_tweets_lemma/test.csv')

In [5]:
train.dropna(subset=['texto_normalizado'], inplace=True)
validation.dropna(subset=['texto_normalizado'], inplace=True)
test.dropna(subset=['texto_normalizado'], inplace=True)

In [6]:
def conteo_palabras(tweets):
    conteo_palabras = defaultdict(int)

    for tweet in tweets:
        tokens = word_tokenize(tweet)

        for token in tokens:
            conteo_palabras[token] += 1
    
    return conteo_palabras

conteo_palabras_positivas = conteo_palabras(train[train['sentimiento'] == 2]['texto_normalizado'])

conteo_palabras_negativas = conteo_palabras(train[train['sentimiento'] == 1]['texto_normalizado'])

conteo_palabras_neutrales = conteo_palabras(train[train['sentimiento'] == 0]['texto_normalizado'])

In [7]:
palabras_set = set(conteo_palabras_positivas.keys()).union(set(conteo_palabras_negativas.keys()))
palabras_set = palabras_set.union(set(conteo_palabras_neutrales.keys()))
palabras_set

{'tun',
 'dulce',
 'rbca',
 'consultarlas',
 'conderuiz',
 'mobytohglajs',
 'ño',
 'trabajo',
 'guitarrista',
 'prat',
 'multicolor',
 'clinico',
 'generador',
 'mafia',
 'firmarmelo',
 'definido',
 'mitinaluche',
 'indignado',
 'papaya',
 'tndrian',
 'ramo',
 'madrugastes',
 'oposición',
 'vivos',
 'kindle',
 'sabra',
 'mary',
 'economicamente',
 'rprodciendo',
 'niemeyer',
 'encontrarmelo',
 'participante',
 'bautismo',
 'chivasme',
 'mucuaa',
 'sumarunir',
 'custodia',
 'rociero',
 'careta',
 'marido',
 'pasivort',
 'selva',
 'reinsercion',
 'rabane',
 'condenado',
 'bunker',
 'mishima',
 'servicio',
 'paleorefutado',
 'chikorita',
 'restaurant',
 'añoras',
 'abundancia',
 'oleee',
 'prostituido',
 'tag',
 'redescubrir',
 'acostarse',
 'panfila',
 'compañero',
 'ayunt',
 'protestar',
 'postre',
 'atmosferica',
 'merecia',
 'calderon',
 'noooo',
 'esperado',
 'vaeo',
 'leucemia',
 'concreto',
 'villaverde',
 'reixa',
 'formato',
 'mitologia',
 'camacho',
 'espanahay',
 'amo',
 'cresp

In [8]:
def count_tweets(tweets, ys):
    result = {}

    for y, tweet in zip(ys, tweets):
        for word in word_tokenize(tweet):
            pair = (word,y)

            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1

    return result

In [9]:
def lookup(freqs, word, label):
    n = 0  

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [10]:
def train_naive_bayes1(freqs, train_x, train_y):
    loglikelihood_pos = {}
    loglikelihood_neg = {}
    loglikelihood_neu = {}

    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    N_pos = N_neg = N_neu = 0
    for pair in freqs.keys():
        if pair[1] == 2:
            N_pos += freqs[pair]
        elif pair[1] == 1:
            N_neg += freqs[pair]
        else:
            N_neu += freqs[pair]
    
    logprior_pos = np.log(N_pos / (N_pos + N_neg + N_neu))
    logprior_neg = np.log(N_neg / (N_pos + N_neg + N_neu))
    logprior_neu = np.log(N_neu / (N_pos + N_neg + N_neu))

    for word in vocab:
        freq_pos =lookup(freqs,word,2)
        freq_neg =lookup(freqs,word,1)
        freq_neu =lookup(freqs,word,0)

        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        p_w_neu = (freq_neu + 1) / (N_neu + V)

        loglikelihood_pos[word] = np.log(p_w_pos) - np.log(p_w_neg + p_w_neu)
        loglikelihood_neg[word] = np.log(p_w_neg) - np.log(p_w_pos + p_w_neu)
        loglikelihood_neu[word] = np.log(p_w_neu) - np.log(p_w_pos + p_w_neg)

    return loglikelihood_pos, loglikelihood_neg, loglikelihood_neu, logprior_pos, logprior_neg, logprior_neu

In [11]:
def naive_bayes_predict1(tweet, logprior_pos, logprior_neg, logprior_neu, loglikelihood_pos,loglikelihood_neg, loglikelihood_neu):
    word_l = word_tokenize(tweet)

    p_pos = logprior_pos
    p_neg = logprior_neg
    p_neu = logprior_neu

    for word in word_l:

        if word in loglikelihood_pos:
            p_pos += loglikelihood_pos[word]
        if word in loglikelihood_neg:
            p_neg += loglikelihood_neg[word]
        if word in loglikelihood_neu:
            p_neu += loglikelihood_neu[word]

    return {2: p_pos, 1: p_neg, 0: p_neu}


In [12]:
freqs1 = count_tweets(train.texto_normalizado, train.sentimiento)
loglikelihood_pos, loglikelihood_neg, loglikelihood_neu, logprior_pos, logprior_neg, logprior_neu = train_naive_bayes1(freqs1, train.texto_normalizado, train.sentimiento)

In [14]:
validation['prediccion'] = validation['texto_normalizado'].apply(lambda x: max(naive_bayes_predict1(x,logprior_pos, logprior_neg, logprior_neu, loglikelihood_pos,loglikelihood_neg, loglikelihood_neu), key=naive_bayes_predict1(x, logprior_pos, logprior_neg, logprior_neu, loglikelihood_pos,loglikelihood_neg, loglikelihood_neu).get))

In [15]:
validation

Unnamed: 0.1,Unnamed: 0,texto,sentimiento,texto_normalizado,prediccion
0,7450,@marianorajoy Estamos muy satisfechos,2,satisfecho,0
1,1332,Nada mejor que pasar la navidad con la familia...,2,mejor pasar navidad familia amigo uds pasar,2
2,6436,Planeta creativo: nuevo placer otra interesant...,2,planeta creativo nuevo placer interesante jorn...,2
3,8973,Necesito ir a mi hogar y dormir con mis dos hi...,1,necesitar ir hogar dormir dos hijo peludos kab...,1
4,803,En la XII edición Premios Culturas de Extremad...,2,xii edicion premio culturas extremadura galard...,2
...,...,...,...,...,...
3409,15367,Bajando a Calahorra ya no queda nada para el c...,2,bajar calahorra quedar cambio junto salir adel...,0
3410,10591,"Es que estos dolores, son bárbaros...",1,dolor barbaros,1
3411,2111,"#ChacónenLaSER : ""Me siento con capacidad y co...",2,chaconenlaser sentar capacidad equipo superman...,1
3412,14924,La Prima de Riesgo es 1 estafa de los Mercados...,0,prima riesgo estafa mercado bastar ppsoe basta...,1


In [16]:
validation[validation['sentimiento'] == validation['prediccion']].prediccion.value_counts()

prediccion
1    767
2    746
0    370
Name: count, dtype: int64

In [17]:
validation[validation['sentimiento'] != validation['prediccion']].sentimiento.value_counts()

sentimiento
0    724
1    415
2    383
Name: count, dtype: int64

In [18]:
# Matriz de confusión
cm = confusion_matrix(validation['sentimiento'], validation['prediccion'])

precision = precision_score(validation['sentimiento'], validation['prediccion'], average=None)
recall = recall_score(validation['sentimiento'], validation['prediccion'], average=None)
accuracy = accuracy_score(validation['sentimiento'], validation['prediccion'])

f1 = f1_score(validation['sentimiento'], validation['prediccion'], average=None)

print("Confusion Matrix:")
print(cm)
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)


Confusion Matrix:
[[370 374 350]
 [233 767 182]
 [184 199 746]]
Precision: [0.47013977 0.57238806 0.58372457]
Recall: [0.33820841 0.64890017 0.66076174]
Accuracy: 0.5530102790014684
F1 Score: [0.39340776 0.60824742 0.61985875]


## Modelo sin neutrales

In [19]:
train_nb = train[train['sentimiento'] != 0]
validation_nb = validation[validation['sentimiento'] != 0]

In [20]:
train_nb['sentimiento'] = train_nb['sentimiento'].apply(lambda x: 1 if x == 2 else 0)
validation_nb['sentimiento'] = validation_nb['sentimiento'].apply(lambda x: 1 if x == 2 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_nb['sentimiento'] = train_nb['sentimiento'].apply(lambda x: 1 if x == 2 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_nb['sentimiento'] = validation_nb['sentimiento'].apply(lambda x: 1 if x == 2 else 0)


In [21]:
train_nb

Unnamed: 0.1,Unnamed: 0,texto,sentimiento,texto_normalizado
0,10221,"@Manuellflorod Bienvenida (triste) realidad, a...",0,bienvenida triste realidad andar mismo
1,9565,Estar en los brazos de mi novio es lo único qu...,1,brazo novio unico querer necesitar
3,7713,@ahorapodemos @ierrejon Tambien que las empres...,1,tambien empresa independiente contactar si ...
5,1038,El tono duro y sin concesiones de la réplica d...,0,tono duro concesión replica rajoy amaiur anunc...
7,10,Bdías. EM no se ira de puente. Si vosotros os ...,1,bdias em ira puente si ir dejeis llevar tablet...
...,...,...,...,...
10236,11285,"No, solo quiero bailarte la medusa loca en pri...",1,solo querer bailarte medusa loca privado
10237,11965,@cuervotinelli lo único que te puse fue que qu...,0,unico poner queria jugar lolo tortuga retwitte...
10238,5390,La Audiencia Nacional condena a 20 años de pri...,0,audiencia nacional condena año prision ex jefe...
10239,860,RT @eP_Titulares: #Sociedad Muere apuñalada la...,0,sociedad muere apuñalado propietario zapateria...


In [22]:
def lookup(freqs, word, label):
    n = 0  

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [23]:
def train_naive_bayes(freqs, train_x, train_y):
    loglikelihood = {}
    logprior = 0

    vocab = palabras_set
    V = len(vocab)

    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1] > 0:
            N_pos += freqs[pair]
        else:
            N_neg += freqs[pair]

    D = len(train_y)
    D_pos =np.sum(train_y)
    D_neg = D-D_pos

    logprior =np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos =lookup(freqs,word,1)
        freq_neg =lookup(freqs,word,0)

        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood

In [24]:
freqs = count_tweets(train_nb.texto_normalizado, train_nb.sentimiento)

In [25]:
logprior, loglikelihood = train_naive_bayes(freqs, train_nb.texto_normalizado, train_nb.sentimiento)

In [26]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = word_tokenize(tweet)
    p = 0
    p += logprior

    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]
    return p

In [27]:
my_tweet = 'pesimo horrible bueno hermoso'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)

In [28]:
p

-0.4543217422057779

In [29]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    accuracy = 0  
    y_hats = []
    for tweet in test_x:
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0

        y_hats.append(y_hat_i)

    error = np.mean(np.absolute(y_hats-test_y))
    accuracy = 1-error

    return accuracy

In [30]:
test_naive_bayes(validation_nb.texto_normalizado, validation_nb.sentimiento, logprior, loglikelihood)

0.7767200346170489

In [31]:
validation_nb['prediccion'] = validation_nb.texto_normalizado.apply(lambda x: 1 if naive_bayes_predict(x, logprior, loglikelihood) > 0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_nb['prediccion'] = validation_nb.texto_normalizado.apply(lambda x: 1 if naive_bayes_predict(x, logprior, loglikelihood) > 0 else 0)


In [32]:
validation_nb['puntaje'] = validation_nb.texto_normalizado.apply(lambda x: naive_bayes_predict(x, logprior, loglikelihood))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_nb['puntaje'] = validation_nb.texto_normalizado.apply(lambda x: naive_bayes_predict(x, logprior, loglikelihood))


In [33]:
validation_nb

Unnamed: 0.1,Unnamed: 0,texto,sentimiento,texto_normalizado,prediccion,puntaje
0,7450,@marianorajoy Estamos muy satisfechos,1,satisfecho,1,0.729572
1,1332,Nada mejor que pasar la navidad con la familia...,1,mejor pasar navidad familia amigo uds pasar,1,3.391953
2,6436,Planeta creativo: nuevo placer otra interesant...,1,planeta creativo nuevo placer interesante jorn...,1,7.864978
3,8973,Necesito ir a mi hogar y dormir con mis dos hi...,0,necesitar ir hogar dormir dos hijo peludos kab...,0,-1.473601
4,803,En la XII edición Premios Culturas de Extremad...,1,xii edicion premio culturas extremadura galard...,1,5.551393
...,...,...,...,...,...,...
3408,8116,@nataliaprzc Con Liberación se vive mejor,1,liberacion vivir mejor,0,-0.255210
3409,15367,Bajando a Calahorra ya no queda nada para el c...,1,bajar calahorra quedar cambio junto salir adel...,1,2.081062
3410,10591,"Es que estos dolores, son bárbaros...",0,dolor barbaros,0,-2.043017
3411,2111,"#ChacónenLaSER : ""Me siento con capacidad y co...",1,chaconenlaser sentar capacidad equipo superman...,1,0.208470


In [36]:
len(validation_nb[validation_nb['sentimiento'] == validation_nb['prediccion']])

1795

In [38]:

cm = confusion_matrix(validation_nb['sentimiento'], validation_nb['prediccion'])

precision = precision_score(validation_nb['sentimiento'], validation_nb['prediccion'], average=None)
recall = recall_score(validation_nb['sentimiento'], validation_nb['prediccion'], average=None)
accuracy = accuracy_score(validation_nb['sentimiento'], validation_nb['prediccion'])

f1 = f1_score(validation_nb['sentimiento'], validation_nb['prediccion'], average=None)

print("Confusion Matrix:")
print(cm)
print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)

Confusion Matrix:
[[926 256]
 [260 869]]
Precision: [0.78077572 0.77244444]
Recall: [0.78341794 0.76970771]
Accuracy: 0.7767200346170489
F1 Score: [0.78209459 0.77107365]
