In [3]:
import pandas as pd

# Load the dataset from 'Tweets.csv' (path is relative to the working directory).
df = pd.read_csv('Tweets.csv')

# Show the first rows of the dataset as the cell output.
df.head()


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
# Inspect the column labels of the dataset.
df.columns

Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')

In [5]:
import re


# Replace NaN values in the 'text' column with empty strings (overwrites the column in place).
df['text'] = df['text'].fillna('')

# Text-cleaning helper applied to each raw tweet.
def clean_text(text):
    """Normalize a raw tweet into lowercase, space-separated alphabetic words.

    Steps, in order: lowercase; remove URLs (anything starting with ``http`` up
    to the next whitespace), @mentions and #hashtags; delete every character
    that is not an ASCII letter or whitespace; collapse whitespace runs into a
    single space and trim both ends.

    Args:
        text: Raw tweet. Any non-string value (e.g. NaN) is treated as missing.

    Returns:
        The cleaned string, or '' when `text` is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'http\S+', '', text)      # URLs
    text = re.sub(r'@\w+', '', text)         # @mentions
    text = re.sub(r'#\w+', '', text)         # hashtags (the whole tag is dropped)
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # digits, punctuation, symbols
    # The removals above leave doubled / leading / trailing spaces behind
    # (e.g. "sons of  why"); normalize them so the output is clean.
    return ' '.join(text.split())

# Apply the cleaning function to every value of the 'text' column and store the result.
df['cleaned_text'] = df['text'].apply(clean_text)

# Print the first rows after cleaning.
print(df.head())



       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment  \
0  I`d have responded, if I were going   neutral   
1                             Sooo SAD  negative   
2                          bullying me  negative   
3                       leave me alone  negative   
4                        Sons of ****,  negative   

                                        cleaned_text  
0                  id have responded if i were going  
1         sooo sad i will miss you here in san diego  
2                             my boss is bullying me  
3                      what interview leave me alone  
4  

In [6]:
import nltk
# Download the 'punkt' package into a project-specific folder.
# NOTE(review): the absolute Windows path is machine-specific; consider a configurable or relative path.
nltk.download('punkt', download_dir=r"C:\Users\admin\Documents\David\Proyecto_IA\nltk_data")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\Documents\David\Proyecto_IA\nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Download the 'punkt', 'stopwords' and 'wordnet' packages into the same project folder.
# ('punkt' is requested again here; the previous cell already downloads it.)
nltk.download('punkt', download_dir=r"C:\Users\admin\Documents\David\Proyecto_IA\nltk_data")
nltk.download('stopwords', download_dir=r"C:\Users\admin\Documents\David\Proyecto_IA\nltk_data")
nltk.download('wordnet', download_dir=r"C:\Users\admin\Documents\David\Proyecto_IA\nltk_data")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\Documents\David\Proyecto_IA\nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\Documents\David\Proyecto_IA\nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\Documents\David\Proyecto_IA\nltk_data..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Print the list of directories NLTK searches when loading data.
print(nltk.data.path)



['C:\\Users\\admin/nltk_data', 'c:\\Python313\\nltk_data', 'c:\\Python313\\share\\nltk_data', 'c:\\Python313\\lib\\nltk_data', 'C:\\Users\\admin\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [9]:
import nltk
import os
# NOTE(review): `os` is not used by any cell visible here.
# Project-local NLTK data directory (same folder as the earlier downloads, written with forward slashes).
ruta_personalizada = "C:/Users/admin/Documents/David/Proyecto_IA/nltk_data"
# Download the 'punkt_tab' package into that directory.
nltk.download('punkt_tab', download_dir=ruta_personalizada)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:/Users/admin/Documents/David/Proyecto_IA/nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
# Imports for this cell. `word_tokenize` is imported unconditionally because the
# preprocessing cell further down calls it without importing it itself.
from nltk.data import find
from nltk.tokenize import word_tokenize

# Make NLTK search only the project-local data directory.
# (clear + append keeps the cell idempotent when it is re-run.)
nltk.data.path.clear()
nltk.data.path.append(ruta_personalizada)

# Check that NLTK can locate the punkt model; download it on a miss.
try:
    print("NLTK encontró:", find('tokenizers/punkt/english.pickle'))
except LookupError:
    print("No se encontró punkt. Intentando descargarlo...")
    nltk.download('punkt', download_dir=ruta_personalizada)
    print("Reintentando...")

# Smoke-test the tokenizer on a sample tweet. This runs once regardless of
# which branch above was taken (it used to be duplicated in both branches).
text = "I`d have responded, if I were going"
tokens = word_tokenize(text)
print("Tokens:", tokens)





NLTK encontró: C:\Users\admin\Documents\David\Proyecto_IA\nltk_data\tokenizers\punkt\english.pickle
Tokens: ['I', '`', 'd', 'have', 'responded', ',', 'if', 'I', 'were', 'going']


In [11]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Build the English stopword set and the WordNet lemmatizer once, for reuse by preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text processing helper: tokenize, drop stopwords / non-alphabetic tokens, lemmatize.
def preprocess_text(text):
    """Return `text` reduced to the space-joined lemmas of its content words.

    The text is split with `word_tokenize`; a token is kept only when it is
    purely alphabetic and not in `stop_words`; every kept token is passed
    through `lemmatizer.lemmatize` with its default settings.
    """
    lemmas = []
    for token in word_tokenize(text):
        if token.isalpha() and token not in stop_words:
            lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Apply the preprocessing to the cleaned tweets and store the result in a new column.
df['lemmatized_text'] = df['cleaned_text'].apply(preprocess_text)

# Print the raw, cleaned and lemmatized versions of the first rows.
print(df[['text', 'cleaned_text', 'lemmatized_text']].head())


                                                text  \
0                I`d have responded, if I were going   
1      Sooo SAD I will miss you here in San Diego!!!   
2                          my boss is bullying me...   
3                     what interview! leave me alone   
4   Sons of ****, why couldn`t they put them on t...   

                                        cleaned_text  \
0                  id have responded if i were going   
1         sooo sad i will miss you here in san diego   
2                             my boss is bullying me   
3                      what interview leave me alone   
4   sons of  why couldnt they put them on the rel...   

                          lemmatized_text  
0                      id responded going  
1                 sooo sad miss san diego  
2                            bos bullying  
3                   interview leave alone  
4  son couldnt put release already bought  


In [None]:
import random

# Build a list of (text, label) pairs.
data = list(zip(df['lemmatized_text'], df['sentiment']))

# Shuffle with a fixed seed so the split, and therefore the reported accuracy,
# is the same on every run. A dedicated Random instance leaves the global
# random state untouched.
SEED = 42
random.Random(SEED).shuffle(data)

# 80% training, 20% test.
split_idx = int(0.8 * len(data))
train_data = data[:split_idx]
test_data = data[split_idx:]


In [34]:
from collections import defaultdict
import math

# Counting structures filled from the training split.
class_counts = defaultdict(int)                       # label -> number of documents
word_counts = defaultdict(lambda: defaultdict(int))   # label -> word -> occurrences
vocab = set()                                         # every word seen in training

# Training pass: tally documents per class and word occurrences per class.
for doc_text, doc_label in train_data:
    class_counts[doc_label] += 1
    for token in doc_text.split():
        word_counts[doc_label][token] += 1
        vocab.add(token)

# Total number of training documents.
total_docs = sum(class_counts.values())

# Log prior per class and log P(word | class) with Laplace (add-one) smoothing.
log_probs = {}
class_priors = {}

for doc_label, n_docs in class_counts.items():
    # Smoothed denominator: words observed in this class plus one per vocabulary entry.
    denominator = sum(word_counts[doc_label].values()) + len(vocab)
    class_priors[doc_label] = math.log(n_docs / total_docs)
    log_probs[doc_label] = {
        token: math.log((word_counts[doc_label][token] + 1) / denominator)
        for token in vocab
    }


In [35]:
def predict(text):
    """Return the label with the highest Naive Bayes log score for `text`.

    The text is split on whitespace and words not in `vocab` are ignored. For
    each label the score is its entry in `class_priors` plus the matching
    `log_probs` entry of every remaining word (repeated words count each time).

    Args:
        text: Whitespace-separated, already preprocessed text.

    Returns:
        The best-scoring label; on ties, the one that comes first in `class_priors`.

    Raises:
        ValueError: if `class_priors` is empty.
    """
    # Filter once instead of once per label.
    known_words = [word for word in text.split() if word in vocab]
    scores = {}

    for label, prior in class_priors.items():
        label_log_probs = log_probs[label]
        # Smoothed log-probability for a vocab word missing from this label's
        # table. It used to be recomputed (summing every word count) for each
        # word even when unused; now it is computed at most once per label.
        fallback = None
        score = prior
        for word in known_words:
            word_log_prob = label_log_probs.get(word)
            if word_log_prob is None:
                if fallback is None:
                    fallback = math.log(1 / (sum(word_counts[label].values()) + len(vocab)))
                word_log_prob = fallback
            score += word_log_prob
        scores[label] = score

    return max(scores, key=scores.get)


In [36]:
# Score the classifier on the held-out split: fraction of exact label matches.
total = len(test_data)
correct = sum(1 for text, label in test_data if predict(text) == label)

accuracy = correct / total
print(f"Precisión del modelo Naive Bayes: {accuracy:.2f}")


Precisión del modelo Naive Bayes: 0.64


In [32]:
# Count the rows per sentiment label (class balance check).
df['sentiment'].value_counts()


sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

In [None]:
import random
import math
from collections import defaultdict

# Small hand-written stopword list (add more words if needed).
# NOTE(review): this rebinds `stopwords`, which an earlier cell imports from
# nltk.corpus, and the function below redefines the earlier `clean_text`.
stopwords = {"the", "and", "is", "to", "it", "of", "a", "in", "that", "for", "on", "with", "as", "was", "at", "by"}

# Text-cleaning helper: lowercase and remove stopwords.
def clean_text(text):
    """Lowercase `text` and drop the words listed in `stopwords`.

    Words are lowercased BEFORE the stopword check, so capitalised stopwords
    such as "The" or "IS" are removed too (they used to slip through because
    the list is all lowercase).

    Args:
        text: Raw tweet. Any non-string value (e.g. NaN) is treated as missing.

    Returns:
        The remaining lowercase words joined by single spaces; '' for
        non-string input.
    """
    if not isinstance(text, str):
        return ''
    return ' '.join(word for word in text.lower().split() if word not in stopwords)

# Load the dataset (replaces any `df` built by earlier cells).
df = pd.read_csv('Tweets.csv')

# Preprocess the data. Missing tweets are turned into '' first so that
# clean_text always receives a string.
df['cleaned_text'] = df['text'].fillna('').apply(clean_text)
df['lemmatized_text'] = df['cleaned_text']  # no lemmatization here; it could be applied at this point

# Build a list of (text, label) pairs.
data = list(zip(df['lemmatized_text'], df['sentiment']))

# Shuffle with a fixed seed so the split, and therefore the reported accuracy,
# is the same on every run. A dedicated Random instance leaves the global
# random state untouched.
SEED = 42
random.Random(SEED).shuffle(data)

# 80% training, 20% test.
split_idx = int(0.8 * len(data))
train_data = data[:split_idx]
test_data = data[split_idx:]

# Counting structures filled from the training split.
class_counts = defaultdict(int)                       # label -> number of documents
word_counts = defaultdict(lambda: defaultdict(int))   # label -> word -> occurrences
vocab = set()                                         # every word seen in training

# Training pass: tally documents per class and word occurrences per class.
for doc_text, doc_label in train_data:
    class_counts[doc_label] += 1
    for token in doc_text.split():
        word_counts[doc_label][token] += 1
        vocab.add(token)

# Total number of training documents.
total_docs = sum(class_counts.values())

# Log prior per class and log P(word | class) with Laplace (add-one) smoothing.
log_probs = {}
class_priors = {}

for doc_label, n_docs in class_counts.items():
    # Smoothed denominator: words observed in this class plus one per vocabulary entry.
    denominator = sum(word_counts[doc_label].values()) + len(vocab)
    class_priors[doc_label] = math.log(n_docs / total_docs)
    log_probs[doc_label] = {
        token: math.log((word_counts[doc_label][token] + 1) / denominator)
        for token in vocab
    }

def predict(text, minority_threshold=0.2, minority_boost=1.5):
    """Return the best-scoring label for `text`, favouring minority classes.

    The text is split on whitespace and words not in `vocab` are ignored. For
    each label the score is its entry in `class_priors` plus the matching
    `log_probs` entry of every remaining word. A label whose document count in
    `class_counts` is below `minority_threshold * total_docs` then receives a
    bonus of ``log(minority_boost)``.

    The scores are logarithms of probabilities, i.e. negative numbers, so the
    previous ``score *= 1.5`` pushed minority classes further down. Adding
    ``log(1.5)`` corresponds to multiplying the underlying probability by 1.5,
    which is the intended boost.

    Args:
        text: Whitespace-separated, already preprocessed text.
        minority_threshold: Share of training documents below which a class
            counts as a minority class.
        minority_boost: Multiplicative boost for minority classes; must be > 0.

    Returns:
        The best-scoring label; on ties, the one that comes first in `class_priors`.

    Raises:
        ValueError: if `class_priors` is empty or `minority_boost` is not positive.
    """
    # Filter once instead of once per label.
    known_words = [word for word in text.split() if word in vocab]
    boost = math.log(minority_boost)
    scores = {}

    for label, prior in class_priors.items():
        label_log_probs = log_probs[label]
        # Smoothed log-probability for a vocab word missing from this label's
        # table; computed at most once per label, and only if actually needed.
        fallback = None
        score = prior
        for word in known_words:
            word_log_prob = label_log_probs.get(word)
            if word_log_prob is None:
                if fallback is None:
                    fallback = math.log(1 / (sum(word_counts[label].values()) + len(vocab)))
                word_log_prob = fallback
            score += word_log_prob

        # Adjust for class imbalance: raise the score of minority classes.
        if class_counts[label] < total_docs * minority_threshold:
            score += boost
        scores[label] = score

    return max(scores, key=scores.get)

# Model evaluation: fraction of test examples whose predicted label matches the true one.
total = len(test_data)
correct = sum(1 for text, label in test_data if predict(text) == label)

accuracy = correct / total
print(f"Precisión del modelo Naive Bayes: {accuracy:.2f}")



Precisión del modelo Naive Bayes: 0.64
