# FIFA World Cup 2022 Tweets - Limpieza y Preprocesamiento con NLTK

In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')  # Esto normalmente no es necesario, pero lo forzamos por el error

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
import pandas as pd
import csv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Descargar recursos
nltk.download('punkt')
nltk.download('stopwords')

# Leer CSV
df = pd.read_csv(
    "fifa_world_cup_2022_tweets.csv",
    encoding="ISO-8859-1",
    sep=",",
    quoting=csv.QUOTE_MINIMAL,
    on_bad_lines="skip"
)

# Renombrar columnas
df.columns = ['index', 'Date Created', 'Number of Likes', 'Source of Tweet', 'Tweet', 'Sentiment_raw']

# Extraer sentimiento
df['Sentiment'] = df['Sentiment_raw'].str.extract(r'(positive|neutral|negative)', expand=False)
df = df.drop(columns=['index', 'Sentiment_raw'])

# Configurar stopwords
stop_words = set(stopwords.words('english'))

# Función de limpieza
def clean_text(text):
    if isinstance(text, str):
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t.isalpha()]
        tokens = [t for t in tokens if t not in stop_words]
        return " ".join(tokens)
    return ""

# Aplicar limpieza
df['clean_tweet'] = df['Tweet'].apply(clean_text)

# Vista previa
print(df[['clean_tweet', 'Sentiment']].sample(5))

# Guardar CSV limpio
df.to_csv("fifa_tweets_clean.csv", index=False)
print("✅ Archivo limpio guardado como fifa_tweets_clean.csv")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


      clean_tweet Sentiment
26716                   NaN
34181                   NaN
7981                    NaN
3688                    NaN
48879                   NaN
✅ Archivo limpio guardado como fifa_tweets_clean.csv
