In [6]:
import pandas as pd 
import numpy as np 
import string
import re

In [48]:
#lectura del csv
DATA_FILE = 'Travel Tweets.csv'
data = pd.read_csv(DATA_FILE)
data.head(n=10)

Unnamed: 0,Tweet
0,Did the rainy cook really turn the travel? #大连...
1,4249570🥳🥳inside travel simply \n#颜色陪玩 #女喘 #文援 ...
2,Astounding -- 98% from one religion in FBI's t...
3,"I’m sorry, I’m a little bit high"
4,It's fascinating - I guess not too surprising ...
5,"""The core measure the RBA looks at is the ‘exc..."
6,A1: On the First Weekend of Summer We Took the...
7,Cheap Flights: Dallas to Madrid $586-$599 r/t ...
8,@NISAmerica @garrrzzz *Googles how to harness ...
9,"Let a 30 year old be a magical girl, let the m..."


In [49]:
data.shape

(200, 1)

In [50]:
# Text normalization / remove special characters / emojis 
def remove_emoji(string):
        emoji_pattern = re.compile("["
                        u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                        "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', string) 


def clean_tweet(text):
    if type(text) == float:
        return ""
    temp = text.lower() #to lowercase
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp)#@user
    temp = re.sub(r'@\w+', '', temp) #delete usernames mentions
    temp = re.sub("#","", temp)#hashtag
    temp = remove_emoji(temp)
    temp = re.sub(r'[^\w\s\d]', '', temp) #remove special characters
    temp = re.sub(r'(.)\1+', r'\1\1', temp)#duplicated
    temp = re.sub(r'http\S+', '', temp)#url
    temp = re.sub('[()!?]', ' ', temp)#parenthesis
    temp = re.sub('\[.*?\]',' ', temp)#square brackets
    temp = re.sub("[^a-z0-9]"," ", temp)#only alphanumeric
    temp = temp.strip() #extra white spaces
    return temp

data['Tweet'] = data['Tweet'].map(lambda x : clean_tweet(x))

In [51]:
data.head(n=10)

Unnamed: 0,Tweet
0,did the rainy cook really turn the travel
1,4249570inside travel simply
2,astounding 98 from one religion in fbis trave...
3,im sorry im a little bit high
4,its fascinating i guess not too surprising fo...
5,the core measure the rba looks at is the excl ...
6,a1 on the first weekend of summer we took the ...
7,cheap flights dallas to madrid 586599 rt augno...
8,googles how to harness orbal energy to time tr...
9,let a 30 year old be a magical girl let the mi...


In [52]:
#Spell-Checking  ---pip install pyspellchecker--- most heavy cell bc the ammount of tweets
from spellchecker import SpellChecker

# Crear una instancia del corrector ortográfico
spell = SpellChecker()

def corrected_tweet(tweet):
    # Corrección ortográfica del tweet
    corrected_tweet = [spell.correction(token) for token in data['Tweet']]
    return corrected_tweet

data['Tweet'] = data['Tweet'].apply(corrected_tweet)

data.head(10)

KeyboardInterrupt: 

In [53]:
#Tokenization and stemming
import nltk
from nltk.corpus import stopwords #list of void words
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

# Descargar la lista de palabras vacías en inglés
nltk.download('stopwords')
nltk.download('punkt')

# Tokenizador específico para tweets
tokenizer = TweetTokenizer()
# Stemmer para inglés
stemmer = PorterStemmer()
# Obtener la lista de palabras vacías en inglés
stopwords_en = set(stopwords.words('english'))

# Tokenizar y filtrar palabras vacías en los tweets
def token_tweet(tweet):
    tokens = tokenizer.tokenize(tweet)
    filtered_tokens = [token for token in tokens if token.lower() not in stopwords_en]
    
    return filtered_tokens

#Stemming 
def stemmer_token(filtered_tokens):
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return filtered_tokens,stemmed_tokens

# Aplicar el procesamiento a la columna 'Tweet' y guardar los tokens en 'tokens'
data['tokens'] = data['Tweet'].apply(token_tweet)
data['stemmed_tokens'] = data['tokens'].apply(stemmer_token)
# Imprimir el DataFrame resultante
data.head(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Tweet,tokens,stemmed_tokens
0,did the rainy cook really turn the travel,"[rainy, cook, really, turn, travel]","([rainy, cook, really, turn, travel], [raini, ..."
1,4249570inside travel simply,"[4249570, inside, travel, simply]","([4249570, inside, travel, simply], [4249570, ..."
2,astounding 98 from one religion in fbis trave...,"[astounding, 98, one, religion, fbis, travel, ...","([astounding, 98, one, religion, fbis, travel,..."
3,im sorry im a little bit high,"[im, sorry, im, little, bit, high]","([im, sorry, im, little, bit, high], [im, sorr..."
4,its fascinating i guess not too surprising fo...,"[fascinating, guess, surprising, international...","([fascinating, guess, surprising, internationa..."
5,the core measure the rba looks at is the excl ...,"[core, measure, rba, looks, excl, fuel, fruit,...","([core, measure, rba, looks, excl, fuel, fruit..."
6,a1 on the first weekend of summer we took the ...,"[a1, first, weekend, summer, took, south, bass...","([a1, first, weekend, summer, took, south, bas..."
7,cheap flights dallas to madrid 586599 rt augno...,"[cheap, flights, dallas, madrid, 586599, rt, a...","([cheap, flights, dallas, madrid, 586599, rt, ..."
8,googles how to harness orbal energy to time tr...,"[googles, harness, orbal, energy, time, travel...","([googles, harness, orbal, energy, time, trave..."
9,let a 30 year old be a magical girl let the mi...,"[let, 30, year, old, magical, girl, let, middl...","([let, 30, year, old, magical, girl, let, midd..."


In [54]:
data['is_relevant'] = True

# Lista de palabras clave
keywords = ['offer', 'destination', 'guide', 'discount', 'package', 'sale']

# Función para verificar si un tweet es relevante basado en palabras clave
def tweet_relevant(tweet):
    for keyword in keywords:
        if keyword in tweet:
            return False
    return True

# Marca los tweets relevantes como True en la columna 'is_relevant'
data['is_relevant'] = data['Tweet'].apply(tweet_relevant)

# Filtra los tweets relevantes
relevant_tweets = data[data['is_relevant']]


# Imprime los tweets relevantes
data.head(10)

Unnamed: 0,Tweet,tokens,stemmed_tokens,is_relevant
0,did the rainy cook really turn the travel,"[rainy, cook, really, turn, travel]","([rainy, cook, really, turn, travel], [raini, ...",True
1,4249570inside travel simply,"[4249570, inside, travel, simply]","([4249570, inside, travel, simply], [4249570, ...",True
2,astounding 98 from one religion in fbis trave...,"[astounding, 98, one, religion, fbis, travel, ...","([astounding, 98, one, religion, fbis, travel,...",True
3,im sorry im a little bit high,"[im, sorry, im, little, bit, high]","([im, sorry, im, little, bit, high], [im, sorr...",True
4,its fascinating i guess not too surprising fo...,"[fascinating, guess, surprising, international...","([fascinating, guess, surprising, internationa...",True
5,the core measure the rba looks at is the excl ...,"[core, measure, rba, looks, excl, fuel, fruit,...","([core, measure, rba, looks, excl, fuel, fruit...",True
6,a1 on the first weekend of summer we took the ...,"[a1, first, weekend, summer, took, south, bass...","([a1, first, weekend, summer, took, south, bas...",True
7,cheap flights dallas to madrid 586599 rt augno...,"[cheap, flights, dallas, madrid, 586599, rt, a...","([cheap, flights, dallas, madrid, 586599, rt, ...",True
8,googles how to harness orbal energy to time tr...,"[googles, harness, orbal, energy, time, travel...","([googles, harness, orbal, energy, time, trave...",True
9,let a 30 year old be a magical girl let the mi...,"[let, 30, year, old, magical, girl, let, middl...","([let, 30, year, old, magical, girl, let, midd...",True


In [55]:
data.shape

(200, 4)

In [56]:
#deleting irrelevant tweets
data = data[data['is_relevant'] == True]
data.shape

(198, 4)

In [61]:
# Crear pares de entrada-salida
# Seleccionar las columnas relevantes
datainputs = data[["Tweet", "stemmed_tokens"]]

# Renombrar las columnas
datainputs.columns = ["input_text", "output_tokens"]


# Crear una lista de pares de entrada-salida
pairs = datainputs.values.tolist()

# Visualizar los primeros 5 pares
datainputs.head(10)







Unnamed: 0,input_text,output_tokens
0,did the rainy cook really turn the travel,"([rainy, cook, really, turn, travel], [raini, ..."
1,4249570inside travel simply,"([4249570, inside, travel, simply], [4249570, ..."
2,astounding 98 from one religion in fbis trave...,"([astounding, 98, one, religion, fbis, travel,..."
3,im sorry im a little bit high,"([im, sorry, im, little, bit, high], [im, sorr..."
4,its fascinating i guess not too surprising fo...,"([fascinating, guess, surprising, internationa..."
5,the core measure the rba looks at is the excl ...,"([core, measure, rba, looks, excl, fuel, fruit..."
6,a1 on the first weekend of summer we took the ...,"([a1, first, weekend, summer, took, south, bas..."
7,cheap flights dallas to madrid 586599 rt augno...,"([cheap, flights, dallas, madrid, 586599, rt, ..."
8,googles how to harness orbal energy to time tr...,"([googles, harness, orbal, energy, time, trave..."
9,let a 30 year old be a magical girl let the mi...,"([let, 30, year, old, magical, girl, let, midd..."


In [62]:
#data split
from sklearn.model_selection import train_test_split
#train =70%
#test = 15%
# val = 15%

# División en conjunto de entrenamiento, validación y prueba
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Verificar las dimensiones de los conjuntos
print("Tamaño del conjunto de entrenamiento:", train_data.shape[0])
print("Tamaño del conjunto de validación:", val_data.shape[0])
print("Tamaño del conjunto de prueba:", test_data.shape[0])

Tamaño del conjunto de entrenamiento: 138
Tamaño del conjunto de validación: 30
Tamaño del conjunto de prueba: 30
