In [45]:
# Install dependencies as needed:
%pip install kagglehub[pandas-datasets]
%pip install polars
%pip install colorstreak
%pip install nltk



In [46]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import polars as pl
from colorstreak import Logger as log



# Set the path to the file you'd like to load
file_path = "elon_musk_tweets.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "gpreda/elon-musk-tweets",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)



# Convert to Polars DataFrame
lazy_frame_elon = (
    pl.from_pandas(df)
    .lazy()
)






  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'elon-musk-tweets' dataset.


In [47]:
df_frame_test = lazy_frame_elon.limit(5).collect()
columndas = df_frame_test.columns
log.info(f"Columns in the DataFrame: {columndas}")
log.info(f"First 5 rows of the DataFrame:\n{df_frame_test['text']}")


for tweet in df_frame_test['text']:
    log.info(f"Tweet: {tweet}")


[94m[INFO] Columns in the DataFrame: ['id', 'user_name', 'user_location', 'user_description', 'user_created', 'user_followers', 'user_friends', 'user_favourites', 'user_verified', 'date', 'text', 'hashtags', 'source', 'retweets', 'favorites', 'is_retweet'][0m
[94m[INFO] First 5 rows of the DataFrame:
shape: (5,)
Series: 'text' [str]
[
	"@BillyM2k I find the gold toe …
	"Sock Con, the conference for s…
	"Always something new for the m…
	"@ExplainThisBob This guy gets …
	"Sock tech is so advanced that …
][0m
[94m[INFO] Tweet: @BillyM2k I find the gold toe sock – inevitably off kilter &amp; washed out – a little troubling esthetically &amp; arguably a bit corpo[0m
[94m[INFO] Tweet: Sock Con, the conference for socks[0m
[94m[INFO] Tweet: Always something new for the magazine cover and the articles practically write themselves[0m
[94m[INFO] Tweet: @ExplainThisBob This guy gets it[0m
[94m[INFO] Tweet: Sock tech is so advanced that you can get pretty much anything in sock form th

In [48]:
from nltk.tokenize import TweetTokenizer
import re


def limpiart_tweet(tweet, mentions=True, hashtags=True, special_chars=True, urls=True):
    # Remove URLs
    if urls:
        tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    # Remove mentions
    if mentions:
        tweet = re.sub(r'@\w+', '', tweet)
    # Remove hashtags
    if hashtags:
        tweet = re.sub(r'#\w+', '', tweet)
    # Remove special characters
    if special_chars:
        tweet = re.sub(r'[^A-Za-z0-9\s]+', '', tweet)
    tweet = tweet.lower()
    return tweet.strip()


def tokenize_tweet(tweet, mentions=True, hashtags=True, special_chars=True, urls=True):
    tokenizer = TweetTokenizer()
    tweet_limpio = limpiart_tweet(tweet, mentions=mentions, hashtags=hashtags, special_chars=special_chars, urls=urls)
    tokens = tokenizer.tokenize(tweet_limpio)
    return tokens

In [49]:
for tweet in df_frame_test['text']:
    tokens = tokenize_tweet(tweet, urls=False)
    log.info(f"Tokens: {tokens}")

[94m[INFO] Tokens: ['i', 'find', 'the', 'gold', 'toe', 'sock', 'inevitably', 'off', 'kilter', 'amp', 'washed', 'out', 'a', 'little', 'troubling', 'esthetically', 'amp', 'arguably', 'a', 'bit', 'corpo'][0m
[94m[INFO] Tokens: ['sock', 'con', 'the', 'conference', 'for', 'socks'][0m
[94m[INFO] Tokens: ['always', 'something', 'new', 'for', 'the', 'magazine', 'cover', 'and', 'the', 'articles', 'practically', 'write', 'themselves'][0m
[94m[INFO] Tokens: ['this', 'guy', 'gets', 'it'][0m
[94m[INFO] Tokens: ['sock', 'tech', 'is', 'so', 'advanced', 'that', 'you', 'can', 'get', 'pretty', 'much', 'anything', 'in', 'sock', 'form', 'these', 'days'][0m


In [50]:
tweets_elon = (
    lazy_frame_elon
    .with_columns(
        pl.col("text")
        .map_elements(lambda tweet: tokenize_tweet(tweet, urls=False))
        .alias("tokens")
    )
    .collect()
    .get_column("tokens")
    .to_list()
)

for i, tweet in enumerate(tweets_elon):
    log.info(f"[{i+1}]Tokens: {tweet}")

  .collect()


[94m[INFO] [1]Tokens: ['i', 'find', 'the', 'gold', 'toe', 'sock', 'inevitably', 'off', 'kilter', 'amp', 'washed', 'out', 'a', 'little', 'troubling', 'esthetically', 'amp', 'arguably', 'a', 'bit', 'corpo'][0m
[94m[INFO] [2]Tokens: ['sock', 'con', 'the', 'conference', 'for', 'socks'][0m
[94m[INFO] [3]Tokens: ['always', 'something', 'new', 'for', 'the', 'magazine', 'cover', 'and', 'the', 'articles', 'practically', 'write', 'themselves'][0m
[94m[INFO] [4]Tokens: ['this', 'guy', 'gets', 'it'][0m
[94m[INFO] [5]Tokens: ['sock', 'tech', 'is', 'so', 'advanced', 'that', 'you', 'can', 'get', 'pretty', 'much', 'anything', 'in', 'sock', 'form', 'these', 'days'][0m
[94m[INFO] [6]Tokens: ['i', 'must', 'confess', 'to', 'a', 'penchant', 'for', 'creative', 'socks'][0m
[94m[INFO] [7]Tokens: ['its', 'time'][0m
[94m[INFO] [8]Tokens: ['his', 'success', 'was', 'in', 'fact', 'due', 'in', 'part', 'because', 'he', 'was', 'super', 'fun', 'at', 'parties', 'spoke', 'and', 'wrote', 'incredibly', 'wel