In [1]:
# Run once if `import demoji` below raises ModuleNotFoundError
# !pip install demoji

In [4]:
# import nltk
# nltk.download("stopwords")
# nltk.download("wordnet")

import polars as pl
from nltk.corpus import stopwords
import re
import demoji

ModuleNotFoundError: ignored

In [None]:
# Read the tweet dataset, then fetch NLTK's English stopword list
df = pl.read_csv("dataset.csv")

ensw = stopwords.words("english")

In [None]:
# Keep only the part of each ID after the first "-" and store it as an integer
def _id_suffix(raw_id):
  """Return the second "-"-separated field of ``raw_id``."""
  return raw_id.split("-")[1]

numeric_id = pl.col("ID").apply(_id_suffix).cast(pl.Int32)
df = df.select([
    numeric_id,
    pl.exclude("ID"),
])

In [None]:
# Keep only rows that have both a tweet and a tweet URL
has_tweet = pl.col("Tweet").is_not_null()
has_tweet_url = pl.col("Tweet URL").is_not_null()
df = df.filter(has_tweet & has_tweet_url)

In [None]:
# Narrow the frame to the columns used downstream;
# "Tweet Translated" is renamed to "Translated"
translated = pl.col("Tweet Translated").alias("Translated")
df = df.select([
    pl.col("ID"),
    pl.col("Tweet"),
    translated,
])

In [None]:
# Start the "Clean" column as a lowercased copy of the translated tweet
def _to_lowercase(tweet):
  """Return ``tweet`` converted to lowercase."""
  return tweet.lower()

clean_lowercased = pl.col("Translated").apply(_to_lowercase).alias("Clean")
df = df.select(pl.all(), clean_lowercased)

In [None]:
# Remove hashtags ("#" followed by one or more word characters)
# A raw string is used so that "\w" reaches the regex engine verbatim; in a
# normal string literal it is an invalid escape sequence and triggers a
# DeprecationWarning / SyntaxWarning on recent Python versions.
HASHTAG_PATTERN = re.compile(r"#\w+")

df = df.select(
    pl.exclude("Clean"),
    pl.col("Clean").apply(lambda tweet: HASHTAG_PATTERN.sub('', tweet))
)

In [None]:
# Replace every emoji with a word derived from its description
def emoji_to_word(tweet):
  """Replace each emoji in ``tweet`` with a plain alphanumeric token.

  ``demoji.findall`` is used to obtain a mapping of emoji -> description.
  Each description is lowercased and stripped of every character outside
  ``[0-9a-z]`` (e.g. "flag: Philippines" becomes "flagphilippines"), and
  the emoji is replaced by that token followed by a single space.

  Args:
    tweet: The text to process.

  Returns:
    The text with every detected emoji replaced by its token.
  """
  found = demoji.findall(tweet)
  # Longest symbols first, so a multi-codepoint emoji is replaced before any
  # shorter emoji that is a substring of it
  for symbol in sorted(found, key=len, reverse=True):
    interpretation = found[symbol].lower()
    # Turn "flag: philippines" into "flagphilippines"
    interpretation = re.sub('[^0-9a-z]+', '', interpretation)
    # Literal replacement rather than re.sub: the emoji is plain text, not a
    # regex, and some emoji (e.g. the keycap "*") contain regex metacharacters
    tweet = tweet.replace(symbol, interpretation + ' ')
  return tweet

# Run the emoji conversion over every value of the "Clean" column
clean_without_emoji = pl.col("Clean").apply(emoji_to_word)
df = df.select(pl.exclude("Clean"), clean_without_emoji)

In [None]:
# Collapse every run of characters outside 0-9 / a-z into a single space
def _keep_alphanumeric(tweet):
  """Return ``tweet`` with each non-[0-9a-z] run replaced by one space."""
  return re.sub('[^0-9a-z]+', ' ', tweet)

clean_alphanumeric = pl.col("Clean").apply(_keep_alphanumeric)
df = df.select(pl.exclude("Clean"), clean_alphanumeric)

In [None]:
# Split each cleaned tweet on whitespace into a list-of-strings column
def _tokenize(tweet):
  """Return the whitespace-separated words of ``tweet`` as a list."""
  return tweet.split()

tokenized = pl.col("Clean").apply(_tokenize).cast(pl.List(str)).alias("Tokenized")
df = df.select(pl.all(), tokenized)

In [3]:
# Drop every token that appears in the English stopword list
not_a_stopword = ~pl.element().is_in(ensw)
stopwords_removed = (
    pl.col("Tokenized")
    .arr.eval(pl.element().filter(not_a_stopword), parallel=True)
    .alias("Stopwords Removed")
)
df = df.select(pl.all(), stopwords_removed)

NameError: ignored

In [None]:
# Add stemmed and lemmatized versions of the stopword-free tokens.
from nltk.stem import PorterStemmer, WordNetLemmatizer

# One shared stemmer / lemmatizer instance for all rows
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def _stem_all(words):
  """Return the Porter stem of every token in the row's list."""
  return list(map(stemmer.stem, words.to_list()))

def _lemmatize_all(words):
  """Return the WordNet lemma of every token in the row's list."""
  return list(map(lemmatizer.lemmatize, words.to_list()))

stemmed = pl.col("Stopwords Removed").apply(_stem_all).alias("Stemmed")
lemmatized = pl.col("Stopwords Removed").apply(_lemmatize_all).alias("Lemmatized")
df = df.select(pl.all(), stemmed, lemmatized)

In [None]:
# Show the processed frame (plain-text table) before it is serialized to CSV
print(df)
def serialize(arr):
  """Join the string items of ``arr`` into one space-separated string."""
  separator = " "
  return separator.join(arr)
# Flatten every list column into a space-separated string and export the
# result to "clean.csv"; the first four columns are passed through unchanged
passthrough_columns = [
    pl.col(name) for name in ("ID", "Tweet", "Translated", "Clean")
]
flattened_columns = [
    pl.col(name).apply(serialize).cast(str)
    for name in ("Tokenized", "Stopwords Removed", "Stemmed", "Lemmatized")
]
df_serialized = df.select(passthrough_columns + flattened_columns)
df_serialized.write_csv("clean.csv")

shape: (153, 8)
┌─────┬─────────────┬─────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ ID  ┆ Tweet       ┆ Translated  ┆ Clean      ┆ Tokenized  ┆ Stopwords  ┆ Stemmed    ┆ Lemmatized │
│ --- ┆ ---         ┆ ---         ┆ ---        ┆ ---        ┆ Removed    ┆ ---        ┆ ---        │
│ i32 ┆ str         ┆ str         ┆ str        ┆ list[str]  ┆ ---        ┆ list[str]  ┆ list[str]  │
│     ┆             ┆             ┆            ┆            ┆ list[str]  ┆            ┆            │
╞═════╪═════════════╪═════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
│ 1   ┆ Cory        ┆ Cory        ┆ cory       ┆ ["cory",   ┆ ["cory",   ┆ ["cori",   ┆ ["cory",   │
│     ┆ Aquino:     ┆ Aquino:     ┆ aquino     ┆ "aquino",  ┆ "aquino",  ┆ "aquino",  ┆ "aquino",  │
│     ┆ - Duly      ┆ - Duly      ┆ duly       ┆ … "fact"]  ┆ … "fact"]  ┆ … "fact"]  ┆ … "fact"]  │
│     ┆ elected Pr… ┆ elected     ┆ elected    ┆            ┆            