In [8]:
### 1.	Text Cleaning

In [11]:
import re
import string
from bs4 import BeautifulSoup
import pandas as pd
import os

# Locations of the combined train/dev/test splits.
train_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/Combined/combined_train.csv"
dev_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/Combined/combined_dev.csv"
test_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/Combined/combined_test.csv"

# Read each split into its own DataFrame (train, then dev, then test).
train_df, dev_df, test_df = map(pd.read_csv, (train_csv, dev_csv, test_csv))

# Every ASCII punctuation character except '#' and '@', which are preserved.
punct_to_remove = "".join(ch for ch in string.punctuation if ch not in "#@")

def clean_text(text):
    """
    Normalise one raw tweet before tokenization.

    Steps, in order: strip HTML markup, lowercase, drop digits, drop
    punctuation (everything in ``punct_to_remove``, i.e. all ASCII
    punctuation except '#' and '@'), collapse whitespace runs, and trim.

    Parameters
    ----------
    text : str or null-like
        Raw tweet. Null values (``None``/``NaN``) yield an empty string;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned tweet, possibly empty.
    """
    if pd.isnull(text):
        return ""
    # HTML has to be parsed BEFORE punctuation is stripped: once '<', '>',
    # '&' and ';' are removed the parser cannot recognise tags or entities
    # any more, and their names would leak into the text as ordinary words.
    # A space separator keeps words on either side of a tag from fusing.
    text = BeautifulSoup(str(text), "html.parser").get_text(separator=" ")
    text = text.lower()
    # NOTE(review): informal Twi spelling presumably uses '3' for 'ɛ' (and
    # ')' for 'ɔ'); dropping digits/punctuation would then truncate such
    # words — confirm against the raw data before relying on this step.
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', punct_to_remove))
    # Collapse whitespace last so gaps left by the removals above are merged.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean the tweet column of every split, overwriting the raw text.
for split_df in (train_df, dev_df, test_df):
    split_df["tweet"] = split_df["tweet"].apply(clean_text)

# Destination for the cleaned splits; created on first run.
output_folder = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/cleaned_data"
os.makedirs(output_folder, exist_ok=True)

for file_name, split_df in (
    ("cleaned_train.csv", train_df),
    ("cleaned_dev.csv", dev_df),
    ("cleaned_test.csv", test_df),
):
    split_df.to_csv(os.path.join(output_folder, file_name), index=False)

# Preview the first cleaned training rows.
print(train_df.head())


                                               tweet     label
0                        kako be shark but wo ti ewu  negative
1            br ne bayie nti na me supporti man city  negative
2  s woofis mada wafutuo tantan no ywo smafa dabi...  negative
3          wabɔdam anaa wo trumu yɛ nkate nkwan aseɛ  negative
4                                  enfa bi da bra 🤣🤣  negative


In [2]:
### 2.  Tokenization 

In [12]:
import re

# Cleaned splits written by the text-cleaning step.
train_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/cleaned_data/cleaned_train.csv"
dev_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/cleaned_data/cleaned_dev.csv"
test_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/cleaned_data/cleaned_test.csv"

# Read each split into its own DataFrame (train, then dev, then test).
train_df, dev_df, test_df = map(pd.read_csv, (train_csv, dev_csv, test_csv))

# Token alternatives, tried left to right at each position:
#   @\w+     -> @mentions
#   #\w+     -> hashtags
#   \w+      -> words (Unicode-aware, so Twi letters such as ɛ and ɔ are kept)
#   [^\w\s]  -> any other single non-space character (emojis, symbols)
_TWI_TOKEN_RE = re.compile(r"@\w+|#\w+|\w+|[^\w\s]", re.UNICODE)


def twi_tokenize(text):
    """
    Custom tokenizer for Twi-English code-switched tweets.
    Preserves Twi diacritics, emojis, hashtags, and @mentions.

    Parameters
    ----------
    text : str or null-like
        Cleaned tweet text.

    Returns
    -------
    list of str
        Tokens in order of appearance. Non-string input (``None``, or the
        ``NaN`` that pandas produces when an empty tweet is read back from
        CSV) yields an empty list instead of raising ``TypeError``.
    """
    # A tweet that was cleaned down to "" is stored as an empty CSV field and
    # comes back as float NaN, which the regex engine cannot scan.
    if not isinstance(text, str):
        return []
    return _TWI_TOKEN_RE.findall(text)

# Tokenize the tweet column of every split into a new "tokens" column.
for split_df in (train_df, dev_df, test_df):
    split_df["tokens"] = split_df["tweet"].apply(twi_tokenize)

# Destination for the tokenized splits; created on first run.
output_folder = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/tokenized_data"
os.makedirs(output_folder, exist_ok=True)

for file_name, split_df in (
    ("tokenized_train.csv", train_df),
    ("tokenized_dev.csv", dev_df),
    ("tokenized_test.csv", test_df),
):
    split_df.to_csv(os.path.join(output_folder, file_name), index=False)

# Show tweets next to their tokens as a quick sanity check.
print(train_df[["tweet", "tokens"]].head())


                                               tweet  \
0                        kako be shark but wo ti ewu   
1            br ne bayie nti na me supporti man city   
2  s woofis mada wafutuo tantan no ywo smafa dabi...   
3          wabɔdam anaa wo trumu yɛ nkate nkwan aseɛ   
4                                  enfa bi da bra 🤣🤣   

                                              tokens  
0                [kako, be, shark, but, wo, ti, ewu]  
1  [br, ne, bayie, nti, na, me, supporti, man, city]  
2  [s, woofis, mada, wafutuo, tantan, no, ywo, sm...  
3  [wabɔdam, anaa, wo, trumu, yɛ, nkate, nkwan, a...  
4                          [enfa, bi, da, bra, 🤣, 🤣]  


In [13]:
### 3.	Stop Words Removal 

In [18]:
import pandas as pd
import os
import ast
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

# English stopwords as shipped with NLTK.
# NOTE(review): the cleaning step strips apostrophes, so any apostrophe-bearing
# entries in this list presumably never match a token — confirm whether
# that is intended.
english_stopwords = set(stopwords.words("english"))

# Hand-picked Twi stopwords.
# NOTE(review): "y?" looks like a mis-encoded entry (possibly "yɛ", which is
# already listed twice); the tokenizer would split it into "y" and "?", so it
# likely never matches — confirm the intended word.
twi_stopwords = {
    "na", "ne", "yɛ", "wo", "de", "nso", "a",
    "ɔno", "me", "mo", "yɛn", "woara", "ɔnoa", "ene",
    "ɛno", "wɔn", "se", "di", "ni", "no", "ka",
    "kɔ", "ba", "yɛɛ", "yɛ", "y?", "da", "re",
    "aa", "wɔ", "ɛ", "mu", "ho", "bi",
}

# Single lookup set used for filtering both languages at once.
combined_stopwords = english_stopwords | twi_stopwords

# Tokenized splits written by the tokenization step.
train_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/tokenized_data/tokenized_train.csv"
dev_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/tokenized_data/tokenized_dev.csv"
test_csv = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/tokenized_data/tokenized_test.csv"

train_df, dev_df, test_df = map(pd.read_csv, (train_csv, dev_csv, test_csv))

# The CSV holds each token list as text; parse it back into a real Python list.
for split_df in (train_df, dev_df, test_df):
    split_df["tokens"] = split_df["tokens"].apply(ast.literal_eval)

def _drop_stopwords(doc):
    """Return the tokens of ``doc`` that are not in ``combined_stopwords``, order preserved."""
    return [word for word in doc if word not in combined_stopwords]


# Filter every split's token lists into a new "filtered_tokens" column.
for split_df in (train_df, dev_df, test_df):
    split_df["filtered_tokens"] = split_df["tokens"].apply(_drop_stopwords)

# Destination for the stopword-filtered splits; created on first run.
# The list columns are written as plain text by to_csv, so a later reader has
# to parse them back into lists (e.g. with ast.literal_eval).
output_folder = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/stop_words_removed"
os.makedirs(output_folder, exist_ok=True)

for file_name, split_df in (
    ("filtered_train.csv", train_df),
    ("filtered_dev.csv", dev_df),
    ("filtered_test.csv", test_df),
):
    split_df.to_csv(os.path.join(output_folder, file_name), index=False)

# Compare original and filtered tokens for the first training rows.
print(train_df[["tokens", "filtered_tokens"]].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\T-Plug\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              tokens  \
0                [kako, be, shark, but, wo, ti, ewu]   
1  [br, ne, bayie, nti, na, me, supporti, man, city]   
2  [s, woofis, mada, wafutuo, tantan, no, ywo, sm...   
3  [wabɔdam, anaa, wo, trumu, yɛ, nkate, nkwan, a...   
4                          [enfa, bi, da, bra, 🤣, 🤣]   

                                     filtered_tokens  
0                             [kako, shark, ti, ewu]  
1              [br, bayie, nti, supporti, man, city]  
2  [woofis, mada, wafutuo, tantan, ywo, smafa, da...  
3         [wabɔdam, anaa, trumu, nkate, nkwan, aseɛ]  
4                                  [enfa, bra, 🤣, 🤣]  


In [14]:
### 4.	Stemming and Lemmatization 
# Skipped: Twi has no official morphological analyzer in NLTK or spaCy, and
# many papers likewise skip stemming/lemmatization and keep the filtered tokens.

In [6]:
### 5.	Handling Emojis and Emoticons 
# Skipped: emojis are kept raw rather than demojized, for maximum performance, since most NLP models handle emojis well.

In [None]:
### 6.	Spell Checking 
#  Many Twi NLP pipelines skip spellchecking because of limited resources.
#  Social media text in Twi often contains slang, code-switching, and intentional misspellings, so auto-correcting can damage meaning.