### Combine all datasets

In [12]:
# Load every file ending in .csv from the original-data folder and combine them into one dataset

import glob
import os

import pandas as pd

# Folder that holds the raw, per-source CSV files.
folder = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/original_data"

# The merged file is written one level up (.../datasets/combined.csv), which is
# the path the normalisation step later in this notebook reads.  Keeping it out
# of `folder` also stops a re-run from loading its own output as an input.
output_path = os.path.dirname(folder) + "/combined.csv"

# Sort for a stable row order (glob makes no ordering guarantee) and skip any
# stale combined.csv left inside the input folder by an earlier run.
all_files = sorted(
    f for f in glob.glob(folder + "/*.csv")
    if os.path.basename(f).lower() != "combined.csv"
)

if not all_files:
    # Fail with an actionable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {folder}")

df_list = []

for f in all_files:
    try:
        # Most sources are UTF-8; try that first.
        temp = pd.read_csv(f, encoding="utf-8")
        print(f"Loaded {f} with UTF-8")
    except UnicodeDecodeError as e:
        # Some files were exported with the Windows code page; retry with it.
        print(f"Encoding error in {f}: {e}")
        temp = pd.read_csv(f, encoding="cp1252")
        print(f"Loaded {f} with cp1252 fallback")
    df_list.append(temp)

# Stack all sources row-wise and renumber the index from 0.
df = pd.concat(df_list, ignore_index=True)
df.to_csv(output_path, index=False, encoding="utf-8")
print("Combined dataset saved in UTF-8:", df.shape)


Loaded C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/Original datasets\afrisenti_twi_dev.csv with UTF-8
Loaded C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/Original datasets\afrisenti_twi_test.csv with UTF-8
Loaded C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/Original datasets\afrisenti_twi_train.csv with UTF-8
Encoding error in C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/Original datasets\labeledl_twi_tweets.csv: 'utf-8' codec can't decode byte 0x92 in position 17: invalid start byte
Loaded C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/Datasets/Original datasets\labeledl_twi_tweets.csv with cp1252 fallback
Combined dataset saved in UTF-8: (10814, 2)


In [14]:
# Tally how many rows carry each distinct label value in the combined frame
label_counts = df["label"].value_counts()
print(label_counts)

label
Positive               3542
positive               2277
negative               1815
Neutral                1308
Negative               1064
neutral                 726
No Sentiment             49
Positive & Negative      26
Multilingual              3
Twi                       3
Ghanaian Pidgin           1
Name: count, dtype: int64


### Repair broken (mis-encoded) text and normalise the dataset by replacing 3s, ?s and ) with ɛ and ɔ

In [21]:
import re
import pandas as pd
from ftfy import fix_text

def fix_broken(text):
    """Run ftfy's ``fix_text`` over ``text`` to repair broken text.

    Args:
        text: The tweet text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell) is returned unchanged so
            the function is safe to apply to a column that still has gaps.

    Returns:
        The repaired string, or the original object when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return fix_text(text)

def normalize_twi(text):
    """Map keyboard stand-ins for Twi letters to the real characters.

    Substitutions, applied in this order:
      1. every ``3`` becomes ``ɛ``
      2. every ``)`` becomes ``ɔ``
      3. any run of two or more ``ɛ`` collapses to a single ``ɛ``
      4. the literal ``?y?`` becomes ``ɛyɛ``

    Args:
        text: The tweet text. Non-string values (e.g. NaN for a missing tweet)
            are returned unchanged.

    Returns:
        The normalised string, or the original object when it is not a string.

    NOTE(review): steps 1 and 2 are unconditional, so digits inside real
    numbers and closing brackets are rewritten too — confirm this is acceptable
    for the data at hand.
    """
    if not isinstance(text, str):
        return text

    text = text.replace("3", "ɛ")   # map 3 → ɛ
    text = text.replace(")", "ɔ")   # map ) → ɔ
    # Collapse whole runs: a single-pass replace("ɛɛ", "ɛ") would still leave
    # "ɛɛ" behind for three or more consecutive characters.
    text = re.sub("ɛ{2,}", "ɛ", text)
    text = text.replace("?y?", "ɛyɛ")  # common corruption
    return text

def restore_twi_chars(text):
    """Guess the Twi vowel that a stray ``?`` replaced, based on its neighbour.

    The substitutions run in a fixed order, and later steps see the result of
    earlier ones:
      1. the literals ``?y?`` → ``ɛyɛ`` and ``?k?`` → ``ɔkɔ``
      2. ``?`` followed by one of the ``start_map`` consonants
      3. ``?`` preceded by one of the ``end_map`` letters

    Args:
        text: The tweet text. Non-string values (e.g. NaN for a missing tweet)
            are returned unchanged.

    Returns:
        The string with the matched ``?`` characters replaced, or the original
        object when it is not a string.

    NOTE(review): the rules cannot tell a corrupted character from a genuine
    question mark, so e.g. ``"anaa?"`` becomes ``"anaaɛ"`` — confirm that is
    acceptable for the data at hand.
    """
    if not isinstance(text, str):
        return text

    # All patterns are fixed literals, so plain str.replace is sufficient.
    text = text.replace("?y?", "ɛyɛ")
    text = text.replace("?k?", "ɔkɔ")

    # "?<consonant>" → "<vowel><consonant>"
    start_map = {
        "b": "ɛb",
        "d": "ɔd",
        "m": "ɔm",
        "n": "ɛn",
        "k": "ɛk",
        "t": "ɛt",
        "s": "ɛs",
        "w": "ɔw",
    }

    for c, rep in start_map.items():
        text = text.replace("?" + c, rep)

    # "<letter>?" → "<letter><vowel>"
    end_map = {
        "m": "mɛ",
        "n": "nɛ",
        "t": "tɔ",
        "b": "bɔ",
        "w": "wɔ",
        "s": "sɛ",
        "k": "kɔ",
        "f": "fɔ",
        "h": "hɔ",
        "p": "pɔ",
        "r": "rɛ",
        "e": "eɛ",
        "a": "aɛ",
    }

    for c, rep in end_map.items():
        text = text.replace(c + "?", rep)

    return text

# Load the merged dataset produced by the combine step.
combined = pd.read_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/combined.csv")

# Repair, normalise, then restore characters in every tweet, in that order.
# na_action="ignore" leaves missing tweets as NaN instead of handing a float to
# the string helpers (which would raise); those rows are dropped in the later
# cleaning step.
combined["tweet"] = (combined["tweet"]
                     .map(fix_broken, na_action="ignore")
                     .map(normalize_twi, na_action="ignore")
                     .map(restore_twi_chars, na_action="ignore"))

# Canonicalise labels: trim stray whitespace and fold case so that
# "Positive", "positive" and "positive " all count as the same class.
combined["label"] = combined["label"].str.strip().str.lower()

combined.to_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/combined_1.csv", index=False)

print("Saved to combined_1.csv")


Saved to combined_1.csv


### Augment the dataset up to a target size (configurable) so that all labels have similar counts

In [23]:
# Hand-written augmentation lexicon.
# Structure: label -> {canonical phrase -> [variant phrases]}.
# Only the "neutral" and "negative" labels have entries.
# NOTE(review): a few variant lists repeat an entry, some variants are identical
# to their canonical phrase, and three unrelated neutral phrases share the same
# generic variant list ("me ho yɛ bɔne", ...) — confirm these are intended.
lexicon = {
    "neutral": {
        "na wo dea den": ["me ho yɛ den", "wo ho ntɔ wo", "wo ho yɛ hu"],
        "gyimie saaaa na afi aso": ["gyimie saaaa", "yɛ ɔkwasea saaaa na afi aso", "yɛ ɔkwasea saaa"],
        "gyaii nsem hunuu no na bu oman no yiee": ["gyae nkwaseasɛm no na mo ne ɔman no nni yie", "gyae nkwaseasɛm no", "gyaii nsem hunuu no na bu ɔman"],
        "covid19 nkoa mponi na ebola abaframu the game enter different level": ["covid19 nkoa na seesei ebola abaframu the game enter level eii", "covid19 nko ara na seesei ebola aba afra mu", "covid19 yɛ den oo"],
        "is full example of fear of man and stop saman": ["yɛ onipa suro ho nhwɛso a edi mu na ɛnyɛ saman", "is example of onipa suro na ɛnyɛ ahonhommɔne", "onipa ho suro oo"],
        "aden wo yɛ akwadaa": ["nea enti a woyɛ abofra", "aden enti a woyɛ abofra", "opanyin te sɛ ɔno"],
        "dabi me dii bɛgo lol": ["dabi me dii paanoo lol", "dabii me di rice", "dabi me di fufu lol"],
        "e choke u anaa": ["me ho yɛ bɔne", "me ho ntɔ me", "me ho yɛ hu"],
        "nika no da me ha nso wall no": ["nka no da me ha nso wall no", "me nkutoo na mewɔ biribi a emu yɛ duru", "nanso ɛda ha ne me nso ɔfasuo no"],
        "twam wogoro twer ne": ["twam wɔgoro twɛ ne", "pass wɔn di agorɔ twɛn na", "nanso wodi agorɔ na wɔtwɛn"],
        "kraman mullion eii": ["kraman million eii", "ɔpepem akraman eii", "ɔpepem akraman eii"],
        "dabi wasori": ["me ho yɛ bɔne", "me ho ntɔ me", "me ho yɛ hu"],
        "akakasuro ne sorowisa": ["akakasuro ne soro wisa", "ehu ne ɔsoro home", "ehu ne sorowisa"],
        "ankasa nasu yɛ din": ["nasu ankasa no yɛ din", "nasu yɛ din", "kofi yɛ din a wɔde frɛ no daa"],
        "nyɛ saa na merep akyer": ["ɛnyɛ saa na merep akyer", "ɛnyɛ ɛno ne nea merebɔ mmɔden sɛ mɛka", "ɛno ne nea merebɔ mmɔden sɛ mɛka"],
        "natsecurity foc nso se sn": ["me ho yɛ bɔne", "me ho ntɔ me", "me ho yɛ hu"],
        "eeeii saa mennim da ??": ["eeeii enti na mennim da ??", "enti na minnim da", "enti efi bere bɛn?"],
        "oh saa??": ["oh saa na ɛte??", "saa na ɛte", "oo saa"],
        "shocki me mpo??": ["shocking me mpo??", "ɛyɛ me ahodwiriw mpo", "mpo nie"],
    },
    "negative": {
        "kwasia sem saa na wabodam": ["kwasiasɛm saa na wa bo dam", "saa ɔkwasea na ɔyɛ kraman", "kwasiasɛm saa na wa gyimi"],
        "wo brofo nso nyɛ d ??": ["wo borofo nso nyɛ d ??", "wo english nso nyɛ papa koraa", "wo brofo nso nyɛ papa"],
        "konkonsani tena kurom a kuro bɔ": ["kɔnkɔnsani tena kurom bi?", "konkonsa bi te kuro yi mu nso?", "nsɛmmɔnedi wɔ baabiara"],
        "wonni sika nanso wopɛ mmaa yɛde anomdwa tu tonga": ["wonni sika nanso wopɛ mmea de wɔn ano tow tonga", "wonni sika nanso hwɛ sɛ wɔrekasa", "wonni sika nanso hwɛ wɔn"],
        "anisohye saa na wafa mu ewu": ["aniwa saa na wafa so awu", "enti na wafa so awu", "ɔnam so awu"],
        "saa ppp nsemhunu nkoaa": ["saa paa nsemhunu nkoaa", "enti nkwaseasɛm ankasa nkutoo", "enti nkwaseasɛm nkutoo"],
        "koko nkoaa aden wondidi biom anaa": ["koko nkoaa aden wondidi biom anaa", "koko nkoaa aden wondidi", "coconut nkoaa nea enti a wonnni bio anaa"],
        "masa wo yare anaa pt": ["masa, wo yare anaa", "woyare anaa?", "so woafi w’adwene mu?"],
        "mekyiri dɔkono ne nkyenam": ["me tan dɔkono ne nkyene", "me tan evil ne mpataa a wɔayam", "me tan kyenam"],
        "wo yɛ aboa paa ɔɔɔɔɔɔɔɔɔɔ wo saa akoa wei??": ["wo yɛ aboa paa ɔɔɔɔɔɔɔɔɔɔ wo saa akoa no??", "wobetumi aboa wo saa akoa no??", "wo nkoa"],
        "sia nanka yɛnni pilolo anaa": ["sia nanso yɛnni pilolo anaa", "nanso yɛnni pilolo sia", "nanka yɛnni pilolo sia"],
        "i shock sef mo na mo bɛ tschew koraaaa": ["me shock me ho sɛ wo bɛ tschew koraa", "me shock me ho sɛ wo tschew koraa", "wo bɛ tschew"],
        "born one ɛna wu kyere wu hu saa nu eii eugenia": ["awo baako ɛna wo kyere wo hunu se eii", "awo baako ɛna wo kyere wo hunu se eii", "awo ɛna baako wo kyere wo hunu sɛ eii"],
        "john kuma paaaaaaaa funeral awwwwww ghana ne gyimie": ["john kuma paaaaaaaa ayie awwwwww ghana ne nkwasea", "ghana ne nkwaseafoɔ", "aww ayie"],
        "yakubu sɛ preko yɛ haram nanso ɔdi trumu eiii": ["yakubu sɛ preko yɛ haram", "preko yɛ haram nanso ɔdi trumu", "ɔdi trumu eiii"],
        "firi yɛn so kɔ": ["gyae yɛn", "firi yɛn so kɔ", "twe wo ho fi yɛn ho"],
        "sika na shisika no ashi": ["sika no akɔ", "sika no ahyew", "sika no ahyew"],
        "ohia kasa fa womuã na wo didi atem": ["ohia ka womuã ho na wo bu atɛn", "wo bu atɛn", "ohia ka womuã ho na wo bu atɛn"],
        "ashawofo na ɛdi valentine": ["ashawofo na ɛdi valentine", "nguaman di valentine", "nguaman di afahyɛ"],
    },
}


# Flatten the lexicon into one record per phrase: each canonical phrase first,
# followed by its variants, all tagged with the label they are filed under.
data = [
    {"tweet": phrase, "label": label}
    for label, phrases in lexicon.items()
    for canonical, variants in phrases.items()
    for phrase in [canonical, *variants]
]

lexicon_df = pd.DataFrame(data)

combined = pd.read_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/combined_1.csv")

# Grow the dataset to 160% of its current size; `needed` is the number of
# extra rows that must be drawn from the lexicon to get there.
target_size = int(len(combined) * 1.6)
needed = target_size - len(combined)

# Draw with replacement only when the lexicon has fewer rows than requested.
# NOTE(review): with replacement the same phrase is added many times, and the
# later train/dev/test split may place copies of it in more than one split.
allow_repeats = needed > len(lexicon_df)
sampled = lexicon_df.sample(n=needed, replace=allow_repeats, random_state=42)

# Append the sampled lexicon rows after the original rows.
augmented = pd.concat([combined, sampled], ignore_index=True)

augmented.to_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/combined_2.csv", index=False)
print(f"Augmentation complete: {len(combined)} → {len(augmented)} rows")



Augmentation complete: 10814 → 17302 rows


In [6]:
# Printing the count of all labels

import pandas as pd

# Re-load the augmented file from disk and tally the rows per label
augmented_path = "C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/combined_2.csv"
augmented = pd.read_csv(augmented_path)
augmented_label_counts = augmented["label"].value_counts()
print(augmented_label_counts)

label
negative    6188
positive    5820
neutral     5294
Name: count, dtype: int64


### Clean dataset (drop NaNs + drop bad labels)

In [7]:

import pandas as pd

df = pd.read_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/combined_2.csv")

# Rows without a tweet or without a label cannot be used for training.
df = df.dropna(subset=["tweet", "label"])

valid_labels = {"neutral", "negative", "positive"}

# Normalise the stored label (trim + lower-case) BEFORE filtering.  Filtering
# on a lower-cased view while keeping the raw value would let "Positive" and
# "positive" through as two distinct classes in the saved file.
df = df.assign(label=df["label"].astype(str).str.strip().str.lower())
df = df[df["label"].isin(valid_labels)]

df.to_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/combined_3.csv", index=False)

print(f"Cleaned dataset: {df.shape[0]} rows kept")


Cleaned dataset: 17302 rows kept


### Split dataset into train, dev and test sets (80% / 10% / 10%)

In [9]:

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/combined_3.csv")

# First cut: 80% train, 20% held out; stratify so each split keeps the label mix.
train, temp = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

# Second cut: halve the held-out 20% into dev and test (10% each).
dev, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp["label"])

# Invariant: every row lands in exactly one split.
assert len(train) + len(dev) + len(test) == len(df), "split lost or duplicated rows"

# Leakage check: the input can contain the same tweet text many times (the
# augmentation step samples lexicon phrases with replacement), so identical
# texts may end up in both train and an evaluation split.  Report it rather
# than let it silently inflate dev/test scores.
train_texts = set(train["tweet"])
leaked_dev = int(dev["tweet"].isin(train_texts).sum())
leaked_test = int(test["tweet"].isin(train_texts).sum())
if leaked_dev or leaked_test:
    print(f"WARNING: {leaked_dev} dev rows and {leaked_test} test rows have tweet text that also appears in train")

train.to_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/train.csv", index=False)
dev.to_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/dev.csv", index=False)
test.to_csv("C:/Users/T-Plug/Desktop/University/Level 300/2nd Sem/316 Social Media Mining/End_Of_Sem_Project/datasets/test.csv", index=False)

print(f"Train: {len(train)}, Dev: {len(dev)}, Test: {len(test)}")

Train: 13841, Dev: 1730, Test: 1731
