In [56]:
# Import necessary libraries
import spacy
import nltk
from nltk.util import ngrams
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
import unicodedata
from datasets import load_dataset
import jsonlines
from huggingface_hub import hf_hub_download

from nltk.tokenize import TweetTokenizer

nltk.download("punkt")
nltk.download("punkt_tab")

tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lingl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lingl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Datasets for training n-grams:  
- "aisyahhrazak/crawl-fiksyenshasha" :   
    - https://huggingface.co/datasets/aisyahhrazak/crawl-fiksyenshasha  
    - Data scraped from https://fiksyenshasha.com/  
    - A website for submitting (mostly horror, personal or fiction) stories.  
    - text columns: headline, content, comment  

- "malaysia-ai/fb-malaysian-pages" :  
    - https://huggingface.co/datasets/malaysia-ai/fb-malaysian-pages  
    - Data from Malaysian Facebook pages
    - text columns: text



In [57]:
# Load "aisyahhrazak/crawl-fiksyenshasha" dataset

dataset = load_dataset(
    "aisyahhrazak/crawl-fiksyenshasha",
    split="train"
)

# create a list to collect text
all_texts_c = []

for row in dataset:
    # headline (string)
    if row["headline"]:
        all_texts_c.append(row["headline"])

    # content (list of strings)
    if row["content"]:
        all_texts_c.extend(row["content"])

    # comment (list of strings)
    if row["comment"]:
        all_texts_c.extend(row["comment"])




In [58]:
print("Collected:", len(all_texts_c))
print("First 3 elements:", all_texts_c[:3])


Collected: 353160
First 3 elements: ['KISAH SERAM JALAN SANDAKAN-KOTA KINABALU', 'Kisah ini bermula ketika saya berumur 23 tahun. Dalam umur yang masih muda saya sudah bekerja d sebuah agensi pekerjaan di sandakan. Jiwa remaja sememangnya banyak perkara yang ingin dilakukan. Saya, nana dan wan merancang untuk ke kota kinabalu awal bulan april. Seperti yang sudah di rancang. Tepat jm6 petang kami bertolak dari sandakan. Disebabkan saya masih kerja pada hari itu, kami terpaksa bertolak sebelah petang. Perjalanan dari sandakan ke kota kinabalu memakan masa 7-8jam terpulang dari kelajuan masing2. Okey pada jam7 malam kami singgah mengisi minyak di check point dan membeli makanan ringan utk mengisi perut dalam perjalanan.', 'Semasa kami melalui jalan telupid ranau hati saya merasa tidak sedap. Tapi sebagai pemandu saya kuatkan radio sekali sekala ikut menyanyi. Dalam hati syukur ada bas di belakang. Semasa melalui jlan yang agak gelap (jalan telupid ranau ni memang hutan tiada lampu jalan) 

In [59]:
# Load "malaysia-ai/fb-malaysian-pages" dataset
# Dataset formatting issue, need to manually read JSONL (inconsistent data types, JSON objects or strings)

file_path = hf_hub_download(
    repo_id="malaysia-ai/fb-malaysian-pages",
    filename="dedup.jsonl",
    repo_type="dataset"
)

all_texts_f = []

with jsonlines.open(file_path) as reader:
    for obj in reader:
        # normal JSON object
        if isinstance(obj, dict):
            text = obj.get("text")

        # string
        elif isinstance(obj, str):
            text = obj

        else:
            continue

        if isinstance(text, str) and text.strip():
            all_texts_f.append(text)



In [60]:
print("Collected:", len(all_texts_f))
print("First 3 elements:", all_texts_f[:3])


Collected: 193363
First 3 elements: ['Adeyyy pooodahhh puiii', 'Bosan la ceramah dia.. dulu sonok gak', 'Memalukan betul.. PM malaysia.\nDunia sedang memerhatikan Malaysia dipimpin oleh orang yang tercemar dgn kes2 mahkamah... \n\nMalaysia PM Picks Graft-Tainted Leader as One of His Deputies https://\nwww.bloomberg.co\nm/news/\narticles/\n2022-12-02/\nmalaysia-pm-pick\ns-graft-tainted\n-leader-as-one-\nof-his-deputies']


In [61]:

# Process datasets
def process_datasets(text):
    """
    Cleans and tokenizes text.

    Returns:
        tokenized: list of tokenized sentences (list of lists of words)
    """
    # get rid of URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # split into a list of sentences
    sentences = nltk.sent_tokenize(text)

    tokenized = []

    # split each sentence into a list of words
    for s in sentences:

        # clean: to lowercase, only 
        s = s.lower()
        s = re.sub(r"[^a-zàâéèêëîïôûùüçñ\s]", " ", s)
        s = re.sub(r"\s+", " ", s).strip()

        tokens = nltk.word_tokenize(s)
        tokenized.append(tokens)

    return tokenized



In [None]:
def tokenize_list(texts):
    """
    Cleans and tokenizes a list of texts.
    """
    all_tokenized = []

    for text in texts:
        tokenized = process_datasets(text)
        all_tokenized.extend(tokenized)

    return all_tokenized

In [66]:
all_tokenized_c = tokenize_list(all_texts_c)
all_tokenized_f = tokenize_list(all_texts_f)

In [71]:
all_tokenized = all_tokenized_c + all_tokenized_f

print("Tokenized stories:", all_tokenized_c[:3])
print("Number of sentences:", len(all_tokenized_c))

print("Tokenized Facebook posts:", all_tokenized_f[:3])
print("Number of sentences:", len(all_tokenized_f))

print("All:", len(all_tokenized))

Tokenized stories: [['kisah', 'seram', 'jalan', 'sandakan', 'kota', 'kinabalu'], ['kisah', 'ini', 'bermula', 'ketika', 'saya', 'berumur', 'tahun'], ['dalam', 'umur', 'yang', 'masih', 'muda', 'saya', 'sudah', 'bekerja', 'd', 'sebuah', 'agensi', 'pekerjaan', 'di', 'sandakan']]
Number of sentences: 1241358
Tokenized Facebook posts: [['adeyyy', 'pooodahhh', 'puiii'], ['bosan', 'la', 'ceramah', 'dia', 'dulu', 'sonok', 'gak'], ['memalukan', 'betul', 'pm', 'malaysia']]
Number of sentences: 241103
All: 1482461


In [None]:

# Preprocess tweets
def preprocess_tweets(text):
    if not isinstance(text, str):
            return ""

    # Clean tweet-specific noise
    text = p.clean(text)  # removes URLs, mentions, emojis, RT, etc.

    # Tokenize
    tokens = tokenizer.tokenize(text)

    # Keep only alphabetic tokens (supports accented chars)
    tokens = [t for t in tokens if t.isalpha()]

    return " ".join(tokens)

# -------- load ONE file --------
file_path = "tweets/extracted_data0.csv"

df = pd.read_csv(file_path)

# -------- clean tweets --------
df["clean_text"] = df["tweet_text"].apply(preprocess_tweets)

# -------- remove short tweets (<4 words) --------
df["word_count"] = df["clean_text"].str.split().str.len()
df_clean = df[df["word_count"] >= 4]

# -------- final list of cleaned tweets --------
cleaned_tweets = df_clean["clean_text"].tolist()

  df = pd.read_csv(file_path)


In [9]:
print(cleaned_tweets[:20])  # Print first 5 cleaned tweets for verification
print(len(cleaned_tweets))  # Print first 5 cleaned tweets for verification

['rasanya takde mufti yang ulas so boleh ikut mufti perlis hukumnya unless ada pandangan mufti lain', 'rt amp pmr outreach sekitar kawasan taman sri petaling di parlimen p seputeh pada april kawasan yang', 'rt manga your lie in april', 'rt terkini mahkamah rayuan membenarkan sam ke ting untuk membuat rayuan sam ke ting sebelum ini dihukum penjara tahun dan', 'rt x giving away dobiesnft free mint spots rules to enter follow', 'rt gaji berpuluh ribu berbulan diam membisu rakyat berbulan bising tak ambil pusing payung kuning jentik baru tergedi', 'i don t wanna go outstation i like my room', 'rt instagram post p feel my rhythm yoon seoha', 'mesti diaorang yang terima kat ppr batu muda blok a rasa happy kan', 'rt this is it this is the video this is what you mean by being in this bangtan sonyeondan shit for life', 'rt kes penemuan mayat wanita di bukit putus dan rangka kanak lima tahun di bukit zamrud negeri sembilan dua beranak kh', 'invaded by funded by usa sympathy for ukraine save pale

In [31]:

# Train n-gram language models
def train_ngram_models(corpus, n=3):
    """
    Train n-gram language models for Malay corpora with nltk, and download english n-gram model.
    """
    training_ngrams, padded_sentences = padded_everygram_pipeline(n, corpus)
    model = MLE(n)
    model.fit(training_ngrams, padded_sentences)

    return model


In [32]:
model_ms = train_ngram_models(all_tokenized)

In [None]:
# Compute perplexity
def compute_perplexity(model, text):
    """
    Compute the perplexity of all tweets with the trained n-gram models.
    """
    pass


In [None]:

# Classify tweets
def classify_tweets(tweets, english_model, malay_model):
    """
    Classify tweets into the two types of code-mixing or discard strictly Malay tweets.
    """
    pass


In [None]:

# Analyze grammatical structure
def analyze_grammar(tweet):
    """
    Analyze the grammatical structure of a tweet to detect non-standard English patterns.
    """
    pass


In [None]:

# Compute additional statistics
def compute_statistics(tweets):
    """
    Compute additional statistics such as sentence length, CMI, Multilingual Index, etc.
    Maybe do one cell / function per statistic instead of putting everything here.
    """
    pass


In [None]:

# Plot statistics
def plot_statistics(statistics):
    """
    Plot the computed statistics using matplotlib or seaborn.
    """
    pass


In [None]:

# Main execution
def main():
    """
    Main function to execute the workflow so that the code's execution can be easily managed and stays very lisible.
    """
    pass