In [10]:
# Import necessary libraries
import spacy
import nltk
from nltk.util import ngrams
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
import unicodedata

from nltk.tokenize import TweetTokenizer

# Make sure the "punkt" NLTK data package is available locally
# (the call reports "already up-to-date" when it is).
nltk.download("punkt")

# Module-level tweet tokenizer shared by the preprocessing code:
# case is not preserved, @handles are stripped, lengthened words are reduced.
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)


[nltk_data] Downloading package punkt to /home/moi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# Load datasets
def clean(text):
    """
    Normalise a piece of text.

    The text is lowercased, every line break, tab and basic punctuation
    mark is turned into a space, and runs of whitespace are collapsed
    into a single space (leading/trailing whitespace is dropped).
    """
    # every character listed here is swapped for a single space
    separators = "\n\t.,!?:;()[]{}\"'"
    to_space = str.maketrans(separators, " " * len(separators))

    lowered = text.lower().translate(to_space)

    # split() without arguments treats any whitespace run as one separator
    return " ".join(lowered.split())


def load_datasets(path=None):
    """
    Load a text corpus from disk, split it into sentences, clean each
    sentence and tokenize it into words.

    Args:
        path: location of the UTF-8 text file to read. When omitted, the
            notebook-level variable ``path`` is used, which is what the
            earlier zero-argument version of this function read implicitly.

    Returns:
        tokenized: list of tokenized sentences (list of lists of words)

    Raises:
        ValueError: if no path is passed and no notebook-level ``path`` exists.
    """
    if path is None:
        # Backward compatibility: the function used to depend on a global
        # named `path`. The parameter shadows that name, so look it up
        # explicitly instead of failing with a NameError.
        path = globals().get("path")
        if path is None:
            raise ValueError(
                "load_datasets() needs a file path: pass it as an argument "
                "or define a notebook-level `path` variable."
            )

    # errors="ignore" silently drops bytes that are not valid UTF-8
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    # split into sentences first, then clean and word-tokenize each sentence
    return [nltk.word_tokenize(clean(sentence)) for sentence in nltk.sent_tokenize(text)]


In [None]:

# Preprocess tweets
def preprocess_tweets(text):
    """
    Turn one raw tweet into a space-separated string of alphabetic tokens.

    Non-string input yields an empty string. Otherwise the text goes
    through `p.clean`, is tokenized with the module-level `tokenizer`,
    and only tokens made up entirely of letters are kept.
    """
    # Anything that is not a string is mapped to an empty tweet.
    if not isinstance(text, str):
        return ""

    # NOTE(review): `p` is not imported in the visible notebook cells —
    # presumably `import preprocessor as p` (tweet-preprocessor); confirm
    # and add it to the import cell so a fresh kernel can run this.
    denoised = p.clean(text)

    # str.isalpha() is Unicode-aware, so accented letters are kept while
    # tokens containing digits or punctuation are dropped.
    return " ".join(tok for tok in tokenizer.tokenize(denoised) if tok.isalpha())

# Read a single extracted CSV file.
file_path = "tweets/extracted_data0.csv"
df = pd.read_csv(file_path)

# Add the cleaned text and its word count as new columns on df.
df["clean_text"] = df["tweet_text"].apply(preprocess_tweets)
df["word_count"] = df["clean_text"].str.split().str.len()

# Keep only tweets that still have at least 4 words after cleaning.
df_clean = df.loc[df["word_count"].ge(4)]

# Plain Python list of the cleaned tweets that passed the length filter.
cleaned_tweets = df_clean["clean_text"].tolist()

  df = pd.read_csv(file_path)


In [9]:
# Sanity check: show the first 20 cleaned tweets, then how many tweets were kept in total.
print(cleaned_tweets[:20], len(cleaned_tweets), sep="\n")

['rasanya takde mufti yang ulas so boleh ikut mufti perlis hukumnya unless ada pandangan mufti lain', 'rt amp pmr outreach sekitar kawasan taman sri petaling di parlimen p seputeh pada april kawasan yang', 'rt manga your lie in april', 'rt terkini mahkamah rayuan membenarkan sam ke ting untuk membuat rayuan sam ke ting sebelum ini dihukum penjara tahun dan', 'rt x giving away dobiesnft free mint spots rules to enter follow', 'rt gaji berpuluh ribu berbulan diam membisu rakyat berbulan bising tak ambil pusing payung kuning jentik baru tergedi', 'i don t wanna go outstation i like my room', 'rt instagram post p feel my rhythm yoon seoha', 'mesti diaorang yang terima kat ppr batu muda blok a rasa happy kan', 'rt this is it this is the video this is what you mean by being in this bangtan sonyeondan shit for life', 'rt kes penemuan mayat wanita di bukit putus dan rangka kanak lima tahun di bukit zamrud negeri sembilan dua beranak kh', 'invaded by funded by usa sympathy for ukraine save pale

In [None]:

# Train n-gram language models
def train_ngram_models(corpus, n=3):
    """
    Fit a maximum-likelihood (MLE) n-gram language model with nltk.

    Args:
        corpus: the training text, passed straight to
            `padded_everygram_pipeline` (presumably a list of tokenized
            sentences — confirm against the caller).
        n: highest n-gram order of the model (default 3).

    Returns:
        The fitted `MLE` model.
    """
    # The pipeline returns two things: the training everygrams and the
    # padded text from which the model builds its vocabulary.
    train_data, vocab_text = padded_everygram_pipeline(n, corpus)

    lm = MLE(n)
    lm.fit(train_data, vocab_text)
    return lm


In [None]:
# Compute perplexity
def compute_perplexity(model, text):
    """
    Placeholder for perplexity scoring.

    Planned behaviour: compute the perplexity of all tweets with the
    trained n-gram models. Nothing is implemented yet, so both arguments
    are ignored and the function returns None.
    """
    # TODO: implement the perplexity computation.
    return None


In [None]:

# Classify tweets
def classify_tweets(tweets, english_model, malay_model):
    """
    Placeholder for tweet classification.

    Planned behaviour: sort tweets into the two types of code-mixing and
    discard strictly Malay tweets. Nothing is implemented yet, so all
    arguments are ignored and the function returns None.
    """
    # TODO: implement the classification.
    return None


In [None]:

# Analyze grammatical structure
def analyze_grammar(tweet):
    """
    Placeholder for grammatical analysis.

    Planned behaviour: inspect the grammatical structure of a tweet to
    detect non-standard English patterns. Nothing is implemented yet, so
    the argument is ignored and the function returns None.
    """
    # TODO: implement the grammatical analysis.
    return None


In [None]:

# Compute additional statistics
def compute_statistics(tweets):
    """
    Placeholder for corpus statistics.

    Planned behaviour: compute additional statistics such as sentence
    length, CMI, Multilingual Index, etc. This may later be split into
    one cell / function per statistic rather than kept in a single
    function. Nothing is implemented yet, so the argument is ignored and
    the function returns None.
    """
    # TODO: implement the statistics.
    return None


In [None]:

# Plot statistics
def plot_statistics(statistics):
    """
    Placeholder for plotting.

    Planned behaviour: plot the computed statistics using matplotlib or
    seaborn. Nothing is implemented yet, so the argument is ignored, no
    figure is produced and the function returns None.
    """
    # TODO: implement the plots.
    return None


In [None]:

# Main execution
def main():
    """
    Placeholder entry point for the whole workflow.

    Planned behaviour: run every step of the notebook from one place so
    that execution is easy to manage and the code stays readable.
    Nothing is implemented yet, so the function returns None.
    """
    # TODO: chain the workflow steps here.
    return None