In [3]:
# --- Import libraries ---
import pandas as pd
import numpy as np
import re
import string
import random
import unicodedata
from spellchecker import SpellChecker
import torch

In [2]:
!pip install pyspellchecker


Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Downloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.3


In [4]:
# --- Load dataset ---
df = pd.read_csv("Final_dataset.csv")

In [5]:
# --- Make a copy of the Tweets column ---
df_clean = pd.DataFrame()
df_clean["Tweets"] = df["Tweets"].str.lower()  # lowercase


In [6]:
# --- Function to remove HTML tags ---
def remove_html_tag(text):
    return re.sub(r'<.*?>', '', text)


In [7]:
# --- Function to remove URLs ---
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)


In [8]:
# --- Function to remove punctuation ---
def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation))


In [9]:
# --- Function to remove emojis ---
def remove_emoji(text):
    if isinstance(text, str):
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags
            u"\U00002700-\U000027BF"  # Dingbats
            u"\U000024C2-\U0001F251"  # misc
        "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)
    return text

In [10]:
# --- Function to detect Bangla text ---
def contains_bangla(text):
    return bool(re.search(r'[\u0980-\u09FF]', str(text)))


In [11]:
# --- Clean text step by step ---
df_clean['Tweets'] = df_clean['Tweets'].apply(remove_html_tag)
df_clean['Tweets'] = df_clean['Tweets'].apply(remove_url)
df_clean['Tweets'] = df_clean['Tweets'].apply(remove_punc)
df_clean['Tweets'] = df_clean['Tweets'].apply(remove_emoji)


In [12]:
# --- Remove Bangla texts ---
df_clean['has_bangla'] = df_clean['Tweets'].apply(contains_bangla)
df_clean = df_clean[~df_clean['has_bangla']].drop(columns=['has_bangla'])


In [13]:
# --- Basic text fixes ---
df_clean = df_clean.applymap(lambda x: x.replace('\n', ' ') if isinstance(x, str) else x)
df_clean = df_clean.applymap(lambda x: x.replace('"', '').replace("'", '') if isinstance(x, str) else x)
df_clean = df_clean.applymap(lambda x: x.replace('!!', '') if isinstance(x, str) else x)


  df_clean = df_clean.applymap(lambda x: x.replace('\n', ' ') if isinstance(x, str) else x)
  df_clean = df_clean.applymap(lambda x: x.replace('"', '').replace("'", '') if isinstance(x, str) else x)
  df_clean = df_clean.applymap(lambda x: x.replace('!!', '') if isinstance(x, str) else x)


In [14]:
# --- Remove nulls and duplicates ---
df_clean.dropna(inplace=True)
df_clean.drop_duplicates(inplace=True)
df_clean.reset_index(drop=True, inplace=True)


In [16]:
# --- Add unique tweet ID ---
df_clean['tweet_id'] = random.sample(range(100000, 1000000), len(df_clean))

In [18]:
# --- Normalize unicode (remove accents, etc.) ---
df_clean['Tweets'] = df_clean['Tweets'].apply(
    lambda x: unicodedata.normalize('NFKD', x).encode('ASCII', 'ignore').decode('utf-8') if isinstance(x, str) else x
)


In [19]:
# --- Spelling correction ---
spell = SpellChecker()

def correct_spelling(text):
    try:
        words = text.split()
        corrected = [spell.correction(word) if word.isalpha() else word for word in words]
        return ' '.join(corrected)
    except:
        return text

df_clean['Tweets'] = df_clean['Tweets'].astype(str).apply(correct_spelling)

In [20]:
# --- Apply custom slang/shortform fixes ---
custom_fixes = {
    "youre": "you are",
    "im": "I am",
    "cant": "can not",
    "dont": "do not",
    "wont": "will not",
    "idk": "I do not know",
    "gonna": "going to",
    "gotta": "got to",
    "lemme": "let me",
    "goood": "good",
    "u": "you",
    "ur": "your",
    # Add more as needed
}

def apply_custom_fixes(text):
    words = text.split()
    fixed_words = [custom_fixes.get(w.lower(), w) for w in words]
    return ' '.join(fixed_words)

df_clean['Tweets'] = df_clean['Tweets'].astype(str).apply(apply_custom_fixes)

In [21]:
df_clean.to_csv("Processed_dataset.csv", index=False)

In [22]:
df_clean

Unnamed: 0,Tweets,tweet_id
0,bold hearts fearless minds meet the july revol...,753744
1,rickshaw that carried revolution martyr finds ...,135227
2,butcher of bangladesh hasina deployed un vehic...,164257
3,frustrated bal goon showing police how to shoo...,849649
4,if you know you know unpublished images from j...,236113
...,...,...
521,the workers of dhaka hitting the streets right...,185613
522,bangladesh pm sheikhhasina has left dhaka for ...,577807
523,curfew my foot update internet back in parts o...,653551
524,august 5 2024 bangladesh earned their second i...,634241
