In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

In [2]:
input_csv_path = 'Datasets/combined_dataset.csv'

In [4]:
df = pd.read_csv(input_csv_path)
print(df.head())

   label                                               text
0      1  Se não querem o Varandas como Presidente caso ...
1      0  Le produit de Sony ne semble pas presque aussi...
2      1  Se alguém falar mal de ti pelas costas ..... P...
3      0                     @HatanoSayuri Me obriga &gt;:(
4      1                 @mana_eliana Oi? Estou à espera :)


In [5]:
print(df.isnull().sum())

label    0
text     0
dtype: int64


In [6]:
def clean_text_phase1(text):
    """
    Initial text cleaning: lowercase, remove URLs, mentions, special characters, digits.
    """
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [8]:
print(nltk.data.find('corpora/stopwords'))
print(nltk.data.find('tokenizers/punkt'))


/Users/chyavanshenoy/nltk_data/corpora/stopwords
/Users/chyavanshenoy/nltk_data/tokenizers/punkt


In [9]:
STOP_WORDS_ENGLISH = set()
STOP_WORDS_PORTUGUESE = set()
STOP_WORDS_FRENCH = set()
STOP_WORDS_NEPALI = set()
STOP_WORDS_HINDI = set()

In [10]:
STOP_WORDS_ENGLISH = set(stopwords.words('english'))
STOP_WORDS_PORTUGUESE = set(stopwords.words('portuguese'))
STOP_WORDS_FRENCH = set(stopwords.words('french'))
STOP_WORDS_NEPALI = set(stopwords.words('nepali'))

In [14]:
COMBINED_STOPWORDS = STOP_WORDS_ENGLISH.union(STOP_WORDS_PORTUGUESE) \
                                 .union(STOP_WORDS_FRENCH) \
                                 .union(STOP_WORDS_NEPALI) \
                                 .union(STOP_WORDS_HINDI)

In [15]:
def remove_stopwords_from_tokens(text_cleaned_phase1):
    """
    Tokenizes text and removes stopwords and non-alphabetic tokens.
    Assumes text is already lowercased and had URLs, mentions, etc. removed.
    """
    if not text_cleaned_phase1 or pd.isna(text_cleaned_phase1):
        return ""
        
    tokens = word_tokenize(text_cleaned_phase1)
    
    filtered_tokens = [
        word for word in tokens if word.isalpha() and word not in COMBINED_STOPWORDS
    ]
    return ' '.join(filtered_tokens)

In [16]:
df['text_intermediate_clean'] = df['text'].astype(str).apply(clean_text_phase1)

In [17]:
print(df[['text', 'text_intermediate_clean']].head())

                                                text  \
0  Se não querem o Varandas como Presidente caso ...   
1  Le produit de Sony ne semble pas presque aussi...   
2  Se alguém falar mal de ti pelas costas ..... P...   
3                     @HatanoSayuri Me obriga &gt;:(   
4                 @mana_eliana Oi? Estou à espera :)   

                             text_intermediate_clean  
0  se não querem o varandas como presidente caso ...  
1  le produit de sony ne semble pas presque aussi...  
2       se alguém falar mal de ti pelas costas peide  
3                                       me obriga gt  
4                                  oi estou à espera  


In [18]:
df['text_clean'] = df['text_intermediate_clean'].apply(remove_stopwords_from_tokens)

In [19]:
print(df[['text_intermediate_clean', 'text_clean']].head())

                             text_intermediate_clean  \
0  se não querem o varandas como presidente caso ...   
1  le produit de sony ne semble pas presque aussi...   
2       se alguém falar mal de ti pelas costas peide   
3                                       me obriga gt   
4                                  oi estou à espera   

                                          text_clean  
0  querem varandas presidente caso vença eleições...  
1  produit sony semble presque aussi intéressant ...  
2                   alguém falar mal ti costas peide  
3                                          obriga gt  
4                                          oi espera  


In [20]:
df_final_cleaned = df[['text_clean', 'label']].copy()

In [21]:
df_final_cleaned.dropna(subset=['text_clean', 'label'], inplace=True)

In [22]:
df_final_cleaned = df_final_cleaned[df_final_cleaned['text_clean'].str.strip().astype(bool)]
df_final_cleaned.reset_index(drop=True, inplace=True)

In [23]:
print(df_final_cleaned['label'].value_counts(normalize=True))

label
0    0.483724
1    0.449766
2    0.066510
Name: proportion, dtype: float64


In [24]:
output_directory = 'Datasets'
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

In [25]:
output_file_path = os.path.join(output_directory, 'combined_dataset.parquet')

In [27]:
df_final_cleaned.to_parquet(output_file_path, index=False)