## Data Preprocessing

In [32]:
def stratified_downsample(df, sample_size):
    label_dist = df['label'].value_counts(normalize=True).to_dict()
    samples = []
    for label, ratio in label_dist.items():
        n = int(sample_size * ratio)
        part = df[df['label'] == label].sample(n=n, random_state=42)
        samples.append(part)
    return pd.concat(samples).sample(frac=1, random_state=42).reset_index(drop=True)

In [51]:
import pandas as pd

# ------------------------
# 🟢 1. Portuguese Dataset
# ------------------------

def process_portuguese(input_path, output_path):
    df = pd.read_csv(input_path, sep=';', quoting=3, encoding='utf-8', on_bad_lines='skip')

    # Keep only necessary columns
    df = df[['tweet_text', 'sentiment']]
    df.columns = ['text', 'label']
    df['language'] = 'pt'

    label_map = {
        '0': 'negative', '1': 'positive', '2': 'neutral',
        0: 'negative', 1: 'positive', 2: 'neutral'
    }
    df['label'] = df['label'].map(label_map)
    df = df[df['label'].isin(['positive', 'negative'])]
    df.info()
    # Stratified downsample
    sampled = stratified_downsample(df, 65000)
    sampled.to_csv(output_path, index=False)
    print(f"✅ Portuguese dataset saved: {output_path}")

In [49]:
# --------------------
# 🟢 2. English Dataset
# --------------------

def process_english(input_path, output_path):
    df = pd.read_csv(input_path)
    df = df[['Text', 'Label']]
    df.columns = ['text', 'label']
    df['language'] = 'en'

    df['label'] = df['label'].astype(str).str.lower().str.strip()
    df = df[df['label'].isin(['positive', 'negative'])]

    sampled = stratified_downsample(df, 65000)
    sampled.to_csv(output_path, index=False)
    print(f"✅ English dataset saved: {output_path}")

In [50]:
# -------------------
# 🟢 3. French Dataset
# -------------------

def process_french(input_path, output_path):
    df = pd.read_csv(input_path)
    df = df[['text', 'label']]
    df.columns = ['text', 'label']
    df['language'] = 'fr'

    label_map = {
        '0': 'negative', '1': 'positive', '2': 'neutral',
        0: 'negative', 1: 'positive', 2: 'neutral'
    }
    df['label'] = df['label'].map(label_map)
    df = df[df['label'].isin(['positive', 'negative'])]

    sampled = stratified_downsample(df, 65000)
    sampled.to_csv(output_path, index=False)
    print(f"✅ French dataset saved: {output_path}")

In [54]:
#process_portuguese("portuguese.csv", "portuguese_cleaned_65k.csv")
#process_english("english.csv", "english_cleaned_65k.csv")
process_french("french.csv", "french_cleaned_65k.csv")

✅ French dataset saved: french_cleaned_65k.csv


In [55]:
en = pd.read_csv("english_cleaned_65k.csv")
pt = pd.read_csv("portuguese_cleaned_65k.csv")
fr = pd.read_csv("french_cleaned_65k.csv")

combined = pd.concat([en, pt, fr])
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)
combined.to_csv("multilingual_sentiment_195k.csv", index=False)

print("✅ Combined dataset saved: multilingual_sentiment_195k.csv")

✅ Combined dataset saved: multilingual_sentiment_195k.csv


## EDA

In [57]:
df = pd.read_csv("multilingual_sentiment_195k.csv")
df.head()

Unnamed: 0,text,label,language
0,Que daora acho que meu título foi cancelado :)...,positive,pt
1,"Ok je ferai cela aussi, j'aurai une lecture de...",positive,fr
2,Mes cheveux sont éloquents! -___- je me sens c...,positive,fr
3,@cutesvantae @BTS_twt YAAAA :(((( eu amo demai...,negative,pt
4,Pourquoi pensez-vous que vous avez vécu en Aus...,negative,fr


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194997 entries, 0 to 194996
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      194996 non-null  object
 1   label     194997 non-null  object
 2   language  194997 non-null  object
dtypes: object(3)
memory usage: 4.5+ MB


In [59]:
print("Total samples:", len(df))

Total samples: 194997


In [60]:
print(df['language'].value_counts())

language
pt    64999
fr    64999
en    64999
Name: count, dtype: int64


In [61]:
pd.crosstab(df['language'], df['label'])

label,negative,positive
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,32211,32788
fr,32850,32149
pt,32518,32481
