In [None]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
from google.colab import drive
import re
import pandas as pd
import spacy
import torch


# --- Environment -----------------------------------------------------------
# Google Drive is mounted because the augmented CSV is written there at the
# end of this cell; the Romanian spaCy pipeline is loaded alongside it.
drive.mount('/drive')
nlp = spacy.load("ro_core_news_lg")

# --- Dataset ---------------------------------------------------------------
# "misinformation" and "propaganda" are folded into the single "fake_news"
# label, satire rows are dropped, and only articles with at most 550
# word-like tokens are kept for translation.
df = pd.read_json("hf://datasets/mihalca/Fakerom_updated_original/combined_data.json")
df['tag'] = df['tag'].replace(dict.fromkeys(("misinformation", "propaganda"), "fake_news"))
df = df.loc[df["tag"] != "satire"]
df_filtered = df.loc[df['content'].str.count(r'\b\w+\b') <= 550]


# --- Translation model -----------------------------------------------------
model_name_bart = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer_bart = MBart50TokenizerFast.from_pretrained(model_name_bart)

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_bart = MBartForConditionalGeneration.from_pretrained(model_name_bart).to(device)
print(f"Model loaded on: {device}")

def translate(text, src_lang, tgt_lang, max_length=1024):
    """Translate `text` from `src_lang` to `tgt_lang` with the module-level mBART model.

    Args:
        text: The text to translate (a single string).
        src_lang: mBART language code of `text`, e.g. "ro_RO".
        tgt_lang: mBART language code to translate into, e.g. "en_XX".
        max_length: Upper bound, in tokens, applied both to the encoded input
            (longer inputs are truncated) and to the generated output.
            NOTE(review): 1024 is assumed to match this checkpoint's maximum
            number of positions -- confirm before raising it.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    # The tokenizer reads the source language from this attribute.
    tokenizer_bart.src_lang = src_lang

    # Truncate explicitly so an over-long article cannot overflow the model's
    # position embeddings and abort the whole augmentation run.
    encoded = tokenizer_bart(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(device)

    # Without an explicit limit, generate() falls back to the checkpoint's
    # default generation length, which can cut article-length translations
    # short (NOTE(review): believed to be 200 tokens for this checkpoint --
    # confirm against its generation config).
    generated_tokens = model_bart.generate(
        **encoded,
        forced_bos_token_id=tokenizer_bart.lang_code_to_id[tgt_lang],
        max_length=max_length,
    )
    return tokenizer_bart.batch_decode(generated_tokens, skip_special_tokens=True)[0]

# Round-trip every filtered article through English (ro_RO -> en_XX -> ro_RO)
# while keeping its label; the running count is printed as a progress marker.
back_translated_data = []
cnt = 0
for cnt, (text, tag) in enumerate(zip(df_filtered['content'], df_filtered['tag']), start=1):
    translated_text = translate(text, "ro_RO", "en_XX")
    back_translated_text = translate(translated_text, "en_XX", "ro_RO")
    print(cnt)
    back_translated_data.append({'content': back_translated_text, 'tag': tag})

df_back_translated = pd.DataFrame(back_translated_data)

# Original rows come first, followed by their back-translated counterparts.
df_augmented = pd.concat((df_filtered, df_back_translated), ignore_index=True)

df_augmented.to_csv('/drive/My Drive/augmented_dataset_from_english.csv', index=False)

In [None]:
import pandas as pd
import re
from google.colab import drive
import spacy
from transformers import MarianMTModel, MarianTokenizer
import torch
from tqdm import tqdm


# Drive holds the CSV written at the end of this cell.
drive.mount('/drive')

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Romanian spaCy pipeline, used below for sentence splitting.
nlp = spacy.load("ro_core_news_lg")

# Load the corpus, merge the two fake-news sub-labels into "fake_news", drop
# satire, and keep only articles with at most 550 word-like tokens.
df = pd.read_json("hf://datasets/mihalca/Fakerom_updated_original/combined_data.json")
fake_news_aliases = dict.fromkeys(("misinformation", "propaganda"), "fake_news")
df['tag'] = df['tag'].replace(fake_news_aliases)
df = df.loc[df["tag"] != "satire"]
word_counts = df['content'].str.count(r'\b\w+\b')
df = df.loc[word_counts <= 550]

def load_model(model_name):
    """Load a Marian tokenizer/model pair for the checkpoint `model_name`.

    The model is moved to the module-level `device` before being returned.

    Returns:
        A `(tokenizer, model)` tuple.
    """
    marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
    marian_model = MarianMTModel.from_pretrained(model_name)
    marian_model = marian_model.to(device)
    return marian_tokenizer, marian_model

# Two Marian checkpoints form the round trip: Romanian -> French, then
# French -> Romanian.
ro_fr_tok, ro_fr_model = load_model(model_name="Helsinki-NLP/opus-mt-ro-fr")
fr_ro_tok, fr_ro_model = load_model(model_name="Helsinki-NLP/opus-mt-fr-ro")

def split_into_sentences_spacy(text):
    """Split `text` into sentences using the module-level spaCy pipeline `nlp`.

    Each sentence is stripped of surrounding whitespace. Sentences that are
    empty after stripping (a span made only of whitespace) are dropped so that
    callers never send an empty string to a translation model.

    Returns:
        A list of non-empty sentence strings, in document order.
    """
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]

def translate(texts, tokenizer, model):
    """Translate a batch of texts with the given tokenizer/model pair.

    Args:
        texts: A list of strings to translate.
        tokenizer: Tokenizer used to encode `texts` and decode the output.
        model: Model whose `generate` method produces the translation.

    Returns:
        A list with one decoded string per generated sequence, special tokens
        removed. An empty list/tuple yields an empty list without calling the
        tokenizer or the model.
    """
    # Nothing to translate: skip the tokenizer/model entirely.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    # Inputs are padded to a common length and truncated by the tokenizer,
    # then moved to the module-level `device`.
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        translated = model.generate(
            **inputs,
            max_length=512,
            num_beams=4,
            early_stopping=False
        )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

def back_translate(text):
    """Back-translate `text` sentence by sentence through French.

    The text is split with `split_into_sentences_spacy`; each sentence is
    translated with the ro->fr model and the result with the fr->ro model.
    If either step raises an ordinary exception, the original sentence is
    kept instead and the error is printed. `KeyboardInterrupt` and
    `SystemExit` are not caught, so a long run can still be stopped.

    Returns:
        The processed sentences joined by single spaces.
    """
    result = []
    for sent in split_into_sentences_spacy(text):
        # An empty sentence carries no content to translate.
        if not sent:
            continue
        try:
            fr = translate([sent], ro_fr_tok, ro_fr_model)[0]
            ro = translate([fr], fr_ro_tok, fr_ro_model)[0]
        except Exception as exc:
            # Best effort: keep the untranslated sentence, but say why.
            print(f"fallback ({type(exc).__name__}: {exc})")
            result.append(sent)
        else:
            result.append(ro)
    return " ".join(result)

# Back-translate every article through French, pairing each result with the
# label of the article it came from; tqdm reports progress over the rows.
augmented_rows = [
    {"tag": label, "content": back_translate(original_text)}
    for label, original_text in tqdm(zip(df['tag'], df['content']), total=len(df))
]

df_augmented = pd.DataFrame(augmented_rows)

df_augmented.to_csv('/drive/My Drive/augmented_dataset_from_french.csv', index=False)


In [None]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
from google.colab import drive
import re
import pandas as pd
import spacy
import torch

# PREPROCESSING
# Mount Drive and read back the French-pivot augmentation output, then load
# the Romanian spaCy pipeline used for lemmatisation further down.
drive.mount('/drive')
df_augmented = pd.read_csv('/drive/My Drive/augmented_dataset_from_french.csv')
nlp = spacy.load("ro_core_news_lg")

# Matches an http:// or https:// URL up to the next whitespace character.
url_pattern = re.compile(r'https?://\S+')
def remove_urls(text):
    """Return `text` with every http(s) URL removed.

    Non-string values (for example a missing cell loaded from the CSV) yield
    an empty string instead of raising `TypeError` inside the regex call.
    """
    if not isinstance(text, str):
        return ''
    return url_pattern.sub('', text)


# Strip URLs from the article text.
content_without_urls = df_augmented['content'].apply(remove_urls)
df_augmented['content'] = content_without_urls

# Delete every digit character from all string cells of the frame.
df_augmented = df_augmented.replace(regex=r'\d', value='')

def lemmatize_and_filter(text):
    """Return the space-joined lemmas of `text`, run through the module-level `nlp`.

    Tokens flagged as stop words, punctuation, or whitespace are left out.
    """
    def _is_content(tok):
        # Keep a token only when none of the three exclusion flags is set.
        return not (tok.is_stop or tok.is_punct or tok.is_space)

    kept_lemmas = (tok.lemma_ for tok in nlp(text) if _is_content(tok))
    return ' '.join(kept_lemmas)

# Replace each article with its filtered, lemmatised form.
df_augmented['content'] = df_augmented['content'].map(lemmatize_and_filter)


# Lowercase every string cell in the frame; non-string cells pass through as-is.
df_augmented = df_augmented.map(lambda cell: cell if not isinstance(cell, str) else cell.lower())

df_augmented.to_csv('/drive/My Drive/augmented_dataset_final_preprocessed.csv', index=False)
