In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Experiment one
# Mount Google Drive first: the dataset is read from it and the trained model
# is saved back to it at the end of this cell.
drive.mount('/drive')

# Tokenizer of the pretrained Romanian BERT checkpoint, loaded by its
# Hugging Face model id. Only the tokenizer is used; the BERT weights are not.
tokenizer = AutoTokenizer.from_pretrained(
    "dumitrescustefan/bert-base-romanian-cased-v1"
)

max_length = 512


def encode(text):
    """Tokenize ``text`` into a fixed-length sequence of token ids.

    The tokenizer is asked to truncate longer inputs and pad shorter ones to
    ``max_length`` tokens, so every document yields a sequence of equal size.

    Args:
        text: The raw document text to tokenize.

    Returns:
        Whatever ``tokenizer.encode`` returns (a list of token ids for
        Hugging Face tokenizers).
    """
    tokenizer_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer.encode(text, **tokenizer_options)

# Load the experiment-one dataset from the mounted Drive.
df = pd.read_csv("/drive/My Drive/dataset_experiment_one.csv")

# Tokenize every document. `encode` pads/truncates to `max_length`, so the
# per-row id lists should all have the same length and stack into one matrix.
encoded_sequences = df['content'].map(encode)
padded_sequences = np.array(list(encoded_sequences))

# Map the tag values to integer class ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['tag'])

# Stratified split: 10% is held out for testing, then 11% of the remainder
# (roughly 10% of the full dataset) becomes the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.11,
    stratify=y_temp,
    random_state=42,
)



# Size the embedding table from the full tokenizer vocabulary. `len(tokenizer)`
# also counts tokens added on top of the base vocabulary, whereas
# `tokenizer.vocab_size` does not, so every id the tokenizer can emit has a row.
vocab_size = len(tokenizer)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable across runs (the data splits are already seeded).
# Imported here so this step stays self-contained within the cell.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# The network ends in a single sigmoid unit trained with binary cross-entropy,
# which is only meaningful for exactly two classes; fail loudly otherwise
# instead of silently training on labels outside {0, 1}.
if len(label_encoder.classes_) != 2:
    raise ValueError(
        "Binary sigmoid model expects exactly 2 classes in 'tag', "
        f"found {len(label_encoder.classes_)}: {list(label_encoder.classes_)}"
    )

# Text CNN: token embeddings -> 1-D convolution over 5-token windows ->
# max over the sequence -> small dense head with dropout -> sigmoid probability.
# The sequence length is supplied through `model.build` below, so the
# `input_length` argument (deprecated in Keras 3) is not passed to Embedding.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
model.build(input_shape=(None, max_length))
model.summary()

# Train for a fixed 10 epochs, monitoring the validation split each epoch.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Held-out evaluation: loss/accuracy from Keras, then hard predictions obtained
# by thresholding the sigmoid probability at 0.5.
loss, accuracy = model.evaluate(X_test, y_test)
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()

# Pin the confusion-matrix axes to the encoder's class ids (0..n_classes-1).
# Without `labels`, scikit-learn keeps only the classes that occur in y_test or
# y_pred, so a missing class would shrink the matrix and misalign it with the
# tick labels taken from `label_encoder.classes_`.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(label_encoder.classes_)))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

# Support-weighted precision/recall/F1 plus the Keras test accuracy and loss.
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1-score:", f1_score(y_test, y_pred, average='weighted'))
print(f"Test Accuracy: {accuracy:.5f}")
print(f"Test Loss: {loss:.5f}")

# Persist the trained model to Drive in the native Keras format.
model.save("/drive/My Drive/cnn_real_testing_train_and_test.keras")

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Experiment two
# Mount Google Drive first: the training/testing datasets are read from it and
# the trained model is saved back to it at the end of this cell.
drive.mount('/drive')

# Tokenizer of the pretrained Romanian BERT checkpoint, loaded by its
# Hugging Face model id. Only the tokenizer is used; the BERT weights are not.
tokenizer = AutoTokenizer.from_pretrained(
    "dumitrescustefan/bert-base-romanian-cased-v1"
)

max_length = 512


def encode(text):
    """Tokenize ``text`` into a fixed-length sequence of token ids.

    The tokenizer is asked to truncate longer inputs and pad shorter ones to
    ``max_length`` tokens, so every document yields a sequence of equal size.

    Args:
        text: The raw document text to tokenize.

    Returns:
        Whatever ``tokenizer.encode`` returns (a list of token ids for
        Hugging Face tokenizers).
    """
    tokenizer_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer.encode(text, **tokenizer_options)


# Experiment two uses separate, pre-split training and testing files.
train_df = pd.read_csv("/drive/My Drive/dataset_experiment_two_training.csv")
test_df = pd.read_csv("/drive/My Drive/dataset_experiment_two_testing.csv")

# Tokenize both sets. `encode` pads/truncates to `max_length`, so the per-row
# id lists should all have the same length and stack into one matrix per set.
encoded_sequences_train = train_df['content'].map(encode)
padded_sequences_train = np.array(list(encoded_sequences_train))
encoded_sequences_test = test_df['content'].map(encode)
padded_sequences_test = np.array(list(encoded_sequences_test))

# Fit the tag -> integer mapping on the training tags only.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(train_df['tag'])

# Carve a stratified 10% validation split out of the training file.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences_train,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)

# The testing file is used as-is, encoded with the mapping learned on training.
X_test = padded_sequences_test
y_test = label_encoder.transform(test_df['tag'])





# Size the embedding table from the full tokenizer vocabulary. `len(tokenizer)`
# also counts tokens added on top of the base vocabulary, whereas
# `tokenizer.vocab_size` does not, so every id the tokenizer can emit has a row.
vocab_size = len(tokenizer)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable across runs (the validation split is already seeded).
# Imported here so this step stays self-contained within the cell.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# The network ends in a single sigmoid unit trained with binary cross-entropy,
# which is only meaningful for exactly two classes; fail loudly otherwise
# instead of silently training on labels outside {0, 1}.
if len(label_encoder.classes_) != 2:
    raise ValueError(
        "Binary sigmoid model expects exactly 2 classes in 'tag', "
        f"found {len(label_encoder.classes_)}: {list(label_encoder.classes_)}"
    )

# Text CNN: token embeddings -> 1-D convolution over 5-token windows ->
# max over the sequence -> small dense head with dropout -> sigmoid probability.
# The sequence length is supplied through `model.build` below, so the
# `input_length` argument (deprecated in Keras 3) is not passed to Embedding.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
model.build(input_shape=(None, max_length))
model.summary()

# Train for a fixed 10 epochs, monitoring the validation split each epoch.
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

# Evaluation on the separate testing file: loss/accuracy from Keras, then hard
# predictions obtained by thresholding the sigmoid probability at 0.5.
loss, accuracy = model.evaluate(X_test, y_test)
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()

# Pin the confusion-matrix axes to the encoder's class ids (0..n_classes-1).
# Without `labels`, scikit-learn keeps only the classes that occur in y_test or
# y_pred; since the testing file is independent of the training file, a class
# may be absent, which would shrink the matrix and misalign it with the tick
# labels taken from `label_encoder.classes_`.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(label_encoder.classes_)))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

# Support-weighted precision/recall/F1 plus the Keras test accuracy and loss.
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1-score:", f1_score(y_test, y_pred, average='weighted'))
print(f"Test Accuracy: {accuracy:.5f}")
print(f"Test Loss: {loss:.5f}")

# Persist the trained model to Drive in the native Keras format.
model.save("/drive/My Drive/cnn_experiment_two.keras")

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Experiment three
# Mount Google Drive first: the dataset is read from it and the trained model
# is saved back to it at the end of this cell.
drive.mount('/drive')

# Tokenizer of the pretrained Romanian BERT checkpoint, loaded by its
# Hugging Face model id. Only the tokenizer is used; the BERT weights are not.
tokenizer = AutoTokenizer.from_pretrained(
    "dumitrescustefan/bert-base-romanian-cased-v1"
)

max_length = 512


def encode(text):
    """Tokenize ``text`` into a fixed-length sequence of token ids.

    The tokenizer is asked to truncate longer inputs and pad shorter ones to
    ``max_length`` tokens, so every document yields a sequence of equal size.

    Args:
        text: The raw document text to tokenize.

    Returns:
        Whatever ``tokenizer.encode`` returns (a list of token ids for
        Hugging Face tokenizers).
    """
    tokenizer_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer.encode(text, **tokenizer_options)

# Load the experiment-three dataset from the mounted Drive.
df = pd.read_csv("/drive/My Drive/dataset_experiment_three.csv")

# Tokenize every document. `encode` pads/truncates to `max_length`, so the
# per-row id lists should all have the same length and stack into one matrix.
encoded_sequences = df['content'].map(encode)
padded_sequences = np.array(list(encoded_sequences))

# Map the tag values to integer class ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['tag'])

# Stratified split: 10% is held out for testing, then 11% of the remainder
# (roughly 10% of the full dataset) becomes the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.11,
    stratify=y_temp,
    random_state=42,
)

# Size the embedding table from the full tokenizer vocabulary. `len(tokenizer)`
# also counts tokens added on top of the base vocabulary, whereas
# `tokenizer.vocab_size` does not, so every id the tokenizer can emit has a row.
vocab_size = len(tokenizer)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable across runs (the data splits are already seeded).
# Imported here so this step stays self-contained within the cell.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# The network ends in a single sigmoid unit trained with binary cross-entropy,
# which is only meaningful for exactly two classes; fail loudly otherwise
# instead of silently training on labels outside {0, 1}.
if len(label_encoder.classes_) != 2:
    raise ValueError(
        "Binary sigmoid model expects exactly 2 classes in 'tag', "
        f"found {len(label_encoder.classes_)}: {list(label_encoder.classes_)}"
    )

# Text CNN: token embeddings -> 1-D convolution over 5-token windows ->
# max over the sequence -> small dense head with dropout -> sigmoid probability.
# The sequence length is supplied through `model.build` below, so the
# `input_length` argument (deprecated in Keras 3) is not passed to Embedding.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
model.build(input_shape=(None, max_length))
model.summary()

# Train for a fixed 10 epochs, monitoring the validation split each epoch.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Held-out evaluation: loss/accuracy from Keras, then hard predictions obtained
# by thresholding the sigmoid probability at 0.5.
loss, accuracy = model.evaluate(X_test, y_test)
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()

# Pin the confusion-matrix axes to the encoder's class ids (0..n_classes-1).
# Without `labels`, scikit-learn keeps only the classes that occur in y_test or
# y_pred, so a missing class would shrink the matrix and misalign it with the
# tick labels taken from `label_encoder.classes_`.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(label_encoder.classes_)))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

# Support-weighted precision/recall/F1 plus the Keras test accuracy and loss.
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1-score:", f1_score(y_test, y_pred, average='weighted'))
print(f"Test Accuracy: {accuracy:.5f}")
print(f"Test Loss: {loss:.5f}")

# Persist the trained model to Drive in the native Keras format.
model.save("/drive/My Drive/cnn_experiment_three.keras")

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Experiment four
# Mount Google Drive: the dataset itself is read from the Hugging Face Hub,
# but the trained model is saved to Drive at the end of this cell.
drive.mount('/drive')

# Tokenizer of the pretrained Romanian BERT checkpoint, loaded by its
# Hugging Face model id. Only the tokenizer is used; the BERT weights are not.
tokenizer = AutoTokenizer.from_pretrained(
    "dumitrescustefan/bert-base-romanian-cased-v1"
)

max_length = 512


def encode(text):
    """Tokenize ``text`` into a fixed-length sequence of token ids.

    The tokenizer is asked to truncate longer inputs and pad shorter ones to
    ``max_length`` tokens, so every document yields a sequence of equal size.

    Args:
        text: The raw document text to tokenize.

    Returns:
        Whatever ``tokenizer.encode`` returns (a list of token ids for
        Hugging Face tokenizers).
    """
    tokenizer_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer.encode(text, **tokenizer_options)

# Load the combined, balanced FakeRO dataset straight from the Hugging Face Hub.
df = pd.read_json("hf://datasets/mihalca/FakeRO_updated/combined_balanced.json")

# Tokenize every document. `encode` pads/truncates to `max_length`, so the
# per-row id lists should all have the same length and stack into one matrix.
encoded_sequences = df['content'].map(encode)
padded_sequences = np.array(list(encoded_sequences))

# Map the tag values to integer class ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['tag'])

# Stratified 80/10/10 split: 20% is set aside first, then halved into
# validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Size the embedding table from the full tokenizer vocabulary. `len(tokenizer)`
# also counts tokens added on top of the base vocabulary, whereas
# `tokenizer.vocab_size` does not, so every id the tokenizer can emit has a row.
vocab_size = len(tokenizer)
# One softmax output per distinct tag seen by the label encoder.
num_classes = len(label_encoder.classes_)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable across runs (the data splits are already seeded).
# Imported here so this step stays self-contained within the cell.
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# Text CNN: token embeddings -> 1-D convolution over 5-token windows ->
# max over the sequence -> small dense head with dropout -> class probabilities.
# The sequence length is supplied through `model.build` below, so the
# `input_length` argument (deprecated in Keras 3) is not passed to Embedding.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Sparse categorical cross-entropy matches the integer class ids in y_*.
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model.build(input_shape=(None, max_length))
model.summary()

# Train for a fixed 10 epochs, monitoring the validation split each epoch.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Held-out evaluation: loss/accuracy from Keras, then the predicted class is
# the index of the largest softmax probability.
loss, accuracy = model.evaluate(X_test, y_test)
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Pin the confusion-matrix axes to the encoder's class ids (0..n_classes-1).
# Without `labels`, scikit-learn keeps only the classes that occur in y_test or
# y_pred, so a missing class would shrink the matrix and misalign it with the
# tick labels taken from `label_encoder.classes_`.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(label_encoder.classes_)))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

# Macro-averaged precision/recall/F1 (every class weighted equally) plus the
# Keras test accuracy and loss.
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1-score:", f1_score(y_test, y_pred, average='macro'))
print(f"Test Accuracy: {accuracy:.5f}")
print(f"Test Loss: {loss:.5f}")

# Persist the trained model to Drive in the native Keras format.
model.save("/drive/My Drive/cnn_experiment_four.keras")