In [1]:
import math

import pandas as pd

# Load the train/test splits. Both files must provide a "label" column.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Tally the labels once per split instead of recomputing value_counts()
# for every lookup.
train_label_counts = train_data["label"].value_counts()
test_label_counts = test_data["label"].value_counts()

# Label 1 is counted as "true" and label 0 as "fake".
# .get(label, 0) is an explicit label-based lookup that yields 0 when a class
# is absent from a split, rather than raising KeyError.
# NOTE(review): assumes labels are integer-coded 0/1 -- confirm against the CSVs.
true_train_lines = train_label_counts.get(1, 0)
fake_train_lines = train_label_counts.get(0, 0)

true_test_lines = test_label_counts.get(1, 0)
fake_test_lines = test_label_counts.get(0, 0)

print(f"True train data count: {true_train_lines}")
print(f"Fake train data count: {fake_train_lines}")

print(f"True test data count: {true_test_lines}")
print(f"Fake test data count: {fake_test_lines}")


True train data count: 24352
Fake train data count: 20193
True test data count: 5219
Fake test data count: 4327


In [2]:
# Majority-class baseline: always predict whichever label is more frequent in
# the training split (ties fall through to "fake"), then score that constant
# prediction against the test split. "True" is treated as the positive class.


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is not positive."""
    if denominator > 0:
        return numerator / denominator
    return math.nan


def _f1(precision, recall):
    """Return the harmonic mean of precision and recall, or NaN when undefined."""
    total = precision + recall
    # A NaN input makes `total > 0` False, so NaN propagates to the result.
    if total > 0:
        return 2 * (precision * recall) / total
    return math.nan


# Confusion-matrix cells for the constant prediction.
if true_train_lines > fake_train_lines:
    print("Model will predict true for all test data")
    # Every test row is predicted "true": no negatives are ever emitted.
    tp, fn = true_test_lines, 0
    fp, tn = fake_test_lines, 0
else:
    print("Model will predict fake for all test data")
    # Every test row is predicted "fake": no positives are ever emitted.
    tp, fn = 0, true_test_lines
    fp, tn = 0, fake_test_lines

# Overall accuracy, then per-class precision / recall / F1.
accuracy = (tp + tn) / (true_test_lines + fake_test_lines)

precision_true = _ratio(tp, tp + fp)
recall_true = _ratio(tp, tp + fn)
f1_true = _f1(precision_true, recall_true)

# For the "fake" class the roles of the cells are mirrored.
precision_fake = _ratio(tn, tn + fn)
recall_fake = _ratio(tn, tn + fp)
f1_fake = _f1(precision_fake, recall_fake)

# Report the metrics in a fixed order.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision (True)", precision_true),
    ("Recall (True)", recall_true),
    ("F1 Score (True)", f1_true),
    ("Precision (Fake)", precision_fake),
    ("Recall (Fake)", recall_fake),
    ("F1 Score (Fake)", f1_fake),
):
    print(f"{metric_name}: {metric_value}")

Model will predict true for all test data
Accuracy: 0.5467211397443955
Precision (True): 0.5467211397443955
Recall (True): 1.0
F1 Score (True): 0.7069420927869963
Precision (Fake): nan
Recall (Fake): 0.0
F1 Score (Fake): nan
