In [None]:
import pandas as pd
import yaml

# Load the project configuration; the dataset locations are read from its
# "fake_news" section below.
# The encoding is passed explicitly so parsing does not depend on the
# platform's default locale encoding.
with open("../../config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)


# Training and test CSVs, at the paths named in the config.
df = pd.read_csv(cfg["fake_news"]["train_dataset"])
df_test = pd.read_csv(cfg["fake_news"]["test_dataset"])

In [None]:
# Display the column labels of the training frame.
df.columns

In [None]:
# Columns retained in both frames; every other column is dropped.
columns_to_keep = ["id", "title", "text", "label"]

# Work on a copy so the raw training frame `df` is left untouched.
df_copy = df.copy()
df_copy = df_copy.drop(columns=df_copy.columns.difference(columns_to_keep, sort=False))

# Dropping is driven by the columns actually present, so a frame that lacks
# one of the kept columns is handled without error.
df_test = df_test.drop(columns=df_test.columns.difference(columns_to_keep, sort=False))
df_copy

In [None]:
# Keep only the rows whose label is exactly one of these two strings.
allowed_values = ["1", "0"]
has_allowed_label = df_copy["label"].isin(allowed_values)
df_copy = df_copy[has_allowed_label]

# Show the distinct labels that remain after filtering.
df_copy["label"].unique()

In [None]:
# Preview the first remaining row of each frame before cleaning.
# `.iloc[0]` selects by position: after the label filter the row carrying
# index label 0 may be gone, in which case `series[0]` raises KeyError.
print(df_copy["title"].iloc[0])
print(df_copy["text"].iloc[0])

print(df_test["title"].iloc[0])
print(df_test["text"].iloc[0])

In [None]:
import re

# Drop every row that has a missing value in any column.
# The explicit copy gives each frame its own data, so the column assignments
# made further down do not trigger SettingWithCopyWarning.
df_clean = df_copy.dropna().copy()
# NOTE(review): this also removes incomplete rows from the test set, so those
# ids receive no prediction in the output file — confirm that is intended.
df_test = df_test.dropna().copy()


def clean_text(text):
    """Return a lowercased copy of ``text`` with markup and punctuation removed.

    The substitutions run in this order:

    1. ``<...>`` spans (an opening ``<`` up to the next ``>``) are deleted.
    2. Every character that is neither a word character nor whitespace is
       deleted (underscores count as word characters and are kept).
    3. Runs of space characters are collapsed to a single space; tabs and
       newlines are left as they are.

    Args:
        text: The string to normalise.

    Returns:
        The normalised, lowercased string.
    """
    substitutions = (
        (r"<[^>]+>", ""),
        (r"[^\w\s]", ""),
        (" +", " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()


# Normalise the text columns and turn the label into a boolean.
# `assign` builds a new frame instead of writing into `df_clean`, which avoids
# SettingWithCopyWarning when `df_clean` is flagged as a slice of another frame.
df_clean = df_clean.assign(
    title=df_clean["title"].apply(clean_text),
    text=df_clean["text"].apply(clean_text),
    # Labels were restricted to the strings "1" and "0" earlier, so this
    # comparison maps "1" -> True and "0" -> False with a real bool dtype,
    # without relying on the implicit downcasting of `Series.replace`.
    label=df_clean["label"] == "1",
)

# The test frame gets the same text normalisation; no label is touched here.
df_test = df_test.assign(
    title=df_test["title"].apply(clean_text),
    text=df_test["text"].apply(clean_text),
)

In [None]:
# Preview the first remaining row of each frame after cleaning.
# `.iloc[0]` selects by position: rows were filtered and dropped earlier, so
# index label 0 is not guaranteed to exist and `series[0]` could raise KeyError.
print(df_clean["title"].iloc[0])
print(df_clean["text"].iloc[0])

print(df_test["title"].iloc[0])
print(df_test["text"].iloc[0])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary caps for the two TF-IDF models.
TITLE_MAX_FEATURES = 1000
TEXT_MAX_FEATURES = 5000

# Separate vectorizers for titles and bodies, each fitted on the training
# frame only.
title_vectorizer = TfidfVectorizer(max_features=TITLE_MAX_FEATURES)
text_vectorizer = TfidfVectorizer(max_features=TEXT_MAX_FEATURES)

X_title = title_vectorizer.fit_transform(df_clean["title"])
X_text = text_vectorizer.fit_transform(df_clean["text"])

In [None]:
from scipy.sparse import hstack

# Place the title and text matrices side by side: title columns first, then
# text columns.
train_feature_blocks = [X_title, X_text]
X_combined = hstack(train_feature_blocks)

# Training target: the label column of the cleaned training frame.
y_train = df_clean["label"]

In [None]:
from lightgbm import LGBMClassifier
import os

# Cap the CPU count reported to the parallel backend (presumably read by
# loky/joblib — verify against the installed versions).
# The key must match the variable name exactly: with trailing whitespace a
# different, unused environment variable is set instead.
os.environ["LOKY_MAX_CPU_COUNT"] = "6"

# Prefix the vocabulary terms so title and text features stay distinguishable
# and column labels do not collide when a word occurs in both vocabularies.
title_features = ["title_" + f for f in title_vectorizer.get_feature_names_out()]
text_features = ["text_" + f for f in text_vectorizer.get_feature_names_out()]

# `X_combined` is the stacked [title | text] matrix built in the previous cell,
# so the column labels follow the same order.
X_df = pd.DataFrame.sparse.from_spmatrix(
    X_combined, columns=title_features + text_features
)

model = LGBMClassifier()
model.fit(X_df, y_train)

In [None]:
# Reuse the vectorizers fitted on the training data (transform only, no refit).
X_test_title = title_vectorizer.transform(df_test["title"])
X_test_text = text_vectorizer.transform(df_test["text"])
X_test_combined = hstack([X_test_title, X_test_text])

# Same column labels, in the same order, as the frame the model was fitted on.
test_columns = [*title_features, *text_features]
X_test_df = pd.DataFrame.sparse.from_spmatrix(X_test_combined, columns=test_columns)

y_pred = model.predict(X_test_df)

In [None]:
# Pair every test id with its predicted label (cast to int) and write the
# result to CSV without the index column.
df_result = df_test[["id"]].assign(label=y_pred.astype(int))
df_result.to_csv("output.csv", index=False)

In [None]:
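# NOTE: left disabled. As written it would compare `y_pred` (predictions for
# the rows of df_test) against df_clean["label"] (the training labels), which
# belong to different rows; a held-out split with known labels is needed before
# this score is meaningful.
# from sklearn.metrics import f1_score

# y_test = df_clean["label"]
# f1 = f1_score(y_test, y_pred, average="binary")
# f1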