# Basic Algorithms trained on WELFAKE dataset

In [9]:
import os
import re
import json
import joblib
import pandas as pd
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
import nltk
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
import string


In [None]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
#nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/vlad.cristescu/nltk_data...


True

In [13]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def wordopt(text):
  text=text.lower()
  text=re.sub('\[.*?\]','',text)
  text = re.sub(r"https?://\S+|www\.\S+", '', text)
  text = re.sub(r"<.*?>+", '', text)
  text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub(r'\n', '', text)
  text = re.sub(r'\w*\d\w*', '', text)  # eliminare cuvinte cu cifre
  text = re.sub(r'\s+', ' ', text).strip()  
  text = re.sub(r'[“”‘’]', '', text)  


  text = " ".join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)

  return text

def train_and_evaluate(vec_name, vectorizer, clf_name, clf, X_train, X_test, y_train, y_test, save_dir):
    """
    Construiește un pipeline (vectorizator + clasificator), îl antrenează, evaluează performanța
    și salvează modelul cu numele {Classifier}_{Vectorizer}.joblib.
    """
    pipe = make_pipeline(vectorizer, clf)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    filename = f"{clf_name}_{vec_name}.joblib".replace(" ", "_")
    model_path = os.path.join(save_dir, filename)
    joblib.dump(pipe, model_path)
    print(f"Saved model: {model_path} | Accuracy: {acc:.4f}")
    
    return {
        "vectorizer": vec_name,
        "classifier": clf_name,
        "accuracy": acc,
        "report": report,
        "model_path": model_path
    }

In [3]:
save_dir = "../saved_models"
os.makedirs(save_dir, exist_ok=True)

In [25]:

data = pd.read_csv("../datasets/WELFake_Dataset.csv")

In [26]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [29]:
data.loc[3,"text"]

'A dozen politically active pastors came here for a private dinner Friday night to hear a conversion story unique in the context of presidential politics: how Louisiana Gov. Bobby Jindal traveled from Hinduism to Protestant Christianity and, ultimately, became what he calls an “evangelical Catholic.”\n\nOver two hours, Jindal, 42, recalled talking with a girl in high school who wanted to “save my soul,” reading the Bible in a closet so his parents would not see him and feeling a stir while watching a movie during his senior year that depicted Jesus on the cross.\n\n“I was struck, and struck hard,” Jindal told the pastors. “This was the Son of God, and He had died for our sins.”\n\nJindal’s session with the Christian clergy, who lead congregations in the early presidential battleground states of Iowa and South Carolina, was part of a behind-the-scenes effort by the Louisiana governor to find a political base that could help propel him into the top tier of Republican candidates seeking t

In [5]:
print(data.shape)
data = data.dropna(subset=['text'])
data.shape

(72134, 4)


(72095, 4)

In [16]:
data['text'] = data['text'].apply(wordopt)

In [18]:
X = data['text'].values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
vectorizers = {
    "Bag_of_Words_(1-3gram)": CountVectorizer(ngram_range=(1, 3)),
    "TFIDF_(1-3gram)": TfidfVectorizer(ngram_range=(1, 3))
}

classifiers = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "NaiveBayes": MultinomialNB(),
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
}

In [21]:
save_dir = "../saved_models"
os.makedirs(save_dir, exist_ok=True)

In [22]:
tasks = []
for vec_name, vectorizer in vectorizers.items():
    for clf_name, clf in classifiers.items():
        tasks.append((vec_name, vectorizer, clf_name, clf))


In [23]:
results = Parallel(n_jobs=-1)(
    delayed(train_and_evaluate)(vec_name, vectorizer, clf_name, clf,
                                X_train, X_test, y_train, y_test, save_dir)
    for vec_name, vectorizer, clf_name, clf in tasks
)

Saved model: ../saved_models/NaiveBayes_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.9398
Saved model: ../saved_models/NaiveBayes_TFIDF_(1-3gram).joblib | Accuracy: 0.9109
Saved model: ../saved_models/KNN_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.6699
Saved model: ../saved_models/KNN_TFIDF_(1-3gram).joblib | Accuracy: 0.5180
Saved model: ../saved_models/LogisticRegression_TFIDF_(1-3gram).joblib | Accuracy: 0.9465
Saved model: ../saved_models/RandomForest_TFIDF_(1-3gram).joblib | Accuracy: 0.9250
Saved model: ../saved_models/RandomForest_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.9348
Saved model: ../saved_models/LogisticRegression_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.9641
Saved model: ../saved_models/SVM_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.9530
Saved model: ../saved_models/SVM_TFIDF_(1-3gram).joblib | Accuracy: 0.9601


In [None]:
vec_name = "TFIDF_(1-3gram)"
clf_name = "SVM"
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
clf = SVC(random_state=42)

save_dir = "saved_models"

train_and_evaluate(vec_name, vectorizer, clf_name, clf,
                   X_train, X_test, y_train, y_test, save_dir)

In [24]:
results_summary = {"results": results}
results_file = os.path.join(save_dir, "results_summary.json")
with open(results_file, "w") as f:
    json.dump(results_summary, f, indent=4)
print(f"Results summary saved to {results_file}")

Results summary saved to ../saved_models/results_summary.json
