# Basic Algorithms trained on Combined Corpus dataset

In [1]:
import os
import re
import json
import joblib
import pandas as pd
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
import nltk
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
import string


In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vlad.cristescu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /home/vlad.cristescu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/vlad.cristescu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/vlad.cristescu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def wordopt(text):
  text=text.lower()
  text=re.sub('\[.*?\]','',text)
  text = re.sub(r"https?://\S+|www\.\S+", '', text)
  text = re.sub(r"<.*?>+", '', text)
  text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
  text = re.sub(r'\n', '', text)
  text = re.sub(r'\w*\d\w*', '', text)  # eliminare cuvinte cu cifre
  text = re.sub(r'\s+', ' ', text).strip() 
  text = re.sub(r'[“”‘’]', '', text)  


  text = " ".join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)

  return text

def train_and_evaluate(vec_name, vectorizer, clf_name, clf, X_train, X_test, y_train, y_test, save_dir):
   
    pipe = make_pipeline(vectorizer, clf)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    filename = f"{clf_name}_{vec_name}.joblib".replace(" ", "_")
    model_path = os.path.join(save_dir, filename)
    joblib.dump(pipe, model_path)
    print(f"Saved model: {model_path} | Accuracy: {acc:.4f}")
    
    return {
        "vectorizer": vec_name,
        "classifier": clf_name,
        "accuracy": acc,
        "report": report,
        "model_path": model_path
    }

In [5]:
save_dir = "../saved_models"
os.makedirs(save_dir, exist_ok=True)

In [6]:

data = pd.read_csv("../../datasets/Combined_Corpus/All.csv")

In [7]:
data.head()

Unnamed: 0,Statement,Label,Statement_length,word_count,char_count,avg_word_length
0,More Than 100 Million Americans Are On Welfare...,0,4669,833,3836,4.605042
1,"There is no commitment to provide players, and...",1,2291,387,1904,4.919897
2,Does Uptick in Mysterious Booms Foretell Mega-...,0,8810,1450,7360,5.075862
3,Rand Paul Exposes the Crony Federal Reserve on...,0,73,12,61,5.083333
4,"Andrew S. Grove, the longtime chief executive ...",1,12473,1980,10494,5.3


In [8]:
print(data.shape)
data = data[data['word_count'] >= 30]
print(data.shape)

(86531, 6)
(82540, 6)


In [10]:
data['Statement'] = data['Statement'].apply(wordopt)

In [11]:
X = data['Statement'].values
y = data['Label'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [18]:
vectorizers = {
    "Bag_of_Words_(1-3gram)": CountVectorizer(ngram_range=(1, 3),max_features=20000),
    "TFIDF_(1-3gram)": TfidfVectorizer(ngram_range=(1, 3),max_features=20000)
}

classifiers = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "NaiveBayes": MultinomialNB(),
    "SVM": SVC(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
}

In [19]:
tasks = []
for vec_name, vectorizer in vectorizers.items():
    for clf_name, clf in classifiers.items():
        tasks.append((vec_name, vectorizer, clf_name, clf))


In [21]:
results = Parallel(n_jobs=4)(
    delayed(train_and_evaluate)(vec_name, vectorizer, clf_name, clf,
                                X_train, X_test, y_train, y_test, save_dir)
    for vec_name, vectorizer, clf_name, clf in tasks
)

Saved model: ../saved_models/NaiveBayes_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.8976
Saved model: ../saved_models/LogisticRegression_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.9771
Saved model: ../saved_models/RandomForest_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.9653
Saved model: ../saved_models/RandomForest_TFIDF_(1-3gram).joblib | Accuracy: 0.9654




Saved model: ../saved_models/KNN_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.7463
Saved model: ../saved_models/LogisticRegression_TFIDF_(1-3gram).joblib | Accuracy: 0.9758
Saved model: ../saved_models/NaiveBayes_TFIDF_(1-3gram).joblib | Accuracy: 0.9466
Saved model: ../saved_models/KNN_TFIDF_(1-3gram).joblib | Accuracy: 0.8493
Saved model: ../saved_models/SVM_Bag_of_Words_(1-3gram).joblib | Accuracy: 0.9627
Saved model: ../saved_models/SVM_TFIDF_(1-3gram).joblib | Accuracy: 0.9814


In [None]:
# vec_name = "TFIDF_(1-3gram)"
# clf_name = "SVM"
# vectorizer = TfidfVectorizer(ngram_range=(1, 3))
# clf = SVC(random_state=42)

# # Directorul de salvare
# save_dir = "saved_models"

# train_and_evaluate(vec_name, vectorizer, clf_name, clf,
#                    X_train, X_test, y_train, y_test, save_dir)

In [22]:
results_summary = {"results": results}
results_file = os.path.join(save_dir, "results_summary.json")
with open(results_file, "w") as f:
    json.dump(results_summary, f, indent=4)
print(f"Results summary saved to {results_file}")

Results summary saved to ../saved_models/results_summary.json
