# Preprocessing

In [95]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# --- Configuration -----------------------------------------------------------
# Tunable values for this cell, gathered in one place instead of inline literals.
TEST_SIZE = 0.2            # fraction of rows held out for evaluation
RANDOM_STATE = 42          # seed for the train/test shuffle
TFIDF_MAX_FEATURES = 1000  # vocabulary cap; keeps the dense matrix small enough to run


def _join_title_and_text(frame):
    """Return one document per row: the title, a single space, then the text.

    Missing titles/texts are replaced by empty strings first. Without this,
    `str + NaN` yields NaN and TfidfVectorizer refuses NaN documents.
    """
    return frame['title'].fillna('') + ' ' + frame['text'].fillna('')


# --- Load and label ----------------------------------------------------------
df_true = pd.read_csv('data/True.csv')
df_false = pd.read_csv('data/Fake.csv')

# 'label' is the target: 1 for rows from True.csv, 0 for rows from Fake.csv.
df_true['label'] = 1
df_false['label'] = 0

# Stack both sources into one frame with a fresh 0..n-1 index.
df_combined = pd.concat([df_true, df_false], ignore_index=True)

# --- Encode 'subject' --------------------------------------------------------
# Integer-encode the subject column. NOTE: 'new_subject' is kept in X, but it is
# not part of the TF-IDF feature matrix built at the end of this cell.
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(df_combined['subject'])
df_combined['new_subject'] = label_encoded

X = df_combined[['title', 'text', 'new_subject']]
y = df_combined['label']

# --- Split -------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# --- TF-IDF features ---------------------------------------------------------
# The vocabulary is fitted on the training documents only and then reused for
# the test documents, so the test set does not influence the vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
X_train_text_tfidf = tfidf_vectorizer.fit_transform(_join_title_and_text(X_train))
X_test_text_tfidf = tfidf_vectorizer.transform(_join_title_and_text(X_test))

# Dense frames for the models. Carry over the split's row index so the feature
# frames stay aligned with y_train / y_test in any index-aware pandas operation.
X_train_combined = pd.DataFrame(X_train_text_tfidf.toarray(), index=X_train.index)
X_test_combined = pd.DataFrame(X_test_text_tfidf.toarray(), index=X_test.index)

# Cheap sanity checks: features and targets must describe the same rows.
assert X_train_combined.index.equals(y_train.index)
assert X_test_combined.index.equals(y_test.index)




# XGBoost

In [96]:

# Candidate hyper-parameters: 3 x 3 x 3 = 27 combinations.
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# Grid search over the candidates above, scored by F1 with 5-fold
# cross-validation on the training features.
xgb_base_model = XGBClassifier(random_state=42, enable_categorical=True)
xgb_grid_search = GridSearchCV(
    estimator=xgb_base_model,
    param_grid=xgb_param_grid,
    scoring='f1',
    cv=5,
)
xgb_grid_search.fit(X_train_combined, y_train)

# Keep both the winning configuration and the fitted best estimator.
best_xgb_model = xgb_grid_search.best_estimator_
best_xgb_params = xgb_grid_search.best_params_
print("Best XGBoost Parameters:", best_xgb_params)

# Score the best estimator on the held-out test features.
xgb_predictions = best_xgb_model.predict(X_test_combined)
xgb_f1 = f1_score(y_test, xgb_predictions)
print("XGBoost F1 Score:", xgb_f1)


Best XGBoost Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
XGBoost F1 Score: 0.9971128305808985


# Decision Tree

In [97]:
from sklearn.tree import DecisionTreeClassifier


# Candidate hyper-parameters: 4 x 3 x 3 = 36 combinations.
dt_param_grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Grid search over the candidates above, scored by F1 with 5-fold
# cross-validation on the training features.
dt_base_model = DecisionTreeClassifier(random_state=42)
dt_grid_search = GridSearchCV(
    estimator=dt_base_model,
    param_grid=dt_param_grid,
    scoring='f1',
    cv=5,
)
dt_grid_search.fit(X_train_combined, y_train)

# Keep both the winning configuration and the fitted best estimator.
best_dt_model = dt_grid_search.best_estimator_
best_dt_params = dt_grid_search.best_params_
print("Best Decision Tree Parameters:", best_dt_params)

# Score the best estimator on the held-out test features.
dt_predictions = best_dt_model.predict(X_test_combined)
dt_f1 = f1_score(y_test, dt_predictions)
print("Decision Tree F1 Score:", dt_f1)


Best Decision Tree Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}
Decision Tree F1 Score: 0.9960766212785599
