In [14]:
import pandas as pd
import numpy as np

# Read the three LIAR CSV files from the working directory in one pass;
# the order of the paths matches the order of the names on the left.
train_df, val_df, test_df = map(
    pd.read_csv,
    ('train_liar.csv', 'val_liar.csv', 'test_liar.csv'),
)

In [15]:
# Show the column labels of the training frame; as the last bare expression
# in the cell, the resulting Index is rendered as the cell output.
train_df.columns

Index(['id', 'label', 'statement', 'subject', 'speaker', 'speaker_job_title',
       'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
       'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts',
       'context', 'statement_len', 'polarity', 'subjectivity', 'label_num'],
      dtype='object')

In [16]:
# Sanity check: train/val/test must expose the same column labels in the same
# order. The comparison is pairwise (train vs. val, then val vs. test) and
# element-wise, so a reordered column is reported as a mismatch too.
print(
    "Column mismatch detected"
    if not np.array_equal(train_df.columns, val_df.columns)
    or not np.array_equal(val_df.columns, test_df.columns)
    else "All datasets have the same columns in the correct order"
)


All datasets have the same columns in the correct order


In [17]:
# Column roles used when assembling the model inputs.

# Label column the classifier is trained to predict.
target = 'label_num'

# Single free-text column (vectorised separately from the tabular columns).
text_feature = 'statement'

# Columns routed through the numeric preprocessing branch.
numeric_features = [
    'statement_len',
    'polarity',
    'subjectivity',
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
]

# Columns routed through the categorical preprocessing branch.
categorical_features = [
    'subject',
    'speaker',
    'speaker_job_title',
    'state_info',
    'party_affiliation',
    'context',
]

In [18]:
# Feature matrix: the text column first, then the numeric columns, then the
# categorical columns. The target is pulled out as a separate Series.
X = train_df.loc[:, [text_feature, *numeric_features, *categorical_features]]
y = train_df.loc[:, target]

In [19]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
from scipy.stats import randint

In [None]:
# Train and tune a gradient-boosting classifier on text + tabular features.
#
# Steps: build X/y, carve out a hold-out split, define per-column-type
# preprocessing, run a randomized hyperparameter search with 3-fold CV on the
# training part, then report metrics of the best pipeline on the hold-out part.

# One seed shared by the split, the classifier and the search so that
# Restart & Run All reproduces the same candidates and the same scores.
RANDOM_STATE = 42

X = train_df[[text_feature] + numeric_features + categorical_features]
y = train_df[target]

# NOTE(review): val_df / test_df are loaded earlier in the notebook, but the
# evaluation below uses a 20% slice of train_df instead — confirm this is
# intended.
# stratify=y keeps the label proportions equal in the train and hold-out parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)


# Numeric columns: fill gaps with the column mean, then standardise.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros
# (handle_unknown='ignore'); the encoder emits a dense array.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Text column: TF-IDF limited to the 5000 highest-frequency terms.
text_transformer = TfidfVectorizer(max_features=5000)


# TfidfVectorizer produces a sparse block while the other branches are dense.
# With the default sparse_threshold (0.3) the stacked result is sparse whenever
# the overall density falls below that value, and HistGradientBoostingClassifier
# requires dense input. sparse_threshold=0 always yields a dense matrix, so the
# pipeline no longer depends on how dense the data happens to be.
# NOTE(review): the dense matrix can be large (TF-IDF terms plus every one-hot
# category); watch memory use with n_jobs=-1.
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('num', num_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features)
    ],
    sparse_threshold=0,
)


pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', HistGradientBoostingClassifier(random_state=RANDOM_STATE))
])


# Search space for RandomizedSearchCV: lists are sampled uniformly, and
# randint(100, 1000) draws an integer from [100, 1000).
param_grid = {
    'clf__max_iter': randint(100, 1000),
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'clf__max_leaf_nodes': [15, 31, 63, 127],
    'clf__max_depth': [3, 5, 7, None],
    'clf__l2_regularization': [0.0, 0.1, 1.0],
}


# 20 sampled candidates x 3 folds = 60 fits, scored by weighted F1.
# random_state fixes which candidates are drawn from param_grid.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=20,
    scoring='f1_weighted',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

search.fit(X_train, y_train)


# best_estimator_ is the winning pipeline refit on all of X_train.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nBest Hyperparameters:\n", search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
