In [14]:
import pandas as pd
import numpy as np

# Read the three LIAR split files (train / validation / test) from the
# working directory, in that order, into one DataFrame each.
train_df, val_df, test_df = map(
    pd.read_csv,
    ('train_liar.csv', 'val_liar.csv', 'test_liar.csv'),
)

In [9]:
# Inspect the training frame's column labels; as the cell's last expression,
# the resulting Index is rendered as the cell output.
train_df.columns

Index(['id', 'label', 'statement', 'subject', 'speaker', 'speaker_job_title',
       'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',
       'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts',
       'context', 'statement_len', 'polarity', 'subjectivity', 'label_num'],
      dtype='object')

In [10]:
# Sanity check: the train and test frames must each carry exactly the same
# column labels, in the same order, as the validation frame.
columns_match = all(
    np.array_equal(frame.columns, val_df.columns)
    for frame in (train_df, test_df)
)
print(
    "All datasets have the same columns in the correct order"
    if columns_match
    else "Column mismatch detected"
)


All datasets have the same columns in the correct order


In [11]:
# Column holding the label the model is trained to predict.
target = 'label_num'

# Single free-text column (vectorised separately from the other features).
text_feature = 'statement'

# Numeric inputs: three statement-level measures followed by the five
# *_counts columns.
numeric_features = [
    'statement_len',
    'polarity',
    'subjectivity',
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
]

# Categorical inputs describing the statement's topic, speaker and setting.
categorical_features = [
    'subject',
    'speaker',
    'speaker_job_title',
    'state_info',
    'party_affiliation',
    'context',
]

In [12]:
# Model inputs: the text column first, then the numeric and categorical
# columns, in the order the feature lists define them.
feature_columns = [text_feature, *numeric_features, *categorical_features]
X = train_df[feature_columns]

# Prediction target taken from the same (training) frame.
y = train_df[target]

In [15]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.stats import randint

In [16]:
# Build the preprocessing + random-forest pipeline, tune it with a small
# randomized search, and report metrics on a hold-out split.

# One seed for every stochastic step in this cell (hold-out split, forest,
# hyper-parameter sampling) so Restart & Run All reproduces the same result.
SEED = 42

# TF-IDF over the statement text: vocabulary capped at 1000 terms, English
# stop words removed.
text_transformer = TfidfVectorizer(max_features=1000, stop_words='english')

# Numeric columns: fill missing values with the column mean, then standardise.
num_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. handle_unknown='ignore' makes
# categories unseen during fit encode as all zeros instead of raising.
cat_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)

# text_feature is passed as a bare string (not a list) so the vectorizer
# receives a 1-D column of documents rather than a 2-D frame.
preprocessor = ColumnTransformer(transformers=[
    ('text', text_transformer, text_feature),
    ('num', num_transformer, numeric_features),
    ('cat', cat_transformer, categorical_features)
])

# Hold out 20% of the rows in X / y for the final report.
# NOTE(review): X and y come from train_df only; val_df and test_df are loaded
# earlier but not used in this cell -- confirm whether the provided splits
# should be used for validation / final evaluation instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# make_pipeline names each step after its lower-cased class name, which is
# where the 'randomforestclassifier__' prefix in param_grid comes from.
pipeline = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_jobs=-1, random_state=SEED)
)
# Cache fitted transformers on disk under ./cache_dir.
pipeline.set_params(memory='cache_dir')

# Search space: n_estimators is drawn uniformly from the integers 50..199
# (randint's upper bound is exclusive); the lists are sampled uniformly.
param_grid = {
    'randomforestclassifier__n_estimators': randint(50, 200),
    'randomforestclassifier__max_depth': [None, 10, 20, 30],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4]
}

# 5 sampled candidates x 2 CV folds = 10 fits, scored by weighted F1.
# random_state fixes which candidates are sampled; without it every run
# explores a different set of hyper-parameters.
# NOTE(review): both the search and the forest request n_jobs=-1; confirm this
# nesting does not oversubscribe cores on the target machine.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=5,
    cv=2,
    scoring='f1_weighted',
    verbose=2,
    n_jobs=-1,
    random_state=SEED
)

search.fit(X_train, y_train)

# best_estimator_ is the pipeline refit on all of X_train with the best
# sampled parameters (RandomizedSearchCV's default refit=True).
y_pred = search.best_estimator_.predict(X_test)
print("\nBest Parameters:\n", search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 2 folds for each of 5 candidates, totalling 10 fits

Best Parameters:
 {'randomforestclassifier__max_depth': None, 'randomforestclassifier__min_samples_leaf': 4, 'randomforestclassifier__min_samples_split': 10, 'randomforestclassifier__n_estimators': 139}

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.22      0.31       148
           1       0.41      0.61      0.49       401
           2       0.65      0.25      0.36       339
           3       0.36      0.57      0.44       438
           4       0.44      0.52      0.48       382
           5       0.68      0.23      0.34       340

    accuracy                           0.43      2048
   macro avg       0.51      0.40      0.40      2048
weighted avg       0.50      0.43      0.42      2048



In [17]:
# Re-score the hold-out split with the refit best pipeline and print the
# selected hyper-parameters followed by the per-class metrics.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)

print("\nBest Parameters:\n", search.best_params_)
print("\nClassification Report:\n", report)


Best Parameters:
 {'randomforestclassifier__max_depth': None, 'randomforestclassifier__min_samples_leaf': 4, 'randomforestclassifier__min_samples_split': 10, 'randomforestclassifier__n_estimators': 139}

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.22      0.31       148
           1       0.41      0.61      0.49       401
           2       0.65      0.25      0.36       339
           3       0.36      0.57      0.44       438
           4       0.44      0.52      0.48       382
           5       0.68      0.23      0.34       340

    accuracy                           0.43      2048
   macro avg       0.51      0.40      0.40      2048
weighted avg       0.50      0.43      0.42      2048

