In [36]:
# Install required libraries: Hugging Face Transformers, Datasets, and Scikit-learn for model building, dataset handling, and evaluation
!pip install transformers datasets scikit-learn --quiet

In [37]:
# Importing all the necessary libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

In [38]:
# Read the PolitiFact fact-check CSV from the mounted Google Drive folder
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Fact-checker-ML-dataset/politifact_factcheck_data.csv"
)

# Preview the first five rows to sanity-check the load
df.head(5)

Unnamed: 0,verdict,statement_originator,statement,statement_date,statement_source,factchecker,factcheck_date,factcheck_analysis_link
0,true,Barack Obama,John McCain opposed bankruptcy protections for...,6/11/2008,speech,Adriel Bettelheim,6/16/2008,https://www.politifact.com/factchecks/2008/jun...
1,false,Matt Gaetz,"""Bennie Thompson actively cheer-led riots in t...",6/7/2022,television,Yacob Reyes,6/13/2022,https://www.politifact.com/factchecks/2022/jun...
2,mostly-true,Kelly Ayotte,"Says Maggie Hassan was ""out of state on 30 day...",5/18/2016,news,Clay Wirestone,5/27/2016,https://www.politifact.com/factchecks/2016/may...
3,false,Bloggers,"""BUSTED: CDC Inflated COVID Numbers, Accused o...",2/1/2021,blog,Madison Czopek,2/5/2021,https://www.politifact.com/factchecks/2021/feb...
4,half-true,Bobby Jindal,"""I'm the only (Republican) candidate that has ...",8/30/2015,television,Linda Qiu,8/30/2015,https://www.politifact.com/factchecks/2015/aug...


In [39]:
# Summarise the raw frame: column names, non-null counts (to spot missing values) and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21152 entries, 0 to 21151
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   verdict                  21152 non-null  object
 1   statement_originator     21152 non-null  object
 2   statement                21152 non-null  object
 3   statement_date           21152 non-null  object
 4   statement_source         21152 non-null  object
 5   factchecker              21131 non-null  object
 6   factcheck_date           21152 non-null  object
 7   factcheck_analysis_link  21152 non-null  object
dtypes: object(8)
memory usage: 1.3+ MB


In [40]:
# Canonicalise the verdict labels: trim surrounding whitespace, then lowercase,
# so the same rating always appears as the same string
df["verdict"] = (
    df["verdict"]
    .str.strip()
    .str.lower()
)

In [41]:
# List the distinct verdict labels present after normalisation
print(df["verdict"].unique())

['true' 'false' 'mostly-true' 'half-true' 'pants-fire' 'mostly-false']


In [42]:
# Drop rows with a missing statement, verdict, or fact-checker name.
# NOTE(review): "factchecker" is not used in the T5 input/target built below, so
# requiring it discards rows that would otherwise be usable — confirm this is intended.
df = df.dropna(subset=["statement", "verdict", "factchecker"])

# Collapse the six PolitiFact ratings into two classes.
# "half-true" is grouped with the "true" class.
verdict_map = {
    "true": "true",
    "mostly-true": "true",
    "half-true": "true",
    "mostly-false": "false",
    "false": "false",
    "pants-fire": "false"
}

# Keep only rows whose verdict has a binary mapping, so .map() below cannot
# produce NaN labels. .copy() makes the filtered result an independent frame:
# the column assignments that follow would otherwise be writes onto a slice of
# the previous frame, which is what triggers pandas' SettingWithCopyWarning.
df = df[df["verdict"].isin(verdict_map.keys())].copy()
df["verdict"] = df["verdict"].map(verdict_map)

# Format for T5: the model input is the statement prefixed with the task name,
# and the target is the binary verdict as text
df["input_text"] = "classify: " + df["statement"]
df["target_text"] = df["verdict"]

# Keep only the columns needed for training; copy so df_t5 is decoupled from df
df_t5 = df[["input_text", "target_text"]].copy()
df_t5.head()

Unnamed: 0,input_text,target_text
0,classify: John McCain opposed bankruptcy prote...,True
1,"classify: ""Bennie Thompson actively cheer-led ...",False
2,"classify: Says Maggie Hassan was ""out of state...",True
3,"classify: ""BUSTED: CDC Inflated COVID Numbers,...",False
4,"classify: ""I'm the only (Republican) candidate...",True


In [43]:
# Re-check the frame after cleaning: row count, non-null counts and dtypes,
# now including the added input_text / target_text columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21131 entries, 0 to 21151
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   verdict                  21131 non-null  object
 1   statement_originator     21131 non-null  object
 2   statement                21131 non-null  object
 3   statement_date           21131 non-null  object
 4   statement_source         21131 non-null  object
 5   factchecker              21131 non-null  object
 6   factcheck_date           21131 non-null  object
 7   factcheck_analysis_link  21131 non-null  object
 8   input_text               21131 non-null  object
 9   target_text              21131 non-null  object
dtypes: object(10)
memory usage: 1.8+ MB


In [44]:
# Count how many statements fall into each binary class to gauge class imbalance
print(df["target_text"].value_counts())

target_text
false    11756
true      9375
Name: count, dtype: int64


In [45]:
# Balanced weights derived from the label frequencies in target_text.
# NOTE(review): compute_sample_weight returns one weight per row (not one per class),
# and it is computed on the full frame before the train/validation/test split —
# confirm this matches how the weights are consumed later.
class_weights = class_weight.compute_sample_weight(
    class_weight="balanced",
    y=df["target_text"],
)

In [46]:
# Stage 1: hold out 20% of the data, stratified on the label so both parts
# keep the same true/false ratio; the remaining 80% is the training set
train_df, temp_df = train_test_split(
    df_t5, test_size=0.2, stratify=df_t5["target_text"], random_state=42
)

# Stage 2: cut the held-out 20% in half (again stratified) to obtain
# validation and test sets of 10% each
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["target_text"], random_state=42
)

# Root folder in Google Drive under which the three splits are stored
base_save_path = "/content/drive/MyDrive/test-train-val-new-fact-cheker-ml-datasets/"

# One sub-folder per split
train_dir = os.path.join(base_save_path, "train")
val_dir = os.path.join(base_save_path, "validation")
test_dir = os.path.join(base_save_path, "test")

# (display label, target folder, frame, file name) for every split
split_table = (
    ("Train", train_dir, train_df, "train_data.csv"),
    ("Validation", val_dir, val_df, "validation_data.csv"),
    ("Test", test_dir, test_df, "test_data.csv"),
)

# Create all folders first (no error if they already exist)
for _, split_dir, _, _ in split_table:
    os.makedirs(split_dir, exist_ok=True)

# Write each split to its own CSV, without the pandas index column
for _, split_dir, split_frame, csv_name in split_table:
    split_frame.to_csv(os.path.join(split_dir, csv_name), index=False)

# Report where each file was written
print("Saved the datasets in separate directories:")
for split_label, split_dir, _, csv_name in split_table:
    print(f"{split_label}: {split_dir}/{csv_name}")

Saved the datasets in separate directories:
Train: /content/drive/MyDrive/test-train-val-new-fact-cheker-ml-datasets/train/train_data.csv
Validation: /content/drive/MyDrive/test-train-val-new-fact-cheker-ml-datasets/validation/validation_data.csv
Test: /content/drive/MyDrive/test-train-val-new-fact-cheker-ml-datasets/test/test_data.csv
