In [15]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset

In [5]:
df1 = pd.read_csv("News-Classification-Dataset/barely-true-cleaned.csv")
df2 = pd.read_csv("News-Classification-Dataset/false-cleaned.csv")
df3 = pd.read_csv("News-Classification-Dataset/half-true-cleaned.csv")
df4 = pd.read_csv("News-Classification-Dataset/mostly-true-cleaned.csv")
df5 = pd.read_csv("News-Classification-Dataset/pants-fire-cleaned.csv")
df6 = pd.read_csv("News-Classification-Dataset/true-cleaned.csv")


In [6]:
df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

## Checking the data for imbalances

- Class imbalances can be a serious problem for machine learning models.

If there are class imbalances, the model may not learn to predict the minority class well. This can be solves using 
- oversampling
- Weighted classes
- SMOTE (Synthetic Minority Over-sampling Technique)

In [7]:
# Basic overview of the dataset
print(f"Dataset shape: {df.shape}")
df.info()

Dataset shape: (15000, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Statement  15000 non-null  object
 1   Label      15000 non-null  object
dtypes: object(2)
memory usage: 234.5+ KB


In [8]:
# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values[missing_values > 0])  # Only show columns with missing values



Missing values per column:
Series([], dtype: int64)


In [9]:
# Check class distribution 
class_count = df['Label'].value_counts()
print("\nClass count (%):")
print(class_count)


Class count (%):
Label
barelytrue    2500
False         2500
halftrue      2500
mostlytrue    2500
pantsfire     2500
True          2500
Name: count, dtype: int64


## Tokenization

- Padding and Truncation are very crucial for the model to work properly.
- Padding is used to make all sequences the same length.
- Truncation is used to cut off sequences that are too long.
- The tokenizer will automatically pad and truncate the sequences to the maximum length of the model.
- The tokenizer will also convert the text to input IDs and attention masks.

In [12]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

## Sample text and tokenization
sample_text = df['Statement'].iloc[0]
print("\nSample text:")
print(sample_text)
print("\nTokenized sample text:")
sample_tokens = tokenizer(sample_text, truncation=True, padding='max_length', max_length=128)
print(sample_tokens)


Sample text:
Every study has shown that when work requirements are tied to federal safetynet programs it puts more people to work.

Tokenized sample text:
{'input_ids': [101, 2296, 2817, 2038, 3491, 2008, 2043, 2147, 5918, 2024, 5079, 2000, 2976, 3808, 7159, 3454, 2009, 8509, 2062, 2111, 2000, 2147, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Model
- The model is a standard BERT model that is pre-trained.

Choosing the right model is very crucial
- distilbert is used for systems for low latency and high throughput.
- bert-base-uncased is used for systems that require high accuracy and can afford to run slower.
- bert-large-uncased is used for systems that require very high accuracy and can afford to run slower.

The performace difference between the models depends on the task and the dataset. But in general, the larger the model, the better the performance.
- But distilbert usually performs within a few percentage points of bert-base-uncased and is much faster.

In [16]:
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

print(model.config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.50.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



## Label mapping

In [21]:
label_to_id = {label: id for id, label in enumerate(df['Label'].unique())}
print("\nLabel to ID mapping: ", end="")
print(label_to_id)

id_to_label = {id: label for label, id in label_to_id.items()}
print("\nID to Label mapping: ", end="")
print(id_to_label)


Label to ID mapping: {'barelytrue': 0, False: 1, 'halftrue': 2, 'mostlytrue': 3, 'pantsfire': 4, True: 5}

ID to Label mapping: {0: 'barelytrue', 1: False, 2: 'halftrue', 3: 'mostlytrue', 4: 'pantsfire', 5: True}


## Freezing layers

- Freezing layers is a technique used to prevent the model from updating the weights of certain layers during training.
- This is useful when you want to fine-tune a pre-trained model on a new task.
- Freezing layers can help to prevent overfitting and speed up training.