In [None]:
import pandas as pd

# Read the labelled training CSV into a DataFrame.
dataset_path = '/content/train_data_mal_fake_detect.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Cell output: the first five rows alongside the (rows, columns) shape.
(df.head(), df.shape)

(          ID                                               News       Label
 0  FAKE_1001  കേള്‍വി തകരാറുള്ള കുട്ടികള്‍ക്ക് നടത്തുന്ന സൗജ...  FALSE     
 1  FAKE_1002  ചന്ദ്രയാന് കേരള മുഖ്യമന്ത്രി പിണറായി വിജയൻ മാത...  FALSE     
 2  FAKE_1003  പിണറായി വിജയന്‍ സര്‍ക്കാര്‍ നിര്‍മിച്ച കേരളത്ത...  FALSE     
 3  FAKE_1004  വിഴിഞ്ഞത്ത് തീരദേശവാസികള്‍ ആക്രമിച്ചപ്പോള്‍ മു...  FALSE     
 4  FAKE_1005  കുരിശിന് മുന്നില്‍ കൈകൂപ്പി നില്‍ക്കുന്ന പിണറാ...  FALSE     ,
 (1669, 3))

In [None]:
# Trim surrounding whitespace so spellings such as 'FALSE   ' and 'FALSE' collapse
# into one class (the raw preview appears to show padded label values).
df['Label'] = df['Label'].str.strip()

# After stripping, the distinct values should be exactly the class names encoded below.
unique_labels = df['Label'].unique()
print(unique_labels)

# Integer id assigned to each class name (multi-class target).
label_mapping = {
    'FALSE': 0,
    'HALF TRUE': 1,
    'MOSTLY FALSE': 2,
    'PARTLY FALSE': 3,
    'MOSTLY TRUE': 4
}

# `Series.map` silently converts any label that is missing from `label_mapping`
# into NaN, which would then leak into the training targets. Fail loudly instead.
# Missing values (NaN) in the column are not reported here and stay NaN, as before.
unmapped_labels = [
    label for label in unique_labels
    if pd.notna(label) and label not in label_mapping
]
if unmapped_labels:
    raise ValueError(f"Labels missing from label_mapping: {unmapped_labels}")

# Apply the mapping to convert text labels to integers
df['Label'] = df['Label'].map(label_mapping)

# Check the first few rows to confirm changes
print(df.head())


['FALSE' 'HALF TRUE' 'MOSTLY FALSE' 'PARTLY FALSE' 'MOSTLY TRUE']
          ID                                               News  Label
0  FAKE_1001  കേള്‍വി തകരാറുള്ള കുട്ടികള്‍ക്ക് നടത്തുന്ന സൗജ...      0
1  FAKE_1002  ചന്ദ്രയാന് കേരള മുഖ്യമന്ത്രി പിണറായി വിജയൻ മാത...      0
2  FAKE_1003  പിണറായി വിജയന്‍ സര്‍ക്കാര്‍ നിര്‍മിച്ച കേരളത്ത...      0
3  FAKE_1004  വിഴിഞ്ഞത്ത് തീരദേശവാസികള്‍ ആക്രമിച്ചപ്പോള്‍ മു...      0
4  FAKE_1005  കുരിശിന് മുന്നില്‍ കൈകൂപ്പി നില്‍ക്കുന്ന പിണറാ...      0


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['News'],
    df['Label'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Coerce every text to `str` and materialise plain Python lists for the tokenizer.
# A comprehension accepts both the pandas Series produced by the split and the
# lists produced by a previous run of this cell, so re-running the cell no longer
# fails with AttributeError (`list` has no `.astype`).
train_texts = [str(text) for text in train_texts]
val_texts = [str(text) for text in val_texts]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the 'ai4bharat/indic-bert' checkpoint.
# The token ids it produces are only meaningful to a model using the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
    'ai4bharat/indic-bert',
)


In [None]:

# Tokenizer settings shared by both splits: truncate to at most 512 tokens,
# pad within each call, and return TensorFlow tensors.
encoding_options = dict(truncation=True, padding=True, max_length=512, return_tensors="tf")

# Tokenize the text
train_encodings = tokenizer(train_texts, **encoding_options)
val_encodings = tokenizer(val_texts, **encoding_options)


In [None]:
import tensorflow as tf

# Set the batch size
batch_size = 16

# Each dataset element is an (inputs, label) pair; the encodings are converted to
# a plain dict so every tokenizer output becomes a named model input.
train_features = dict(train_encodings)
val_features = dict(val_encodings)

# Training data: shuffle with a buffer as large as the training set, then batch.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_features, train_labels))
    .shuffle(len(train_texts))
    .batch(batch_size)
)

# Validation data: batched in its original order (no shuffling).
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_features, val_labels))
    .batch(batch_size)
)


In [None]:
# TFBertForSequenceClassification stays imported in case other cells rely on the name.
from transformers import TFAutoModelForSequenceClassification, TFBertForSequenceClassification

# The classifier must come from the same checkpoint as the tokenizer: the inputs
# were encoded with that tokenizer's vocabulary, so feeding its token ids to
# 'bert-base-uncased' (a different vocabulary) pairs ids with unrelated embeddings.
# Loading via `tokenizer.name_or_path` keeps model and tokenizer in sync, and the
# Auto class picks the architecture declared by the checkpoint's config.
# NOTE(review): `from_pt=True` assumes the checkpoint publishes PyTorch weights and
# that torch is installed in this runtime — confirm, and drop the flag if native
# TensorFlow weights are available.
model = TFAutoModelForSequenceClassification.from_pretrained(
    tokenizer.name_or_path,
    num_labels=len(label_mapping),
    from_pt=True,
)

# Prepare the model for training.
# The model outputs raw logits, hence `from_logits=True`; the sparse loss and
# metric expect integer class ids as labels.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune for three epochs, evaluating on the validation set after each one.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
)

In [None]:
# Directory that receives both the fine-tuned model and the tokenizer files,
# so the pair can be reloaded together from one location.
model_path = '/content/indicBert'

model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)

In [None]:
# Read the test CSV into its own DataFrame, separate from the training data.
test_df = pd.read_csv(
    filepath_or_buffer='/content/FakeNews - Test Dataset2.csv',
)

In [None]:
# Trim surrounding whitespace from the test labels before inspecting them.
factcheck_stripped = test_df['FactCheck'].str.strip()
test_df['FactCheck'] = factcheck_stripped

# Show the distinct label spellings present in the test set.
unique_labels = factcheck_stripped.unique()
print(unique_labels)

In [None]:
# Map each label spelling found in the test file to the canonical class name
# used by the training labels.
test_label_mapping = {
    'False news': 'FALSE',
    'PARTLY False news': 'PARTLY FALSE',
    'MOSTLY False news': 'MOSTLY FALSE',
    'HALF TRUE': 'HALF TRUE',
    'Mostly False news': 'MOSTLY FALSE',
    'Half true': 'HALF TRUE',
    'Partly False news': 'PARTLY FALSE'
}

# Look labels up case-insensitively so capitalisation variants share one entry,
# and let already-canonical names map to themselves. The second point makes this
# cell safe to re-run: an exact-match `.map` would turn the harmonized values
# from a previous run into NaN.
harmonize_lookup = {raw.upper(): canonical for raw, canonical in test_label_mapping.items()}
harmonize_lookup.update({canonical: canonical for canonical in test_label_mapping.values()})

harmonized_labels = test_df['FactCheck'].str.upper().map(harmonize_lookup)

# A label the lookup does not know would silently become NaN and only surface
# later as an obscure failure; report it here. Values that were already missing
# in the file are not reported and stay missing, as before.
unknown_mask = harmonized_labels.isna() & test_df['FactCheck'].notna()
if unknown_mask.any():
    unknown_labels = test_df.loc[unknown_mask, 'FactCheck'].unique().tolist()
    raise ValueError(f"Test labels missing from test_label_mapping: {unknown_labels}")

# Apply the harmonized labels so they match the training label names
test_df['FactCheck'] = harmonized_labels


In [None]:
# Encode the harmonized test labels with the same class ids used for training.
test_labels_encoded = (
    test_df['FactCheck']
    .map(label_mapping)
    .tolist()
)

In [None]:
# Test articles as a plain list of strings, the same form used for train/val texts.
test_texts = (
    test_df['News']
    .astype(str)
    .tolist()
)

In [None]:
# Tokenize the test texts with the same settings as the training/validation texts.
test_encodings = tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="tf",
)


In [None]:
# Pair the encoded test inputs with their integer labels.
# The labels are held in `test_labels_encoded`; the name `test_labels` used here
# previously is never defined in the notebook and raised NameError.
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels_encoded
))

# Use the same batch size as during training. No shuffling, so predictions stay
# aligned with the order of `test_labels_encoded`.
test_dataset = test_dataset.batch(batch_size)


In [None]:
from numpy import argmax

# Run the fine-tuned model over the batched test set.
pred = model.predict(test_dataset)

# The predicted class of each row is the position of its largest logit.
test_logits = pred.logits
pred_labels = argmax(test_logits, axis=1)



In [None]:
from sklearn.metrics import accuracy_score

# Compare the predicted class ids against the encoded true labels.
accuracy = accuracy_score(
    y_true=test_labels_encoded,
    y_pred=pred_labels,
)


In [None]:
# Report the accuracy computed in the previous cell.
# NOTE(review): `accuracy_score` returns a fraction in [0, 1] by default, so the
# recorded output of 60.2 does not look like it came from this exact code — confirm.
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 60.2
