In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from nltk.corpus import stopwords
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk

In [2]:
# Read the AG News training split. The CSV ships without a header row, so the
# three column names are supplied directly to the reader.
df = pd.read_csv(
    "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv",
    header=None,
    names=["label", "title", "description"],
)
df

Unnamed: 0,label,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...


In [None]:
# Download the NLTK stop-word list (a no-op when it is already cached locally)
nltk.download('stopwords')

# Preprocess the text data: lowercase every token and drop English stop words.
# Punctuation is NOT stripped here; the BERT tokenizer splits it into its own tokens.
stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Lowercase `text` and drop whitespace-separated tokens that are English stop words."""
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in stop_words)


df["title"] = df["title"].apply(remove_stop_words)
df["description"] = df["description"].apply(remove_stop_words)
df["text"] = df["title"] + " " + df["description"]

# Map the dataset's label ids onto contiguous 0-based class indices.
# The raw ids do not start at 0 (AG News is documented as 1..4 -- confirm), while
# to_categorical and argmax both work with indices 0..num_classes-1. Using the raw
# ids made to_categorical index out of range and made y_pred incomparable to y_test.
class_ids = np.sort(df["label"].unique())
num_classes = len(class_ids)
label_index = df["label"].map({class_id: index for index, class_id in enumerate(class_ids)})

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["text"], label_index, test_size=0.2, random_state=42)

# Tokenize the text data using the BERT tokenizer.
# MAX_LENGTH caps the padded width (otherwise one long article pads every row up to
# the model limit), and return_tensors='tf' yields tensors Keras can batch directly.
MAX_LENGTH = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

# Convert the (0-based) labels to one-hot vectors
y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes)
y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes)

# Fine-tune the BERT model on the classification task.
# The model emits raw logits and the targets are one-hot, so the matching loss is
# categorical cross-entropy with from_logits=True (model.compute_loss expects
# integer labels and is not a stable Keras-facing API).
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here; that is fine for
# monitoring only, but carve out a separate validation split before using it for
# model selection or early stopping.
history = model.fit(
    dict(train_encodings),
    y_train_cat,
    epochs=3,
    batch_size=32,
    validation_data=(dict(test_encodings), y_test_cat),
)

# Evaluate the performance of the model (y_test and y_pred are both 0-based indices)
test_logits = model.predict(dict(test_encodings))["logits"]
y_pred = np.argmax(test_logits, axis=1)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1-score:", f1_score(y_test, y_pred, average='macro'))

# Use the trained model to predict the categories of a few hand-written samples
sample_texts = ["The US economy grew by 2% in the last quarter",
                "The World Cup is coming to Qatar in 2022",
                "Scientists have discovered a new planet in the solar system",
                "The stock market is crashing due to the pandemic"]
# Apply the same preprocessing as the training text so inference inputs match
sample_encodings = tokenizer(
    [remove_stop_words(text) for text in sample_texts],
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    return_tensors='tf',
)
sample_pred = np.argmax(model.predict(dict(sample_encodings))["logits"], axis=1)
# sample_pred holds 0-based class indices; class_ids[sample_pred] recovers the dataset's label ids
print("Sample predictions:", sample_pred)
