In [1]:
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import re

In [2]:
df = pd.read_csv("output_chunk_1.csv")

print(df.head(5))

  category                                               text
0     arts  ktxl community leaders and activists gathered ...
1     arts  hate crimes against asian americans and pacifi...
2     arts  people attend a vigil in solidarity with the a...
3     arts  explainer why georgia attack spurs fears in as...
4     arts  international pop star rihanna today expressed...


In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7200 entries, 0 to 7199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  7200 non-null   object
 1   text      7200 non-null   object
dtypes: object(2)
memory usage: 112.6+ KB
None


In [4]:
print(df['category'].value_counts())

category
arts             400
crime            400
unrest           400
sport            400
social           400
science          400
religion         400
politics         400
other            400
lifestyle        400
labour           400
humanInterest    400
health           400
environmental    400
education        400
economy          400
disaster         400
weather          400
Name: count, dtype: int64


In [5]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sagar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sagar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W',' ', text)
    text = re.sub(r'\s+',' ',text).strip()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(tokens)

In [7]:
df['cleaned_text'] = df["text"].apply(preprocess_text)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['category'])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [13]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [14]:
y_pred = clf.predict(X_test)

In [15]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.7979166666666667
              precision    recall  f1-score   support

           0       0.85      0.85      0.85        85
           1       0.84      0.88      0.86        86
           2       0.84      0.79      0.82        78
           3       0.72      0.79      0.76        86
           4       0.84      0.84      0.84        79
           5       0.80      0.73      0.76        73
           6       0.72      0.65      0.68        81
           7       0.88      0.86      0.87        78
           8       0.79      0.88      0.83        72
           9       0.77      0.90      0.83        90
          10       0.51      0.38      0.44        71
          11       0.91      0.94      0.92        83
          12       0.69      0.65      0.67        86
          13       0.74      0.68      0.70        74
          14       0.85      0.91      0.88        80
          15       0.81      0.89      0.85        98
          16       0.84      0.82      0.83        

##### Accuracy score is 79%, which is less than the required 85%. So going to use deep learning model BERT.

#### Install BERT

In [16]:
import torch
from transformers import AutoTokenizer

In [17]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from scipy.sparse import issparse

# Ensure input data is raw text
def ensure_text_format(data):
    """Ensure all elements in the dataset are properly formatted as strings."""
    if issparse(data):
        data = data.toarray()  # Convert sparse matrix to dense array
    if isinstance(data, np.ndarray):
        data = data.tolist()  # Convert NumPy array to list
    return [str(item) for item in data]  # Ensure everything is a string


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure all text is correctly formatted
X_train, X_test = ensure_text_format(X_train), ensure_text_format(X_test)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):  
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],  # Ensure this is a string
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Create dataset objects
train_dataset = CustomDataset(X_train, y_train, tokenizer)
test_dataset = CustomDataset(X_test, y_test, tokenizer)

# Load Pretrained BERT Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(y_train)))

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True
)

# Define Evaluation Metrics
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the Model
trainer.train()

# Evaluate the Model
results = trainer.evaluate()
print("Evaluation Results:", results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
