In [18]:
!pip install nltk



In [19]:
import pandas as pd
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import re

In [20]:
df = pd.read_csv("output_chunk_1.csv")

print(df.head(5))

  category                                               text
0     arts  ktxl community leaders and activists gathered ...
1     arts  hate crimes against asian americans and pacifi...
2     arts  people attend a vigil in solidarity with the a...
3     arts  explainer why georgia attack spurs fears in as...
4     arts  international pop star rihanna today expressed...


In [21]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7200 entries, 0 to 7199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  7200 non-null   object
 1   text      7200 non-null   object
dtypes: object(2)
memory usage: 112.6+ KB
None


In [22]:
print(df['category'].value_counts())

category
arts             400
crime            400
unrest           400
sport            400
social           400
science          400
religion         400
politics         400
other            400
lifestyle        400
labour           400
humanInterest    400
health           400
environmental    400
education        400
economy          400
disaster         400
weather          400
Name: count, dtype: int64


In [23]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W',' ', text)
    text = re.sub(r'\s+',' ',text).strip()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(tokens)

In [25]:
df['cleaned_text'] = df["text"].apply(preprocess_text)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])

In [28]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['category'])

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [33]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [34]:
y_pred = clf.predict(X_test)

In [35]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.7979166666666667
              precision    recall  f1-score   support

           0       0.85      0.85      0.85        85
           1       0.84      0.88      0.86        86
           2       0.84      0.79      0.82        78
           3       0.72      0.79      0.76        86
           4       0.84      0.84      0.84        79
           5       0.80      0.73      0.76        73
           6       0.72      0.65      0.68        81
           7       0.88      0.86      0.87        78
           8       0.79      0.88      0.83        72
           9       0.77      0.90      0.83        90
          10       0.51      0.38      0.44        71
          11       0.91      0.94      0.92        83
          12       0.69      0.65      0.67        86
          13       0.74      0.68      0.70        74
          14       0.85      0.91      0.88        80
          15       0.81      0.89      0.85        98
          16       0.84      0.82      0.83        

##### Accuracy score is 79%, which is less than the required 85%. So going to use deep learning model BERT.

#### Install BERT

In [37]:
!pip install transformers datasets torch scikit-learn 

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp39-cp39-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.11-cp39-cp39-win_amd64.whl.metadata (8.0 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.4-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting async-timeout<6.0,>=4.0 (from aiohttp->datasets)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets)
  Downloading frozenl

In [38]:
import torch
from transformers import AutoTokenizer

In [39]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["text"].tolist(), df["category"].tolist(), test_size = 0.2, random_state = 42)

In [40]:
train_labels = le.fit_transform(train_labels)
test_labels = le.fit_transform(test_labels)

In [42]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [43]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)