In [1]:
import re
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

import pickle

In [2]:
# reading dataset
dataset = pd.read_csv('./datasets/nyt_article_data.csv', sep='\t')

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     360 non-null    object
 1   Snippet   360 non-null    object
 2   URL       360 non-null    object
 3   Category  360 non-null    object
dtypes: object(4)
memory usage: 11.4+ KB


In [4]:
# shuffle the dataset for each category to have atleast 200 rows
dataset = dataset.groupby('Category').apply(lambda x: x.sample(n=120, random_state=42)).sample(frac=1, random_state=42).reset_index(drop=True)

  dataset = dataset.groupby('Category').apply(lambda x: x.sample(n=120, random_state=42)).sample(frac=1, random_state=42).reset_index(drop=True)


In [5]:
# check for row count based on category
dataset.groupby('Category').nunique()

Unnamed: 0_level_0,Title,Snippet,URL
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Business,120,120,120
Health,120,120,120
Politics,120,120,120


In [6]:
# replacing the non-aplhabetical characters with whitespace
dataset['text'] = dataset['Title'].str.replace('[^a-zA-Z]',' ')

In [7]:
# converting the word into smaller words
dataset['text'] = [word.lower() for word in dataset['text']]

In [8]:
# applying tokenization
dataset['text'] = dataset['text'].apply(nltk.tokenize.WhitespaceTokenizer().tokenize)

In [9]:
# removing stopwords
dataset['text'] = dataset['text'].apply(lambda words: [word for word in words if not word in stopwords.words('english')])

In [10]:
stemmer = PorterStemmer()

def stem_text(text):
    # Stem the words from the text
    return (stemmer.stem(word) for word in text)

# Applying stemming
dataset['text'] = dataset['text'].apply(stem_text)

In [11]:
dataset['stemmed_text'] = 0
for i in range(0, len(dataset)):
    dataset['stemmed_text'][i] = ' '.join(dataset['text'][i])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataset['stemmed_text'][i] = ' '.join(dataset['text'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datas

In [12]:
train_data = dataset['stemmed_text'].values
target = dataset['Category'].values

In [13]:
# converting data into numerical values
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data)
Y_train = target

In [14]:
# training data into MultinomialNB
model = MultinomialNB()
model.fit(X_train, Y_train)

In [15]:
# prediction:
input_text = input('Enter text to classify: ')

# function to tokenize and lemmatize the user input
def text_stemmer(text):
    text = re.sub('[^a-zA-Z]', ' ', text)  # Replace non-alphabetic characters with spaces
    lower_text = text.lower()
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    tokenized = tokenizer.tokenize(lower_text)
    lemmatized_output = ' '.join([stemmer.stem(w) for w in tokenized])
    return lemmatized_output

# process user imput value for tokenization
processed_text = text_stemmer(input_text)

# extracting the features from the lemmatized text
vectorized_text = vectorizer.transform([processed_text])

# predict the classification
predicted_classification = model.predict(vectorized_text)

# function to classify the text
def text_classification(predicted):
    if predicted_classification == 'Business':
        print('The given text represents to category: Business.')
    elif predicted_classification == 'Health':
        print('The given text represents to category: Health.')
    elif predicted_classification == 'Politics':
        print('The given text represents to category: Politics.')

# get the classification
classified_text = text_classification(predicted_classification)

The given text represents to category: Business.


In [16]:
pickle.dump(model, open('classifier.sav', 'wb'))
pickle.dump(vectorizer, open('vectorizer.sav', 'wb'))