In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import string
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load the 20 Newsgroups dataset
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Create a DataFrame
df = pd.DataFrame({'text': newsgroups_data.data, 'label': newsgroups_data.target})
df['label'] = df['label'].apply(lambda x: newsgroups_data.target_names[x])

In [3]:
df.head()

Unnamed: 0,text,label
0,\n\nI am sure some bashers of Pens fans are pr...,rec.sport.hockey
1,My brother is in the market for a high-perform...,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,comp.sys.mac.hardware


In [4]:
df.shape

(18846, 2)

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [7]:
# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert all characters to lowercase and reme punctuation
    text = ''.join([char.lower() for char in text if char not in string.punctuation and not char.isdigit()])

    # Tokenize the text into words
    tokens = word_tokenize(text)

    # Lemmatize each word to its base form
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Remove stopwords (common words that typically don't contribute much to the meaning)
    tokens = [word for word in tokens if word not in stop_words]

    # Perform part-of-speech tagging and keep only the base form of each word
    pos_tags = pos_tag(tokens)
    text = ' '.join([pos[0] for pos in pos_tags])

    return text


df['processed_text'] = df['text'].apply(preprocess_text)

In [8]:
df.head()

Unnamed: 0,text,label,processed_text
0,\n\nI am sure some bashers of Pens fans are pr...,rec.sport.hockey,sure bashers pen fan pretty confused lack kind...
1,My brother is in the market for a high-perform...,comp.sys.ibm.pc.hardware,brother market highperformance video card supp...
2,\n\n\n\n\tFinally you said what you dream abou...,talk.politics.mideast,finally said dream mediterranean wa new area g...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,comp.sys.ibm.pc.hardware,think scsi card dma transfer disk scsi card dm...
4,1) I have an old Jasmine drive which I cann...,comp.sys.mac.hardware,old jasmine drive use new system understanding...


In [9]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['label'], test_size=0.2)

In [10]:
# converting text documents into a numerical feature matrix
#TF-IDF matrix, each row corresponds to a document, and each column corresponds to a unique word
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [11]:
# Train the Model Naive Bayes (NB) classifier.used for text classification tasks
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# Evaluate the Model
predictions = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')


Accuracy: 0.7047745358090186
