In [None]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Fetch the NLTK resources this notebook relies on further down:
# tokenizer data for word_tokenize, the English stop-word list, and the
# WordNet corpus backing WordNetLemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# NOTE(review): presumably needed by word_tokenize on newer NLTK releases,
# which look up 'punkt_tab' rather than 'punkt' — confirm against the
# installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Module-level helpers shared by the preprocessing function defined later.
# The stop words are held in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# DATASET

In [None]:
# Load the 20 Newsgroups corpus (subset='all'), shuffled with a fixed seed
# so the document order is reproducible.
newsgroups = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)
print(f"Categories:{newsgroups.target_names}")

Categories:['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [None]:
# One row per document: the raw post text and its integer category label.
df = pd.DataFrame({
    'text': newsgroups.data,
    'target': newsgroups.target,
})
df.head(100)

Unnamed: 0,text,target
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,10
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,3
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...,17
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,3
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,4
...,...,...
95,From: jcmorris@mbunix.mitre.org (Morris)\nSubj...,3
96,From: shiva@leland.Stanford.EDU (Matt Jacobson...,2
97,From: tmenner@sei.cmu.edu (Thomas Menner)\nSub...,10
98,From: scatt@apg.andersen.com (Scott Cattanach)...,16


In [None]:
def clean_text(text):
    """Normalize a raw document into lowercase, punctuation-free text.

    Steps, in order:
      1. lowercase the text;
      2. drop URL-like tokens starting with ``http``;
      3. drop ``@name`` and ``#tag`` fragments;
      4. drop every character that is neither a word character nor whitespace;
      5. squeeze runs of three or more identical non-digit characters to one
         (e.g. ``"sooo"`` -> ``"so"``);
      6. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Digits are excluded from the squeeze on purpose: collapsing any repeated
    # character would silently rewrite numbers ("1000" -> "10"). Whitespace is
    # excluded too because the next substitution already normalizes it.
    text = re.sub(r'([^\d\s])\1{2,}', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# Overwrite each document with its normalized form produced by clean_text.
df['text'] = df['text'].map(clean_text)

In [None]:
# Print the frame's schema and non-null counts, then preview the text column
# after clean_text has been applied.
df.info()
df.head(100)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18846 entries, 0 to 18845
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    18846 non-null  object
 1   target  18846 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 294.6+ KB


Unnamed: 0,text,target
0,from mamatha devineni ratnam mr47cmuedu subjec...,10
1,from mblawsonecnuoknoredu matthew b lawson sub...,3
2,from hilmiersuse hilmi eren subject re armenia...,17
3,from guydibmcom guy dawson subject re ide vs s...,3
4,from alexander samuel mcdiarmid am2ocmuedu sub...,4
...,...,...
95,from jcmorrismitreorg morris subject re soundb...,3
96,from shivastanfordedu matt jacobson subject wi...,2
97,from tmennercmuedu thomas menner subject hocke...,10
98,from scattandersencom scott cattanach subject ...,16


In [None]:
def preprocess_text(text):
    """Tokenize, filter, and lemmatize a document.

    The text is split with NLTK's ``word_tokenize``. Tokens made up solely of
    punctuation characters and tokens that are English stop words are dropped;
    the remaining tokens are lemmatized with the module-level ``lemmatizer``
    and re-joined with single spaces.

    Args:
        text: The document as a ``str``.

    Returns:
        A space-separated string of lemmatized tokens.
    """
    punctuation_chars = set(string.punctuation)
    tokens = word_tokenize(text)
    lemmatized_tokens = [
        lemmatizer.lemmatize(word) for word in tokens
        # A substring test against string.punctuation lets multi-character
        # tokens such as "..." or "--" through, so check character by character.
        if not all(ch in punctuation_chars for ch in word)
        # The NLTK stop-word list is lowercase; compare case-insensitively so
        # capitalized forms ("The", "And") are filtered as well.
        and word.lower() not in stop_words
    ]
    return ' '.join(lemmatized_tokens)

In [None]:
# Feed the cleaned documents in df['text'] (the output of clean_text) into
# tokenization/lemmatization. Iterating newsgroups.data here would bypass the
# cleaning step entirely and train the model on the raw posts.
preprocessed_data = [preprocess_text(doc) for doc in df['text']]

In [None]:
# Preview df again. The preprocessing cell above stores its result in
# preprocessed_data and does not write back to df, so this view matches the
# earlier preview of the cleaned text.
df.head(100)

Unnamed: 0,text,target
0,from mamatha devineni ratnam mr47cmuedu subjec...,10
1,from mblawsonecnuoknoredu matthew b lawson sub...,3
2,from hilmiersuse hilmi eren subject re armenia...,17
3,from guydibmcom guy dawson subject re ide vs s...,3
4,from alexander samuel mcdiarmid am2ocmuedu sub...,4
...,...,...
95,from jcmorrismitreorg morris subject re soundb...,3
96,from shivastanfordedu matt jacobson subject wi...,2
97,from tmennercmuedu thomas menner subject hocke...,10
98,from scattandersencom scott cattanach subject ...,16


# TRAIN, TEST, VALIDATE DATASET

In [None]:
# Hold out 20% of the documents for final evaluation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_data,
    newsgroups.target,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build TF-IDF features. max_df=0.7 discards terms that occur in more than
# 70% of the documents.
tfidf=TfidfVectorizer(max_df=0.7)
# Learn the vocabulary and IDF weights from the training split only...
X_train_tfidf=tfidf.fit_transform(X_train)
# ...and reuse them unchanged on the test split so no test information
# reaches the vectorizer.
X_test_tfidf=tfidf.transform(X_test)

In [None]:
# Candidate smoothing strengths for MultinomialNB, spanning several orders
# of magnitude.
alpha_values = [1e-3, 1e-2, 1e-1, 1, 5, 10, 100]

In [None]:
# Search space handed to GridSearchCV: only the smoothing parameter varies.
param_grid = dict(alpha=alpha_values)

In [None]:
# 5-fold cross-validated search over alpha, scored by accuracy.
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

In [None]:
# Retrain a fresh classifier on the full training split using the alpha
# selected by the grid search.
best_alpha = grid_search.best_params_['alpha']
final_model = MultinomialNB(alpha=best_alpha).fit(X_train_tfidf, y_train)
final_model

In [None]:
# Predict category labels for the held-out test documents.
y_pred=final_model.predict(X_test_tfidf)

In [None]:
# Overall accuracy, followed by per-category precision/recall/F1 on the
# held-out test split.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, target_names=newsgroups.target_names),
)

Accuracy: 0.9145888594164456
Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.90      0.93      0.92       151
           comp.graphics       0.80      0.87      0.84       202
 comp.os.ms-windows.misc       0.87      0.81      0.84       195
comp.sys.ibm.pc.hardware       0.76      0.83      0.79       183
   comp.sys.mac.hardware       0.89      0.89      0.89       205
          comp.windows.x       0.90      0.88      0.89       215
            misc.forsale       0.91      0.80      0.85       193
               rec.autos       0.91      0.95      0.93       196
         rec.motorcycles       0.96      0.96      0.96       168
      rec.sport.baseball       0.99      0.98      0.98       211
        rec.sport.hockey       0.95      0.97      0.96       198
               sci.crypt       0.97      0.95      0.96       201
         sci.electronics       0.89      0.87      0.88       202
                 sci.m