#### Text Classification Using Naive Bayes and Sentiment Analysis on Blog Posts

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:

# Fetch the NLTK resources used further down: the stopword corpus (for stopword
# removal) and the "punkt" tokenizer models (assumed to be what word_tokenize
# relies on — confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Data Exploration and Preprocessing

In [24]:
import pandas as pd

In [25]:
# Load the dataset from blogs.csv (path is relative to the working directory).
# The frame has a text column "Data" and a target column "Labels".
data = pd.read_csv('blogs.csv')
data.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [26]:
# (rows, columns) of the loaded frame
data.shape

(2000, 2)

In [27]:
# Column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [28]:
# Number of missing values per column
data.isnull().sum()

Data      0
Labels    0
dtype: int64

In [29]:
# Class balance: number of rows per label
data['Labels'].value_counts()

alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: Labels, dtype: int64

In [30]:
# Define a function to clean and preprocess the text
import string

def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize and remove English stopwords.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    text = text.lower()  # Convert to lowercase
    # Delete (not space-replace) every character in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)  # Tokenize the text
    # Resolve the corpus through the `nltk` package so this does not depend on
    # the module-level name `stopwords` still referring to the corpus reader.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into string
    return ' '.join(tokens)

In [31]:
# Removing Punctuation
def remove_punctuation(text):
    """Return ``text`` with every punctuation character swapped for a space.

    The characters treated as punctuation are those in ``string.punctuation``.
    Each one is replaced by a single space (not deleted), so the length of the
    string is unchanged.
    """
    # One translation table mapping each punctuation character to a space.
    to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return text.translate(to_space)

# Replace punctuation with spaces in every document.
# Note: this overwrites data["Data"] in place, so the raw text is no longer
# available in `data` after this cell runs.
data["Data"]=data["Data"].apply(remove_punctuation)    # data after removing punctuation
data.head()

Unnamed: 0,Data,Labels
0,Path cantaloupe srv cs cmu edu magnesium club...,alt.atheism
1,Newsgroups alt atheism\nPath cantaloupe srv ...,alt.atheism
2,Path cantaloupe srv cs cmu edu das news harva...,alt.atheism
3,Path cantaloupe srv cs cmu edu magnesium club...,alt.atheism
4,Xref cantaloupe srv cs cmu edu alt atheism 53...,alt.atheism


In [32]:
# tokenization
def preprocess_text(text):            # function
    """Tokenize ``text`` with NLTK, print the tokens and their count, and return them.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize``.
    """
    # Tokenize the argument itself, not the whole data["Data"] column:
    # word_tokenize expects one string, and the parameter was otherwise unused.
    tokens = word_tokenize(text)
    print(tokens)
    print(len(tokens))
    return tokens

In [33]:
# Convert every document to lowercase.
# Note: this overwrites data["Data"] in place; its input is the
# punctuation-stripped text produced by the earlier cell.
data["Data"] = data["Data"].apply(lambda x: x.lower())
data.head()

Unnamed: 0,Data,Labels
0,path cantaloupe srv cs cmu edu magnesium club...,alt.atheism
1,newsgroups alt atheism\npath cantaloupe srv ...,alt.atheism
2,path cantaloupe srv cs cmu edu das news harva...,alt.atheism
3,path cantaloupe srv cs cmu edu magnesium club...,alt.atheism
4,xref cantaloupe srv cs cmu edu alt atheism 53...,alt.atheism


In [34]:
# Removing the stopwords
# Bind the word list to its own name: assigning to `stopwords` would replace the
# `nltk.corpus.stopwords` reader imported at the top of the notebook with a plain
# list and break any later `stopwords.words(...)` call.
# A set gives constant-time membership tests for the per-word filter below.
stop_words = set(nltk.corpus.stopwords.words("english"))
data["Data"] = data["Data"].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

data.head()

Unnamed: 0,Data,Labels
0,path cantaloupe srv cs cmu edu magnesium club ...,alt.atheism
1,newsgroups alt atheism path cantaloupe srv cs ...,alt.atheism
2,path cantaloupe srv cs cmu edu das news harvar...,alt.atheism
3,path cantaloupe srv cs cmu edu magnesium club ...,alt.atheism
4,xref cantaloupe srv cs cmu edu alt atheism 534...,alt.atheism


## Naive Bayes Model for Text Classification

In [35]:
from sklearn.model_selection import train_test_split

# Feature extraction: TF-IDF over the cleaned text; labels are the newsgroup names.
# NOTE(review): the vectorizer is fitted on every row of data['Data'] *before* the
# split below, so its vocabulary and IDF weights are computed from rows that end up
# in X_test. Consider splitting the text first and fitting on the training rows only
# (e.g. via a Pipeline) if a leakage-free estimate is needed; doing so would change
# the feature count and the scores reported later in the notebook.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['Data'])
y = data['Labels']

# Split data into training and test sets (80/20, fixed seed for repeatability).
# NOTE(review): the split is not stratified, so per-class test support varies.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayes classifier with default settings, fitted on the training split only
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [36]:
X_train.shape, X_test.shape

((1600, 50578), (400, 50578))

In [37]:
# Predictions on the training split (the same rows the classifier was fitted on);
# used by the train-set report and confusion matrix later in the notebook.
y_predtrain= nb_classifier.predict(X_train)       
print(y_predtrain)

['rec.sport.baseball' 'comp.os.ms-windows.misc' 'rec.motorcycles' ...
 'rec.motorcycles' 'sci.space' 'sci.crypt']


In [38]:
# Predictions on the held-out test split; these feed the evaluation metrics
# computed later in the notebook.
y_predtest = nb_classifier.predict(X_test)
print(y_predtest)

['talk.politics.misc' 'comp.sys.ibm.pc.hardware' 'sci.med'
 'rec.sport.baseball' 'comp.sys.ibm.pc.hardware' 'sci.electronics'
 'rec.sport.baseball' 'talk.politics.mideast' 'alt.atheism' 'sci.med'
 'alt.atheism' 'sci.med' 'sci.crypt' 'rec.sport.baseball'
 'comp.sys.ibm.pc.hardware' 'comp.os.ms-windows.misc' 'rec.autos'
 'comp.graphics' 'talk.politics.guns' 'talk.politics.misc' 'misc.forsale'
 'talk.politics.misc' 'talk.religion.misc' 'rec.sport.hockey'
 'alt.atheism' 'sci.crypt' 'sci.crypt' 'rec.sport.baseball' 'rec.autos'
 'alt.atheism' 'misc.forsale' 'rec.sport.hockey' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.hockey' 'comp.sys.mac.hardware' 'sci.med'
 'rec.sport.hockey' 'rec.sport.hockey' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'talk.religion.misc' 'comp.os.ms-windows.misc'
 'comp.graphics' 'soc.religion.christian' 'rec.motorcycles' 'sci.crypt'
 'rec.motorcycles' 'alt.atheism' 'talk.politics.guns'
 'soc.religion.christian' 'sci.crypt' 'sci.space'
 'comp.os.ms-wind

# Sentiment Analysis

In [39]:
#!pip install textblob

In [40]:
from textblob import TextBlob

# function for sentiment analysis
def get_sentiment(text):
    """Label ``text`` as 'Positive', 'Neutral' or 'Negative'.

    The label is decided by the sign of the TextBlob polarity score:
    above zero is 'Positive', exactly zero is 'Neutral', anything else
    is 'Negative'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity == 0:
        return 'Neutral'
    return 'Negative'

# Add a 'Sentiment' column with one label per document.
# NOTE(review): data['Data'] at this point is the lowercased text with punctuation
# and stopwords already removed by the earlier cells. If the stopword list contains
# negations (e.g. "not", "no" — confirm), polarity is computed without them;
# consider scoring the raw text instead.
data['Sentiment'] = data['Data'].apply(get_sentiment)   # added column in data 
data.head()

Unnamed: 0,Data,Labels,Sentiment
0,path cantaloupe srv cs cmu edu magnesium club ...,alt.atheism,Positive
1,newsgroups alt atheism path cantaloupe srv cs ...,alt.atheism,Negative
2,path cantaloupe srv cs cmu edu das news harvar...,alt.atheism,Positive
3,path cantaloupe srv cs cmu edu magnesium club ...,alt.atheism,Positive
4,xref cantaloupe srv cs cmu edu alt atheism 534...,alt.atheism,Positive


In [41]:
data['Sentiment'].value_counts()

Positive    1497
Negative     500
Neutral        3
Name: Sentiment, dtype: int64

# Evaluation

In [43]:
# Evaluate the Naive Bayes classifier on the held-out test split.
# Precision, recall and F1 are macro-averaged across the classes.
accuracy = accuracy_score(y_test, y_predtest)                  # accuracy
precision = precision_score(y_test, y_predtest, average='macro')
recall = recall_score(y_test, y_predtest, average='macro')
f1 = f1_score(y_test, y_predtest, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.7875
Precision: 0.8015665398581167
Recall: 0.7982219039130805
F1 Score: 0.7789734897625344


In [48]:
from sklearn.metrics import classification_report,confusion_matrix

# Per-class precision / recall / F1 / support on the test split
# (arguments: true labels first, predictions second)
print(classification_report(y_test, y_predtest)) 

                          precision    recall  f1-score   support

             alt.atheism       0.60      0.83      0.70        18
           comp.graphics       0.72      0.72      0.72        18
 comp.os.ms-windows.misc       0.75      0.95      0.84        22
comp.sys.ibm.pc.hardware       0.75      0.84      0.79        25
   comp.sys.mac.hardware       0.88      0.67      0.76        21
          comp.windows.x       1.00      0.28      0.44        25
            misc.forsale       0.74      0.78      0.76        18
               rec.autos       0.77      0.94      0.85        18
         rec.motorcycles       0.81      0.81      0.81        16
      rec.sport.baseball       0.83      0.83      0.83        18
        rec.sport.hockey       0.65      1.00      0.79        15
               sci.crypt       0.68      1.00      0.81        19
         sci.electronics       0.75      0.56      0.64        16
                 sci.med       0.88      0.88      0.88        17
         

In [46]:
# Confusion matrix on the test split (true labels first, predictions second)
print(confusion_matrix(y_test, y_predtest))

[[15  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  2]
 [ 0 13  0  1  0  0  2  0  0  1  0  1  0  0  0  0  0  0  0  0]
 [ 0  0 21  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  1 21  0  0  0  0  1  0  0  1  1  0  0  0  0  0  0  0]
 [ 0  1  0  2 14  0  1  0  0  0  1  1  0  0  0  0  0  1  0  0]
 [ 0  2  6  1  0  7  1  0  2  1  1  3  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0 14  3  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 17  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  1 13  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 15  3  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 19  0  0  0  0  0  0  0  0]
 [ 0  1  0  2  1  0  0  1  0  0  1  0  9  1  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  1  0  0  0 15  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  1  0  0  0 19  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 23  0  0

In [52]:
# Per-class report on the training split; compare with the test-split report
# to see how much the scores drop on unseen rows.
print(classification_report(y_train, y_predtrain))

                          precision    recall  f1-score   support

             alt.atheism       0.90      0.99      0.94        82
           comp.graphics       0.99      0.99      0.99        82
 comp.os.ms-windows.misc       0.99      1.00      0.99        78
comp.sys.ibm.pc.hardware       0.99      1.00      0.99        75
   comp.sys.mac.hardware       1.00      1.00      1.00        79
          comp.windows.x       1.00      0.99      0.99        75
            misc.forsale       1.00      1.00      1.00        82
               rec.autos       1.00      1.00      1.00        82
         rec.motorcycles       1.00      1.00      1.00        84
      rec.sport.baseball       1.00      0.99      0.99        82
        rec.sport.hockey       0.99      1.00      0.99        85
               sci.crypt       1.00      1.00      1.00        81
         sci.electronics       1.00      0.99      0.99        84
                 sci.med       1.00      0.99      0.99        83
         

In [53]:
# Confusion matrix on the training split (true labels first, predictions second)
print(confusion_matrix(y_train, y_predtrain))

[[81  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1]
 [ 0 81  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 78  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 75  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 79  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  0  0 74  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 82  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 82  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 84  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 81  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 85  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 81  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 83  0  1  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  0 82  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 79  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 77  0  0

The class comp.windows.x had a high precision (1.00) but a very low recall (0.28), indicating that the model was very precise when it did make predictions for this class, but it missed many instances.
The classes rec.sport.hockey and sci.crypt had perfect recall (1.00) but lower precision, meaning every instance of these classes was correctly identified, but some instances of other classes were also assigned to them (false positives).
Overall accuracy is 0.79, indicating that the model correctly classified 79% of the instances in the test set.

The Naive Bayes classifier performed moderately well on the 20 Newsgroups dataset, with an overall accuracy of 0.79 on the test set. While some categories achieved high precision and recall, others struggled, reflecting the inherent challenges in text classification tasks.