In [5]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string # special operations on strings
import spacy # language models
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
import warnings

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Download the language model from spacy.
# This is because, if we want to apply lemmatization, 1st we need to know the language of words that we are working with.
# As, if we know the language, we can trim the word accordingly
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [23]:
# Importing functions from the above library
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # To build naive bayes model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from wordcloud import WordCloud
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer

In [8]:
data=pd.read_csv("blogs_categories.csv")
data

Unnamed: 0.1,Unnamed: 0,Data,Labels
0,0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...,alt.atheism
1,1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
2,2,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
3,3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
4,4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
...,...,...,...
19992,19992,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...,talk.religion.misc
19993,19993,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...,talk.religion.misc
19994,19994,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
19995,19995,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [9]:
# To understand the data type and null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19997 entries, 0 to 19996
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  19997 non-null  int64 
 1   Data        19997 non-null  object
 2   Labels      19997 non-null  object
dtypes: int64(1), object(2)
memory usage: 468.8+ KB


In [12]:
data['Labels'].value_counts()

Unnamed: 0_level_0,count
Labels,Unnamed: 1_level_1
alt.atheism,1000
comp.graphics,1000
talk.politics.misc,1000
talk.politics.mideast,1000
talk.politics.guns,1000
sci.space,1000
sci.med,1000
sci.electronics,1000
sci.crypt,1000
rec.sport.hockey,1000


In [14]:
# To find out number of null values
data.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
Data,0
Labels,0


In [15]:
# Dropping the unnamed column
data.drop(['Unnamed: 0'],axis=1,inplace=True)

In [16]:
data

Unnamed: 0,Data,Labels
0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...,alt.atheism
1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
2,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
...,...,...
19992,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...,talk.religion.misc
19993,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...,talk.religion.misc
19994,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
19995,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [17]:
# Function to clean text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize the text (breaks it into individual words)
    stop_words = set(stopwords.words('english'))  # Load stopwords (common words like "the", "and", etc., which don’t add much meaning to the analysis)
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(filtered_tokens)

# Apply preprocessing
data['cleaned_text'] = data['Data'].apply(preprocess_text)
data['cleaned_text']

Unnamed: 0,cleaned_text
0,xref cantaloupesrvcscmuedu altatheism49960 alt...
1,xref cantaloupesrvcscmuedu altatheism51060 alt...
2,newsgroups altatheism path cantaloupesrvcscmue...
3,xref cantaloupesrvcscmuedu altatheism51120 alt...
4,xref cantaloupesrvcscmuedu altatheism51121 soc...
...,...
19992,xref cantaloupesrvcscmuedu altatheism54482 tal...
19993,xref cantaloupesrvcscmuedu altatheism54485 tal...
19994,xref cantaloupesrvcscmuedu talkreligionmisc845...
19995,xref cantaloupesrvcscmuedu talkreligionmisc845...


In [19]:
# Performing feature extraction using TF-IDF vectorizer and also splitting the data into target variable and independant variable
tfidf = TfidfVectorizer()  # Initialize the TF-IDF vectorizer
X = tfidf.fit_transform(data['cleaned_text'])  # Features (tokens)
y = data['Labels']  # Labels (categories)

# **02. Building the model**

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)  # Splitting the data into training and testing sets

In [21]:
# Building the model
nb_model = MultinomialNB()  # Initialize the Naive Bayes model
nb_model.fit(X_train, y_train)  # Train the model on the training data

In [22]:
y_pred = nb_model.predict(X_test)  # Make predictions on the test data

# **03. Sentiment Analysis**

In [24]:
# Initializing the Vader sentiment analyser
sia = SentimentIntensityAnalyzer()

# Function to get sentiment polarity
def get_sentiment(text):
    sentiment = sia.polarity_scores(text)
    if sentiment['compound'] > 0.05:
        return 'Positive'
    elif sentiment['compound'] < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Applying sentiment analysis to the 'cleaned_text' column
data['sentiment'] = data['cleaned_text'].apply(get_sentiment)

In [25]:
# Print Sentiment distribution
print(data['sentiment'].value_counts())

sentiment
Positive    13647
Negative     5596
Neutral       754
Name: count, dtype: int64


In [27]:
# Creating a dataframe of the sentiment distribution and printing it
sentiment_distribution = pd.crosstab(data['Labels'], data['sentiment'])
print(sentiment_distribution)

sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                    345       19       636
comp.graphics                   87       58       855
comp.os.ms-windows.misc        152       52       796
comp.sys.ibm.pc.hardware       170       43       787
comp.sys.mac.hardware          194       55       751
comp.windows.x                 199       52       749
misc.forsale                   125       75       800
rec.autos                      274       48       678
rec.motorcycles                285       35       680
rec.sport.baseball             201       55       744
rec.sport.hockey               243       31       726
sci.crypt                      258       29       713
sci.electronics                140       45       815
sci.med                        302       41       657
sci.space                      240       32       728
soc.religion.christian         224        7       766
talk.politics.guns          

# **04. Evaluation of the model**

In [28]:
print("Classification Report:\n", classification_report(y_test, y_pred))  # Print classification report

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.72      0.82      0.76       274
           comp.graphics       0.87      0.90      0.88       290
 comp.os.ms-windows.misc       0.93      0.87      0.90       320
comp.sys.ibm.pc.hardware       0.81      0.87      0.84       304
   comp.sys.mac.hardware       0.89      0.94      0.91       296
          comp.windows.x       0.97      0.92      0.95       293
            misc.forsale       0.92      0.77      0.84       297
               rec.autos       0.92      0.95      0.94       306
         rec.motorcycles       0.99      0.97      0.98       317
      rec.sport.baseball       1.00      0.99      0.99       290
        rec.sport.hockey       0.98      0.99      0.98       301
               sci.crypt       0.91      0.98      0.94       295
         sci.electronics       0.92      0.87      0.89       318
                 sci.med       1.00      0.92      

In [31]:
# Calculate accurarcy, precision, recall & f1 score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8965
Precision: 0.8972217134858068
Recall: 0.8965
F1 Score: 0.8952977531488523


In conclusion, the Naive Bayes classifier, with proper preprocessing and feature extraction, can effectively categorize blog posts. Performance evaluation using metrics such as accuracy, precision, recall, and F1-score provides a comprehensive view of the model’s effectiveness. Challenges such as class imbalance and text preprocessing need careful handling to ensure robust performance.

Sentiment analysis adds another layer of understanding, revealing the emotional tone of the blog posts. These insights can be crucial for content strategy, reader engagement, and marketing efforts.
We can see that on an average the positive sentiment on the blog was far greater than that of negative and neutral.

By combining text classification with sentiment analysis, you gain a powerful toolkit for extracting and leveraging insights from textual data, enhancing both the analytical and strategic capabilities of your organization.