In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import swifter
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('blogs_categories.csv', index_col=0)
df

Unnamed: 0,Data,Labels
0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...,alt.atheism
1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
2,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism
...,...,...
19992,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...,talk.religion.misc
19993,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...,talk.religion.misc
19994,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
19995,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [3]:
df.describe()

Unnamed: 0,Data,Labels
count,19997,19997
unique,19466,20
top,Xref: cantaloupe.srv.cs.cmu.edu talk.politics....,alt.atheism
freq,4,1000


In [4]:
df['Labels'].value_counts()

Labels
alt.atheism                 1000
comp.graphics               1000
talk.politics.misc          1000
talk.politics.mideast       1000
talk.politics.guns          1000
sci.space                   1000
sci.med                     1000
sci.electronics             1000
sci.crypt                   1000
rec.sport.hockey            1000
rec.sport.baseball          1000
rec.motorcycles             1000
rec.autos                   1000
misc.forsale                1000
comp.windows.x              1000
comp.sys.mac.hardware       1000
comp.sys.ibm.pc.hardware    1000
comp.os.ms-windows.misc     1000
talk.religion.misc          1000
soc.religion.christian       997
Name: count, dtype: int64

In [5]:
# defining language for stop words 
stop_words = set(stopwords.words('english'))

In [6]:
# Exploratory Text Data Analysis
# Defining a function to preprocess data 
def clean_text(text):
    text = text.lower() # to convert all words into lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)  #  Words tokanization
    tokens = [word for word in tokens if word not in stop_words] 
    return ' '.join(tokens)

In [7]:
# Creating a column into dataframe containing preprocessed (cleaned) data
df['Cleaned_Data'] = df['Data'].swifter.apply(clean_text)

Pandas Apply:   0%|          | 0/19997 [00:00<?, ?it/s]

In [8]:
df.head()

Unnamed: 0,Data,Labels,Cleaned_Data
0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...,alt.atheism,xref cantaloupesrvcscmuedu altatheism49960 alt...
1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism,xref cantaloupesrvcscmuedu altatheism51060 alt...
2,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism,newsgroups altatheism path cantaloupesrvcscmue...
3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism,xref cantaloupesrvcscmuedu altatheism51120 alt...
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...,alt.atheism,xref cantaloupesrvcscmuedu altatheism51121 soc...


In [9]:
# checking for null values
df.isna().sum()

Data            0
Labels          0
Cleaned_Data    0
dtype: int64

In [10]:
# Using Tf-IDF Vectorization to convert text data into numerical.
vectorizer = TfidfVectorizer(max_features=1000)
# Defining X and Y (features and Labels)
x = vectorizer.fit_transform(df['Cleaned_Data'])
y = df['Labels']

In [11]:
# Splitting data into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
# Training a Naive Bayes Classifier algorithm
nb = MultinomialNB()
nb.fit(xtrain, ytrain) # fitting training data 

In [13]:
# Getting predictions
ypred = nb.predict(xtest)
ypred

array(['rec.sport.hockey', 'comp.sys.mac.hardware',
       'comp.sys.ibm.pc.hardware', ..., 'rec.motorcycles',
       'talk.politics.guns', 'comp.windows.x'], dtype='<U24')

In [14]:
# Creatig a function to check sentiment of text
def get_sentiment(text):
    analysis = TextBlob(text)
    # TextBlob is uesd to determine the sentiment polarity of the text and categorizes it as positive, neutral, or negative.
    
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

In [15]:
# Applying sentiment analysis function on data to see sentiment 
df['Sentiment'] = df['Data'].swifter.apply(get_sentiment)

Pandas Apply:   0%|          | 0/19997 [00:00<?, ?it/s]

In [16]:
# Analyzing distribution of sentiment
sentiment_distribution = df.groupby('Labels')['Sentiment'].value_counts(normalize=True).unstack()
sentiment_distribution

Sentiment,negative,neutral,positive
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alt.atheism,0.199,,0.801
comp.graphics,0.25,0.001,0.749
comp.os.ms-windows.misc,0.236,,0.764
comp.sys.ibm.pc.hardware,0.238,0.001,0.761
comp.sys.mac.hardware,0.242,,0.758
comp.windows.x,0.29,0.002,0.708
misc.forsale,0.229,,0.771
rec.autos,0.201,,0.799
rec.motorcycles,0.262,,0.738
rec.sport.baseball,0.249,,0.751


In [17]:
# Replace NaN values with 0
sentiment_distribution = sentiment_distribution.fillna(0)
sentiment_distribution

Sentiment,negative,neutral,positive
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alt.atheism,0.199,0.0,0.801
comp.graphics,0.25,0.001,0.749
comp.os.ms-windows.misc,0.236,0.0,0.764
comp.sys.ibm.pc.hardware,0.238,0.001,0.761
comp.sys.mac.hardware,0.242,0.0,0.758
comp.windows.x,0.29,0.002,0.708
misc.forsale,0.229,0.0,0.771
rec.autos,0.201,0.0,0.799
rec.motorcycles,0.262,0.0,0.738
rec.sport.baseball,0.249,0.0,0.751


In [18]:
# Model evaluation
# Checking accuracy, precision, recall and F1 sciore
accuracy = accuracy_score(ytest, ypred)
precision = precision_score(ytest, ypred, average='weighted')
recall = recall_score(ytest, ypred, average='weighted')
f1 = f1_score(ytest, ypred, average='weighted')

In [19]:
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')

Accuracy: 0.8365
Precision: 0.8352723181582156
Recall: 0.8365
F1-Score: 0.8343175692330548


## Conclusion
The Naive Bayes classifier achieved an accuracy of 84%

Result of Sentiment Analysis-

-alt.atheism = 80.1% positive.

-comp.graphics = 25% negative, 74.9% positive.

-comp.os.ms-windows.misc = 76.4% positive.

-soc.religion.christian = Highest positive sentiment at 82.85%.

-talk.politics.guns = Higher negative sentiment at 28.3%, 71.7% positive.