In [1]:
import numpy as np
import pandas as pd

In [2]:
data=pd.read_csv("blogs.csv")

In [3]:
data.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


# 1. Data Exploration and Preprocessing

In [4]:
#checking for null values
data.isnull().sum()

Data      0
Labels    0
dtype: int64

In [5]:
#there are no null value or missing values

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [7]:
#checking the distribution of categories in target variable
data['Labels'].value_counts()

alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: Labels, dtype: int64

In [8]:
#all are equally disrtibuted there is no imbalance in data

In [9]:
#Converting text to lowercase
data['Data'] = data['Data'].str.lower()

In [10]:
#removing punctuation from the 'Data' column
import string
data['Data'] = data['Data'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [11]:
#removing stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
data['Data'] = data['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in ENGLISH_STOP_WORDS]))

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

#Initializing CountVectorizer for tokenization and transformation
vectorizer = CountVectorizer()

#Tokenizing the text and transforming
X = vectorizer.fit_transform(data['Data'])


In [13]:
#feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

#initializing the TF IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

#Transforming the text into a TF IDF 
X_tfidf = tfidf_vectorizer.fit_transform(data['Data'])


In [14]:
#Checking unique categories
data['Labels'].unique()

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype=object)

In [15]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Labels'])

In [16]:
#now our data is ready

# 2. Naive Bayes Model for Text Classification

In [17]:
from sklearn.model_selection import train_test_split

#Splitting the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [18]:
#we are using multinomial naive bias as it works weel with tf idf
from sklearn.naive_bayes import MultinomialNB

In [19]:
nb_classifier = MultinomialNB()
# Train the model on the training data
nb_classifier.fit(x_train, y_train)

In [20]:
#Predicting on the test set
y_pred = nb_classifier.predict(x_test)

In [21]:
y_pred

array([18,  3, 13,  9, 12, 12,  9, 17,  0, 13,  0, 12, 11,  9,  3,  2,  7,
        1, 16, 18,  4,  0,  0, 10,  0, 11, 11,  9,  7,  0,  9, 10,  5,  9,
       10,  4, 13, 12, 10,  2, 12, 15,  2,  9, 15,  8,  5,  8,  0, 18, 15,
        2, 14,  2,  9, 17, 12, 16, 11,  3, 14,  1, 16, 10, 18, 18,  9, 15,
        1, 14, 14,  3, 13, 10,  8,  3, 16, 18, 12, 18,  0,  8, 14, 15, 18,
        9, 17,  4,  1, 16, 15, 17,  2,  2, 18, 18,  1, 12, 15, 18,  9,  9,
        4, 13,  2,  8,  6,  2,  8,  5,  7, 14, 10, 17,  7, 11, 11,  9,  3,
       12, 18, 10, 11,  4,  3, 15,  4,  8, 11,  3, 19, 18,  7,  5,  8,  9,
        0,  0, 16, 15, 13,  3, 18,  8, 15,  5,  6,  4,  2,  1, 16,  9,  6,
       11,  3,  6,  5, 16,  5, 16,  6,  8,  1, 11, 19, 14, 17, 12,  4, 17,
       11,  6,  0, 18,  4, 13,  7, 13,  8, 15, 12,  7, 18, 11,  8,  7,  0,
       16, 11, 12,  7,  2, 13,  2,  0,  4, 11,  8,  7, 15, 18,  5,  1,  7,
        4,  2, 14,  1, 10,  0, 17,  0,  2,  1,  7, 18, 11,  3,  4, 12,  9,
        1,  5, 11, 18, 19

In [22]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
# Classification Report (includes precision, recall, F1-score for each class)
print(classification_report(y_test, y_pred))
# Confusion Matrix

print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.52      0.89      0.65        18
           1       0.65      0.83      0.73        18
           2       0.95      0.82      0.88        22
           3       0.95      0.76      0.84        25
           4       0.83      0.95      0.89        21
           5       1.00      0.84      0.91        25
           6       0.91      0.56      0.69        18
           7       0.84      0.89      0.86        18
           8       0.88      0.88      0.88        16
           9       0.74      0.94      0.83        18
          10       0.88      1.00      0.94        15
          11       0.90      1.00      0.95        19
          12       0.59      0.81      0.68        16
          13       0.94      0.88      0.91        17
          14       1.00      0.86      0.92        21
          15       0.81      0.96      0.88        23
          16       0.95      0.71      0.82        28
          17       0.95    

In [23]:
#our model is showing the accuracy of 82%

# 3. Sentiment Analysis

In [24]:
from textblob import TextBlob

In [25]:
#TextBlob gives a polarity score between -1 (very negative) and 1 (very positive). 
#we can use this score to classify sentiment.

In [26]:
def get_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    if polarity > 0.1:
        return 'Positive'
    elif polarity < -0.1:
        return 'Negative'
    else:
        return 'Neutral'

In [27]:
#applying them to blog posts
data['Sentiment'] = data['Data'].apply(get_sentiment)
data[['Data', 'Sentiment']].head()

Unnamed: 0,Data,Sentiment
0,path cantaloupesrvcscmuedumagnesiumclubcccmued...,Neutral
1,newsgroups altatheism path cantaloupesrvcscmue...,Neutral
2,path cantaloupesrvcscmuedudasnewsharvardedunoc...,Neutral
3,path cantaloupesrvcscmuedumagnesiumclubcccmued...,Neutral
4,xref cantaloupesrvcscmuedu altatheism53485 tal...,Positive


In [28]:
#Examine the distribution of sentiments across different categories and summarize your findings.


In [29]:
#cross tabulating sentiments with categories
sentiment_by_category = pd.crosstab(data['Labels'], data['Sentiment'])

#displaying the table
print(sentiment_by_category)

Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                     14       59        27
comp.graphics                    9       54        37
comp.os.ms-windows.misc         11       46        43
comp.sys.ibm.pc.hardware         6       49        45
comp.sys.mac.hardware           10       51        39
comp.windows.x                  10       59        31
misc.forsale                     9       34        57
rec.autos                       11       48        41
rec.motorcycles                 14       46        40
rec.sport.baseball              13       53        34
rec.sport.hockey                20       53        27
sci.crypt                        6       59        35
sci.electronics                  5       60        35
sci.med                         17       57        26
sci.space                        8       53        39
soc.religion.christian           4       65        31
talk.politics.guns          

In [30]:
#Neutral Sentiment Dominates Across All Categories

#In nearly every category, neutral sentiment is the most prevalent, suggesting that many blog posts 
#take on an informational or discussion-oriented tone.

#actegories with Higher Positive Sentiment

#misc.forsale shows a higher count of positive sentiment (57), 
#possibly due to persuasive and enthusiastic language used in sales listings.

#categories with More Negative Sentiment

#talk.politics.guns has the highest number of negative posts (23), 
#which may reflect emotionally charged opinions or debates.


In [31]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#Predictions
y_pred = nb_classifier.predict(x_test)

#Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8175
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.89      0.65        18
           1       0.65      0.83      0.73        18
           2       0.95      0.82      0.88        22
           3       0.95      0.76      0.84        25
           4       0.83      0.95      0.89        21
           5       1.00      0.84      0.91        25
           6       0.91      0.56      0.69        18
           7       0.84      0.89      0.86        18
           8       0.88      0.88      0.88        16
           9       0.74      0.94      0.83        18
          10       0.88      1.00      0.94        15
          11       0.90      1.00      0.95        19
          12       0.59      0.81      0.68        16
          13       0.94      0.88      0.91        17
          14       1.00      0.86      0.92        21
          15       0.81      0.96      0.88        23
          16       0.95      0.71      0

In [32]:
#our Naive Bayes classifier achieved an accuracy of 81.75%

# Classification Metrics:

# Macro Average F1 Score: 0.81
# This shows balanced performance across all classes.

# Weighted Average F1-Score: 0.81
# This takes class imbalance into account and suggests the model performed consistently across the dataset.

# High performance was seen in categories like:
# comp.windows.x: F1-score of 0.91
# sci.crypt: F1-score of 0.95
# talk.politics.mideast: F1-score of 0.95

# Challenging categories:
# talk.religion.misc: F1-score of 0.27
# Precision and recall were low (0.67 and 0.17), indicating the model struggled with correctly identifying or predicting this class.
# comp.graphics: F1-score of 0.65
# Despite high recall (0.89), precision was low (0.52), suggesting many false positives.