In [3]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   ---------------- ----------------------- 262.1/626.3 kB ? eta -:--:--
   ---------------- ----------------------- 262.1/626.3 kB ? eta -:--:--
   ------------------------------- ------ 524.3/626.3 kB 799.2 kB/s eta 0:00:01
   -------------------------------------- 626.3/626.3 kB 831.5 kB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob
import re
import string

In [7]:
# Load the dataset
data = pd.read_csv("blogs.csv")

In [11]:
# Display dataset information
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [13]:
# Check for missing values
data.isnull().sum()

Data      0
Labels    0
dtype: int64

In [15]:
# Drop rows with missing values, if any
data.dropna(inplace=True)


In [17]:
# Preprocess the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove digits
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

data['Cleaned_Text'] = data['Data'].apply(clean_text)


In [19]:
# Tokenization, stopwords removal, and TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(data['Cleaned_Text'])
y = data['Labels']


In [21]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Instantiate the model
model = MultinomialNB()

In [25]:
# Train the model
model.fit(X_train, y_train)


In [27]:
# Make predictions
y_pred = model.predict(X_test)

In [31]:
# Sentiment Analysis

def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'


In [33]:
# Apply sentiment analysis
data['Sentiment'] = data['Data'].apply(get_sentiment)

In [35]:
# Sentiment distribution
sentiment_dist = data.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print("Sentiment distribution across categories:\n", sentiment_dist)

Sentiment distribution across categories:
 Sentiment                 Negative  Positive
Labels                                      
alt.atheism                     23        77
comp.graphics                   24        76
comp.os.ms-windows.misc         22        78
comp.sys.ibm.pc.hardware        20        80
comp.sys.mac.hardware           24        76
comp.windows.x                  27        73
misc.forsale                    16        84
rec.autos                       17        83
rec.motorcycles                 26        74
rec.sport.baseball              29        71
rec.sport.hockey                34        66
sci.crypt                       19        81
sci.electronics                 19        81
sci.med                         29        71
sci.space                       27        73
soc.religion.christian          13        87
talk.politics.guns              30        70
talk.politics.mideast           22        78
talk.politics.misc              22        78
talk.religio

In [37]:
#Evaluation

# Classification metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Accuracy: 0.85
Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.56      0.83      0.67        18
           comp.graphics       0.71      0.83      0.77        18
 comp.os.ms-windows.misc       0.91      0.91      0.91        22
comp.sys.ibm.pc.hardware       0.84      0.84      0.84        25
   comp.sys.mac.hardware       0.87      0.95      0.91        21
          comp.windows.x       0.95      0.84      0.89        25
            misc.forsale       1.00      0.78      0.88        18
               rec.autos       0.90      1.00      0.95        18
         rec.motorcycles       1.00      0.94      0.97        16
      rec.sport.baseball       0.80      0.89      0.84        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.95      1.00      0.97        19
         sci.electronics       0.65      0.69      0.67        16
                 sci.med       0.88 

In [45]:
# Save the processed data and sentiment analysis results to a CSV
output_file = "processed_blog_data.csv"
data.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")

Processed data saved to processed_blog_data.csv
