# TEXT CLASSIFICATION USING NAIVE BAYES AND SENTIMENT ANALYSIS ON BLOG POSTS

## 1. Data Exploration and Preprocessing

In [179]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Load the dataset 

In [182]:
df = pd.read_csv('blogs.csv')

In [184]:
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [186]:
df['Labels'].value_counts()

Labels
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: count, dtype: int64

### Preprocess the data by cleaning the text (removing punctuation, converting to lowercase, etc.), tokenizing, and removing stopwords.

In [189]:
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [191]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_data'] = df['Data'].apply(preprocess_text)


In [192]:
df

Unnamed: 0,Data,Labels,cleaned_data
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism,newsgroups altatheism path cantaloupesrvcscmue...
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism,path cantaloupesrvcscmuedudasnewsharvardedunoc...
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism,xref cantaloupesrvcscmuedu altatheism53485 tal...
...,...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc,xref cantaloupesrvcscmuedu talkabortion120945 ...
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc,xref cantaloupesrvcscmuedu talkreligionmisc837...
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc,xref cantaloupesrvcscmuedu talkorigins41030 ta...
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc,xref cantaloupesrvcscmuedu talkreligionmisc836...


In [193]:
#dropping the 'data' column
df = df.drop(columns=['Data'])

In [194]:
df

Unnamed: 0,Labels,cleaned_data
0,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...
1,alt.atheism,newsgroups altatheism path cantaloupesrvcscmue...
2,alt.atheism,path cantaloupesrvcscmuedudasnewsharvardedunoc...
3,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...
4,alt.atheism,xref cantaloupesrvcscmuedu altatheism53485 tal...
...,...,...
1995,talk.religion.misc,xref cantaloupesrvcscmuedu talkabortion120945 ...
1996,talk.religion.misc,xref cantaloupesrvcscmuedu talkreligionmisc837...
1997,talk.religion.misc,xref cantaloupesrvcscmuedu talkorigins41030 ta...
1998,talk.religion.misc,xref cantaloupesrvcscmuedu talkreligionmisc836...


### Perform feature extraction to convert text data into a format that can be used by the Naive Bayes model, using techniques such as TF-IDF.

In [196]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()  # You can adjust the max_features as needed

# Fit and transform the cleaned text
X = vectorizer.fit_transform(df['cleaned_data'])

# Get the labels
y = df['Labels']

## 2. Naive Bayes Model for Text Classification

### Split the data into training and test sets.

In [199]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [200]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1400, 56432), (600, 56432), (1400,), (600,))

### Implement a Naive Bayes classifier to categorize the blog posts into their respective categories

In [206]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Naive Bayes model
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train, y_train)

# Make predictions on the test set
ytrain_predict = nb_model.predict(X_train)
ytest_pred = nb_model.predict(X_test)

In [219]:
ytest_pred

array(['alt.atheism', 'comp.sys.ibm.pc.hardware', 'sci.med',
       'rec.sport.baseball', 'sci.electronics', 'sci.electronics',
       'rec.sport.baseball', 'talk.politics.mideast', 'alt.atheism',
       'sci.med', 'alt.atheism', 'sci.crypt', 'sci.crypt',
       'rec.motorcycles', 'comp.sys.ibm.pc.hardware',
       'comp.os.ms-windows.misc', 'rec.autos', 'comp.graphics',
       'talk.politics.guns', 'talk.politics.misc', 'misc.forsale',
       'alt.atheism', 'alt.atheism', 'rec.sport.hockey', 'alt.atheism',
       'sci.crypt', 'sci.crypt', 'rec.sport.baseball', 'rec.autos',
       'alt.atheism', 'sci.crypt', 'rec.sport.hockey', 'comp.windows.x',
       'rec.sport.hockey', 'rec.sport.hockey', 'comp.sys.mac.hardware',
       'sci.med', 'sci.electronics', 'rec.sport.hockey',
       'comp.os.ms-windows.misc', 'sci.electronics',
       'soc.religion.christian', 'comp.os.ms-windows.misc', 'sci.crypt',
       'soc.religion.christian', 'rec.motorcycles', 'comp.windows.x',
       'rec.motorcycl

## 3. Sentiment Analysis

In [20]:
!pip install nltk



In [225]:
nltk.download('vader_lexicon', quiet=True)

True

In [22]:
! pip install vaderSentiment



## Performing sentiment analysis

In [242]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# Download VADER lexicon if not already downloaded
nltk.download('vader_lexicon', quiet = True)

# Initialize SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Sentiment analysis function using VADER
def get_sentiment(text):
    sentiment_score = analyzer.polarity_scores(text)['compound']  # Get compound score

    if sentiment_score > 0:
        return 'positive'
    elif sentiment_score < 0:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis to the 'cleaned_data' column
df['sentiment'] = df['cleaned_data'].apply(get_sentiment)

In [243]:
polarity_scores = analyzer.polarity_scores(df.loc[0]['cleaned_data'])
polarity_scores

{'neg': 0.204, 'neu': 0.645, 'pos': 0.151, 'compound': -0.9896}

### Analyze the sentiments expressed in the blog posts and categorize them as positive, negative, or neutral

In [253]:
df[['cleaned_data', 'sentiment']]

Unnamed: 0,cleaned_data,sentiment
0,path cantaloupesrvcscmuedumagnesiumclubcccmued...,negative
1,newsgroups altatheism path cantaloupesrvcscmue...,positive
2,path cantaloupesrvcscmuedudasnewsharvardedunoc...,negative
3,path cantaloupesrvcscmuedumagnesiumclubcccmued...,negative
4,xref cantaloupesrvcscmuedu altatheism53485 tal...,positive
...,...,...
1995,xref cantaloupesrvcscmuedu talkabortion120945 ...,positive
1996,xref cantaloupesrvcscmuedu talkreligionmisc837...,positive
1997,xref cantaloupesrvcscmuedu talkorigins41030 ta...,positive
1998,xref cantaloupesrvcscmuedu talkreligionmisc836...,positive


### Examine the distribution of sentiments across different categories 

In [26]:
sentiment_distribution = df.groupby(['Labels', 'sentiment']).size().unstack()

print(sentiment_distribution)

sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                   40.0      1.0      59.0
comp.graphics                 10.0      2.0      88.0
comp.os.ms-windows.misc       22.0      2.0      76.0
comp.sys.ibm.pc.hardware      19.0      1.0      80.0
comp.sys.mac.hardware         18.0      3.0      79.0
comp.windows.x                21.0      2.0      77.0
misc.forsale                   9.0      8.0      83.0
rec.autos                     27.0      1.0      72.0
rec.motorcycles               32.0      1.0      67.0
rec.sport.baseball            25.0      1.0      74.0
rec.sport.hockey              30.0      NaN      70.0
sci.crypt                     23.0      NaN      77.0
sci.electronics               13.0      3.0      84.0
sci.med                       28.0      1.0      71.0
sci.space                     29.0      3.0      68.0
soc.religion.christian        28.0      NaN      72.0
talk.politics.guns          

## 4. Evaluation

### Evaluate the performance of your Naive Bayes classifier using metrics such as accuracy, precision, recall, and F1-score.

#### Predicted Training Data

In [28]:
accuracy = accuracy_score(y_train, ytrain_predict)
report = classification_report(y_train, ytrain_predict)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.9914285714285714
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.92      1.00      0.96        76
           comp.graphics       1.00      1.00      1.00        72
 comp.os.ms-windows.misc       1.00      1.00      1.00        67
comp.sys.ibm.pc.hardware       1.00      1.00      1.00        64
   comp.sys.mac.hardware       1.00      1.00      1.00        64
          comp.windows.x       1.00      1.00      1.00        64
            misc.forsale       1.00      1.00      1.00        76
               rec.autos       1.00      1.00      1.00        69
         rec.motorcycles       1.00      1.00      1.00        78
      rec.sport.baseball       1.00      1.00      1.00        68
        rec.sport.hockey       1.00      1.00      1.00        75
               sci.crypt       1.00      1.00      1.00        76
         sci.electronics       1.00      1.00      1.00        73
                 sci.me

#### Predicted Testing Data

In [29]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.7566666666666667
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.40      0.96      0.57        24
           comp.graphics       0.61      0.82      0.70        28
 comp.os.ms-windows.misc       0.90      0.82      0.86        33
comp.sys.ibm.pc.hardware       0.84      0.58      0.69        36
   comp.sys.mac.hardware       0.96      0.67      0.79        36
          comp.windows.x       1.00      0.56      0.71        36
            misc.forsale       0.62      0.83      0.71        24
               rec.autos       0.96      0.81      0.88        31
         rec.motorcycles       0.66      0.95      0.78        22
      rec.sport.baseball       0.97      0.91      0.94        32
        rec.sport.hockey       0.89      0.96      0.92        25
               sci.crypt       0.48      1.00      0.65        24
         sci.electronics       0.66      0.70      0.68        27
                 sci.me

### Model Performance:
#### Training Accuracy: 99.14% (Very high, suggesting good learning from the training data).
#### Test Accuracy: 75.67% (Lower, indicating possible overfitting to the training data).
### Challenges:
#### Overfitting: The model performs well on the training data but not as well on unseen test data.
#### Class Imbalance: Some categories are underrepresented, causing misclassification.
### Sentiment Analysis:
#### Sentiment Distribution: Most categories have a positive sentiment, followed by negative and neutral.
### Insights:
#### Positive Sentiment: Common in technical topics (e.g., comp.graphics, sci.crypt).
#### Negative Sentiment: Found in sensitive topics like alt.atheism and talk.politics.guns.


### Summary Highlights:
#### -> The model works well on training data but struggles with generalization to new data.
#### ->Sentiment analysis shows that the content of the blog posts influences sentiment distribution, with technical content being mostly
   ####   positive and political/religious content being more mixed.