In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from textblob import TextBlob

In [7]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\udits\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\udits\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\udits\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [15]:
# Load the Dataset
df= pd.read_csv('blogs.csv')
df.head()


Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None


In [5]:
print(df['Labels'].value_counts())

Labels
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: count, dtype: int64


In [14]:
# Preprocessing
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        words = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]
        return " ".join(words)
    else:
        return ""  # Return an empty string for non-string values

df['Data'] = df['Data'].apply(preprocess_text)

In [16]:
# Feature Extraction 
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Data'])
y = df['Labels']

In [17]:
# Split the Data:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
#Implement and Train the Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

In [20]:
# Sentiment Analysis using TextBlob
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['Sentiment'] = df['Data'].apply(get_sentiment)

In [21]:
# Examine Sentiment Distribution
print(df['Sentiment'].value_counts())
print(df.groupby('Labels')['Sentiment'].value_counts())

Sentiment
positive    1543
negative     457
Name: count, dtype: int64
Labels                    Sentiment
alt.atheism               positive     77
                          negative     23
comp.graphics             positive     76
                          negative     24
comp.os.ms-windows.misc   positive     78
                          negative     22
comp.sys.ibm.pc.hardware  positive     80
                          negative     20
comp.sys.mac.hardware     positive     76
                          negative     24
comp.windows.x            positive     73
                          negative     27
misc.forsale              positive     84
                          negative     16
rec.autos                 positive     83
                          negative     17
rec.motorcycles           positive     74
                          negative     26
rec.sport.baseball        positive     71
                          negative     29
rec.sport.hockey          positive     66
            

Sentiment Distribution
1) Overall Sentiment:
- The dataset exhibits a clear tendency towards positive sentiment, with a significantly larger proportion of blog posts categorized as positive compared to negative.
- This suggests that, in general, the content within these blogs leans towards expressing favorable opinions or experiences.

2) Sentiment Variation Across Categories:
- While overall sentiment is positive, there's substantial variation across different blog categories.
- Categories like "soc.religion.christian" and "rec.autos" display a notably higher concentration of positive sentiment, indicating that discussions within these areas tend to be more optimistic.
- Conversely, categories such as "alt.atheism" and "talk.politics.guns" show a relatively higher presence of negative sentiment, reflecting the potentially contentious nature of these topics.

In [22]:
# Evaluate Naive Bayes Classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.73
                          precision    recall  f1-score   support

             alt.atheism       0.58      0.83      0.68        18
           comp.graphics       0.81      0.72      0.76        18
 comp.os.ms-windows.misc       0.77      0.91      0.83        22
comp.sys.ibm.pc.hardware       0.75      0.84      0.79        25
   comp.sys.mac.hardware       0.83      0.48      0.61        21
          comp.windows.x       1.00      0.16      0.28        25
            misc.forsale       1.00      0.72      0.84        18
               rec.autos       0.76      0.89      0.82        18
         rec.motorcycles       0.87      0.81      0.84        16
      rec.sport.baseball       0.94      0.83      0.88        18
        rec.sport.hockey       0.62      1.00      0.77        15
               sci.crypt       0.58      1.00      0.73        19
         sci.electronics       0.77      0.62      0.69        16
                 sci.med       0.81      0.76      0.79     

Naive Bayes Classification Performance

The Naive Bayes classifier achieved an overall accuracy of 0.73 (73%).

- Strengths: The model performs well for categories like comp.os.ms-windows.misc, rec.sport.baseball, sci.space, and talk.politics.mideast, exhibiting high precision, recall, and F1-scores.
- Weaknesses: Performance is poor for categories like comp.windows.x, talk.religion.misc, and comp.sys.mac.hardware with very low precision and recall.
- Implications: The model is effective for certain topics but struggles with others, potentially due to data imbalances or topic complexity. Further analysis is needed to address the low-performing categories.

Challenges encountered during classification:

- Data Imbalance: Uneven category representation can bias results.
- Topic Overlap: Similar vocabulary across categories hinders distinction.
- Language Ambiguity: Informal language and context confuse the model.
- Limited Data: Insufficient training samples affect category learning.
- TF-IDF Limitations: Ignores semantic relationships and word order.
- Preprocessing Loss: Essential information may be removed.
- Naive Bayes Assumptions: Feature independence doesn't always hold.