## TEXT CLASSIFICATION USING NAIVE BAYES AND SENTIMENT ANALYSIS ON BLOG POSTS

In [None]:
# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

In [None]:
# Download required nltk data

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanket\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanket\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sanket\AppData\Roaming\nltk_data...


True

### Data Exploration and Preprocessing

In [None]:
# Load the dataset

In [2]:
df=pd.read_csv(r'C:\Users\sanket\Desktop\DS ASSIGNMENT\New folder\blogs.csv')
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [5]:
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [None]:
# Check distribution of categories

In [7]:
df['Labels'].value_counts()

alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: Labels, dtype: int64

### Text Preprocessing

In [None]:
# Initialize the lemmatizer and get stopwords

In [8]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [None]:
# Function to clean and preprocess text

In [9]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation and non-alphanumeric characters
    tokens = word_tokenize(text) # Tokenize text
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Remove stopwords and lemmatize each token
    return " ".join(tokens)  # Join tokens back into a single string

In [None]:
# Apply preprocessing to the Data column

In [10]:
df['Cleaned_Data'] = df['Data'].apply(preprocess_text)
df['Cleaned_Data'].head()

0    path cantaloupesrvcscmuedumagnesiumclubcccmued...
1    newsgroups altatheism path cantaloupesrvcscmue...
2    path cantaloupesrvcscmuedudasnewsharvardedunoc...
3    path cantaloupesrvcscmuedumagnesiumclubcccmued...
4    xref cantaloupesrvcscmuedu altatheism53485 tal...
Name: Cleaned_Data, dtype: object

### Feature Extraction with TF-IDF

In [None]:
# Convert text data to TF-IDF features

In [12]:
tfidf = TfidfVectorizer(max_features=5000)  
x= tfidf.fit_transform(df['Cleaned_Data'])
y = df['Labels']  #Target variable

In [13]:
x

<2000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 189407 stored elements in Compressed Sparse Row format>

In [14]:
y

0              alt.atheism
1              alt.atheism
2              alt.atheism
3              alt.atheism
4              alt.atheism
               ...        
1995    talk.religion.misc
1996    talk.religion.misc
1997    talk.religion.misc
1998    talk.religion.misc
1999    talk.religion.misc
Name: Labels, Length: 2000, dtype: object

## Naive Bayes Model for Text Classification

In [None]:
# Split the Data

In [15]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [17]:
# Initialize the Naive Bayes classifier
nb_model = MultinomialNB()

In [19]:
# Train the model
nb_model.fit(x_train, y_train)

In [20]:
#Make predictions
y_pred = nb_model.predict(x_test)

In [21]:
y_pred

array(['talk.politics.misc', 'comp.sys.ibm.pc.hardware', 'sci.med',
       'rec.sport.baseball', 'sci.electronics', 'sci.electronics',
       'rec.sport.baseball', 'talk.politics.mideast', 'alt.atheism',
       'sci.med', 'alt.atheism', 'sci.med', 'sci.crypt', 'comp.windows.x',
       'comp.sys.ibm.pc.hardware', 'comp.os.ms-windows.misc', 'rec.autos',
       'comp.graphics', 'talk.politics.guns', 'talk.politics.misc',
       'comp.sys.mac.hardware', 'alt.atheism', 'alt.atheism',
       'rec.sport.hockey', 'alt.atheism', 'sci.crypt',
       'talk.politics.misc', 'rec.sport.baseball', 'rec.autos',
       'alt.atheism', 'rec.sport.baseball', 'rec.sport.baseball',
       'comp.windows.x', 'rec.sport.baseball', 'rec.sport.hockey',
       'comp.sys.mac.hardware', 'sci.med', 'sci.electronics',
       'rec.sport.hockey', 'comp.os.ms-windows.misc', 'sci.electronics',
       'soc.religion.christian', 'comp.os.ms-windows.misc',
       'comp.windows.x', 'soc.religion.christian', 'rec.motorcycles',

## Sentiment Analysis

In [None]:
# Sentiment Analysis using TextBlob

In [25]:
!pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
    --------------------------------------- 10.2/626.3 kB ? eta -:--:--
   - ------------------------------------- 30.7/626.3 kB 435.7 kB/s eta 0:00:02
   -- ------------------------------------ 41.0/626.3 kB 326.8 kB/s eta 0:00:02
   ----- --------------------------------- 92.2/626.3 kB 585.1 kB/s eta 0:00:01
   ------- ------------------------------ 122.9/626.3 kB 654.9 kB/s eta 0:00:01
   -------------- ----------------------- 245.8/626.3 kB 942.1 kB/s eta 0:00:01
   ----------------- -------------------- 286.7/626.3 kB 983.9 kB/s eta 0:00:01
   ------------------------ --------------- 389.1/626.3 kB 1.1 MB/s eta 0:00:01
   ----------------------------- ---------- 460.8/626.3 kB 1.2 MB/s 

In [27]:
from textblob import TextBlob

In [None]:
# Function to classify sentiment

In [29]:
def get_sentiment(text):
    blob = TextBlob(text)
    if blob.sentiment.polarity > 0: # Determine sentiment polarity
        return "positive"
    elif blob.sentiment.polarity < 0:
        return "negative"
    else:
        return "neutral"

In [None]:
# Apply sentiment analysis on the "Data" column

In [30]:
df['Sentiment'] = df['Data'].apply(get_sentiment)
df[['Data', 'Sentiment']].head()

Unnamed: 0,Data,Sentiment
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,positive
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,negative
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,positive
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,positive
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,positive


In [None]:
# Distribution of sentiments across categories

In [31]:
sentiment_dist = df.groupby('Labels')['Sentiment'].value_counts(normalize=True).unstack()
sentiment_dist

Sentiment,negative,positive
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1
alt.atheism,0.23,0.77
comp.graphics,0.24,0.76
comp.os.ms-windows.misc,0.22,0.78
comp.sys.ibm.pc.hardware,0.2,0.8
comp.sys.mac.hardware,0.24,0.76
comp.windows.x,0.27,0.73
misc.forsale,0.16,0.84
rec.autos,0.17,0.83
rec.motorcycles,0.26,0.74
rec.sport.baseball,0.29,0.71


## Evaluation

In [None]:
# Accuracy, Precision, Recall, and F1 Score

In [32]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

In [None]:
# Print the metrics

In [33]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.83
Precision: 0.8400442312026264
Recall: 0.83
F1 Score: 0.8224973778455688


In [34]:
# Detailed classification report
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.53      0.89      0.67        18
           comp.graphics       0.83      0.83      0.83        18
 comp.os.ms-windows.misc       0.78      0.82      0.80        22
comp.sys.ibm.pc.hardware       0.77      0.80      0.78        25
   comp.sys.mac.hardware       0.86      0.90      0.88        21
          comp.windows.x       0.91      0.84      0.87        25
            misc.forsale       0.92      0.67      0.77        18
               rec.autos       0.94      0.94      0.94        18
         rec.motorcycles       0.83      0.94      0.88        16
      rec.sport.baseball       0.78      1.00      0.88        18
        rec.sport.hockey       0.94      1.00      0.97        15
               sci.crypt       0.86      0.95      0.90        19
         sci.electronics       0.75      0.75      0.75        16
                 sci.med       0.88      0.88      0.88        17
         

In [None]:
# Print sentiment distribution summary

In [35]:
print("Sentiment Distribution by Category:")
print(sentiment_dist)

Sentiment Distribution by Category:
Sentiment                 negative  positive
Labels                                      
alt.atheism                   0.23      0.77
comp.graphics                 0.24      0.76
comp.os.ms-windows.misc       0.22      0.78
comp.sys.ibm.pc.hardware      0.20      0.80
comp.sys.mac.hardware         0.24      0.76
comp.windows.x                0.27      0.73
misc.forsale                  0.16      0.84
rec.autos                     0.17      0.83
rec.motorcycles               0.26      0.74
rec.sport.baseball            0.29      0.71
rec.sport.hockey              0.34      0.66
sci.crypt                     0.19      0.81
sci.electronics               0.19      0.81
sci.med                       0.29      0.71
sci.space                     0.27      0.73
soc.religion.christian        0.13      0.87
talk.politics.guns            0.30      0.70
talk.politics.mideast         0.22      0.78
talk.politics.misc            0.22      0.78
talk.religion.misc 