In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('blogs.csv')

In [3]:
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [4]:
!pip install textblob



In [5]:
!pip install nltk



In [10]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
df['Data'] = df['Data'].str.lower()
display(df.head())

Unnamed: 0,Data,Labels
0,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
1,newsgroups altatheism\npath cantaloupesrvcscmu...,alt.atheism
2,path cantaloupesrvcscmuedudasnewsharvardedunoc...,alt.atheism
3,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
4,xref cantaloupesrvcscmuedu altatheism talkreli...,alt.atheism


In [12]:
import re

def remove_punc_num_urls(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

df['Data'] = df['Data'].apply(remove_punc_num_urls)
display(df.head())

Unnamed: 0,Data,Labels
0,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
1,newsgroups altatheism\npath cantaloupesrvcscmu...,alt.atheism
2,path cantaloupesrvcscmuedudasnewsharvardedunoc...,alt.atheism
3,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
4,xref cantaloupesrvcscmuedu altatheism talkreli...,alt.atheism


In [13]:
from nltk.tokenize import word_tokenize

df['Data'] = df['Data'].apply(word_tokenize)
display(df.head())

Unnamed: 0,Data,Labels
0,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
1,"[newsgroups, altatheism, path, cantaloupesrvcs...",alt.atheism
2,"[path, cantaloupesrvcscmuedudasnewsharvardedun...",alt.atheism
3,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
4,"[xref, cantaloupesrvcscmuedu, altatheism, talk...",alt.atheism


In [14]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df['Data'] = df['Data'].apply(remove_stopwords)
display(df.head())

Unnamed: 0,Data,Labels
0,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
1,"[newsgroups, altatheism, path, cantaloupesrvcs...",alt.atheism
2,"[path, cantaloupesrvcscmuedudasnewsharvardedun...",alt.atheism
3,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
4,"[xref, cantaloupesrvcscmuedu, altatheism, talk...",alt.atheism


In [15]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_text(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

df['Data'] = df['Data'].apply(lemmatize_text)
display(df.head())

Unnamed: 0,Data,Labels
0,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
1,"[newsgroups, altatheism, path, cantaloupesrvcs...",alt.atheism
2,"[path, cantaloupesrvcscmuedudasnewsharvardedun...",alt.atheism
3,"[path, cantaloupesrvcscmuedumagnesiumclubcccmu...",alt.atheism
4,"[xref, cantaloupesrvcscmuedu, altatheism, talk...",alt.atheism


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join the tokens back into strings for TF-IDF vectorization
df['Data'] = df['Data'].apply(lambda tokens: ' '.join(tokens))

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Data'])

# Display the shape of the TF-IDF matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (2000, 42772)


In [17]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df['Labels'], test_size=0.2, random_state=42)

# Print the shapes of the splits
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (1600, 42772)
Shape of X_test: (400, 42772)
Shape of y_train: (1600,)
Shape of y_test: (400,)


In [18]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the Multinomial Naive Bayes model
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train, y_train)

print("Naive Bayes model trained successfully.")

Naive Bayes model trained successfully.


In [21]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment scores from TextBlob
def get_textblob_sentiment(text):
    analysis = TextBlob(text)
    # Return polarity score
    return analysis.sentiment.polarity

# Function to get sentiment scores from VADER
def get_vader_sentiment(text):
    scores = analyzer.polarity_scores(text)
    # Return compound score
    return scores['compound']

# Apply sentiment analysis using TextBlob and VADER
df['TextBlob_Sentiment'] = df['Data'].apply(get_textblob_sentiment)
df['VADER_Sentiment'] = df['Data'].apply(get_vader_sentiment)

# Display the dataframe with new sentiment columns
display(df.head())

Unnamed: 0,Data,Labels,TextBlob_Sentiment,VADER_Sentiment
0,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism,0.054699,-0.9927
1,newsgroups altatheism path cantaloupesrvcscmue...,alt.atheism,-0.00248,0.875
2,path cantaloupesrvcscmuedudasnewsharvardedunoc...,alt.atheism,0.012213,-0.9922
3,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism,0.058241,-0.9996
4,xref cantaloupesrvcscmuedu altatheism talkreli...,alt.atheism,0.149802,0.989


In [22]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the test set
y_pred = nb_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.845
Precision: 0.8658760249638503
Recall: 0.845
F1-score: 0.8359663649041836


**Q: Reflect on the sentiment analysis results and their implications regarding the content of the blog posts.**
**A:** The sentiment analysis results provide insights into the emotional tone of the blog posts across different categories. The TextBlob polarity scores, ranging from -1 (negative) to 1 (positive), and VADER compound scores, typically ranging from -1 (most negative) to 1 (most positive), offer a numerical representation of sentiment.

Observing the distribution of these scores within each label category can reveal dominant sentiments. For instance, if a particular category like 'alt.atheism' or 'talk.religion.misc' shows a tendency towards more negative sentiment scores, it might imply that discussions in these areas often involve criticism, disagreement, or controversial topics. Conversely, categories with higher positive sentiment scores might indicate more supportive or positive discussions.

The sentiment analysis can help in understanding the overall atmosphere or emotional context of each newsgroup. However, it's important to consider the limitations of automated sentiment analysis, especially with nuanced or domain-specific language present in these blog posts. The scores provide a general indication, but a deeper qualitative analysis might be needed to fully understand the complexities of the sentiment expressed. The sentiment analysis results can serve as a complementary perspective to the topic classification, offering insights into *how* topics are discussed rather than just *what* is being discussed.

**Q: Discuss the performance of the Naive Bayes model and any challenges encountered during the classification process.**
**A:** The Naive Bayes model performed effectively as a baseline for text classification, achieving good accuracy and reliable predictions for well-represented classes in the dataset. It efficiently handled TF-IDF features and produced consistent results due to its probabilistic nature. However, some challenges were encountered during the classification process. The dataset contained imbalanced classes, which caused the model to favor majority categories. Additionally, the presence of noisy text such as URLs, special symbols, and inconsistent word forms required careful preprocessing. Sparse TF-IDF features and overlapping vocabulary among different classes also affected precision for certain labels. Despite these issues, the model remained fast, interpretable, and a strong foundation for NLP tasks, with potential improvements possible through class balancing, better preprocessing, or using advanced models like Logistic Regression, SVM, or deep learning approaches.
