In [23]:
import re
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [34]:
# Absolute path to the blog-post CSV on the author's machine.
# NOTE(review): not portable -- point this at a project-relative data directory when sharing.
BLOGS_CSV_PATH = r'C:\Users\Admin\Desktop\data science assighnments\blogs.csv'
data = pd.read_csv(BLOGS_CSV_PATH)

In [35]:

# Download NLTK resources
# The stopwords and wordnet corpora are presumably what the stopword filter and
# WordNetLemmatizer in preprocess_text read, and punkt_tab is presumably the
# tokenizer data behind nltk.word_tokenize -- TODO confirm against the NLTK docs.
# The final call is the cell's last expression, so its return value (True in
# the recorded run) is echoed as the cell output.
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
# Preview the first five rows; the recorded output shows a text column 'Data'
# and a newsgroup label column 'Labels'.
data.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [37]:
# Remove any missing values
# Drops every row where 'Data' or 'Labels' is missing. inplace=True mutates
# `data` itself, so the rows removed here cannot be recovered without
# re-running the cell that loads the CSV.
data.dropna(subset=['Data', 'Labels'], inplace=True)


In [38]:

# Keep only ASCII letters and whitespace; digits, punctuation and every other
# symbol are deleted outright (not replaced by a space), so words that were
# separated only by punctuation end up fused into a single token.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
data['Data'] = data['Data'].map(lambda raw_text: non_letter_pattern.sub('', raw_text))


In [39]:
# Lowercase every document using pandas' vectorized string accessor.
data['Data'] = data['Data'].str.lower()


In [40]:

# Check the cleaned data
# In the recorded output, dotted host names appear collapsed into single long
# tokens because the punctuation between their parts was deleted earlier.
print(data['Data'].head())

0    path cantaloupesrvcscmuedumagnesiumclubcccmued...
1    newsgroups altatheism\npath cantaloupesrvcscmu...
2    path cantaloupesrvcscmuedudasnewsharvardedunoc...
3    path cantaloupesrvcscmuedumagnesiumclubcccmued...
4    xref cantaloupesrvcscmuedu altatheism talkreli...
Name: Data, dtype: object


In [41]:
# Text normalization used to build the 'cleaned_text' column.
@lru_cache(maxsize=1)
def _text_resources():
    """Return the (stopword set, lemmatizer) pair, building it on first use.

    preprocess_text runs once per document, so re-reading the stopword corpus
    and constructing a new WordNetLemmatizer on every call is wasted work.
    Caching the pair means both are created only once per kernel session.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: coerce the input to str, lowercase it, tokenize with
    nltk.word_tokenize, drop tokens that are not purely alphanumeric or that
    are English stopwords, then lemmatize the remaining tokens.

    Args:
        text: The raw document. Any value is accepted; it is passed through
            str() before processing.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when no
        token survives the filters).
    """
    stop_words, lemmatizer = _text_resources()
    tokens = nltk.word_tokenize(str(text).lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )



In [42]:
# List the columns present before the preprocessing step adds 'cleaned_text'.
data.columns

Index(['Data', 'Labels'], dtype='object')

In [43]:

# Run every document through preprocess_text and keep the result alongside the
# original text in a new 'cleaned_text' column.
data['cleaned_text'] = data['Data'].map(preprocess_text)


In [44]:
# Generate pseudo-labels with an off-the-shelf sentiment analyzer (NLTK's VADER)
from nltk.sentiment import SentimentIntensityAnalyzer

In [45]:

# Fetch the VADER lexicon before constructing the analyzer; presumably the
# constructor loads that lexicon, so the order matters -- TODO confirm.
nltk.download('vader_lexicon')
# Notebook-level analyzer instance that assign_sentiment reads as a global.
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [46]:
def assign_sentiment(text):
    """Binarize the compound polarity score that `sia` assigns to ``text``.

    Returns 1 when the compound score is strictly greater than zero and 0
    otherwise, so a score of exactly 0 shares the label 0 with negative scores.
    """
    compound = sia.polarity_scores(text)['compound']
    return int(compound > 0)


In [48]:
# Pseudo-label every cleaned document with the 0/1 output of assign_sentiment.
data['sentiment'] = data['cleaned_text'].map(assign_sentiment)

In [49]:
# Feature Extraction
# TF-IDF features over the cleaned text, capped at 5000 vocabulary terms.
# NOTE(review): the vectorizer is fit on every row before train_test_split is
# called, so held-out documents influence the fitted vocabulary and weights.
# Consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_text'])
# Target is the 0/1 pseudo-label produced by assign_sentiment.
y = data['sentiment']

In [50]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is repeatable. No `stratify` argument is passed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Initialize Naive Bayes model
# No arguments are passed, so the estimator uses its default hyperparameters.
model = MultinomialNB()

# Train the model
# Fits on the TF-IDF training matrix and its 0/1 pseudo-labels.
model.fit(X_train, y_train)

In [53]:
# Make predictions on the test set
# y_pred holds one predicted pseudo-label per held-out row.
y_pred = model.predict(X_test)


In [54]:
# Evaluate the model
# The score measures agreement with the analyzer-generated pseudo-labels in
# y_test, not with human-annotated sentiment.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.7200


In [55]:
# Detailed classification report
# In the recorded run, recall is 0.18 for class 0 versus 0.98 for class 1, so
# the overall accuracy mostly reflects the majority class.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.18      0.30       130
           1       0.71      0.98      0.82       270

    accuracy                           0.72       400
   macro avg       0.76      0.58      0.56       400
weighted avg       0.74      0.72      0.65       400



In [57]:
# The post text lives in the 'Data' column; report clearly if it is missing.
if 'Data' not in data.columns:
    # The message names the column that is actually checked ('Data').
    print("Column 'Data' not found. Please update the column name in the code.")
else:
    # Initialize the sentiment analyzer
    sia = SentimentIntensityAnalyzer()


In [58]:

    # Three-way sentiment label derived from the analyzer's compound score
    def categorize_sentiment(text):
        """Label ``text`` as 'positive', 'negative' or 'neutral'.

        The compound score from `sia` is compared against a +/-0.05 band:
        above 0.05 is 'positive', below -0.05 is 'negative', and anything in
        between (inclusive of both bounds) is 'neutral'.
        """
        compound = sia.polarity_scores(text)['compound']
        if compound > 0.05:
            return 'positive'
        if compound < -0.05:
            return 'negative'
        return 'neutral'


In [60]:

    # Apply the function to the dataset
    # NOTE(review): this overwrites the 0/1 'sentiment' column created earlier
    # with string labels, so re-running the feature-extraction cell afterwards
    # would pick up a different target. It also scores 'Data', which by this
    # point has been lowercased and stripped of punctuation.
    data['sentiment'] = data['Data'].apply(categorize_sentiment)


In [63]:
# Display the results
# Shows the cleaned text next to the string label from categorize_sentiment.
print(data[['Data', 'sentiment']].head())

                                                Data sentiment
0  path cantaloupesrvcscmuedumagnesiumclubcccmued...  negative
1  newsgroups altatheism\npath cantaloupesrvcscmu...  positive
2  path cantaloupesrvcscmuedudasnewsharvardedunoc...  negative
3  path cantaloupesrvcscmuedumagnesiumclubcccmued...  negative
4  xref cantaloupesrvcscmuedu altatheism talkreli...  positive
