In [1]:
import pandas as pd

# Path of the sentiment CSV, given relative to the working directory
csv = 'sentiment.csv'

# Parse the file into a DataFrame with pandas' default CSV settings
df_sentiment = pd.read_csv(filepath_or_buffer=csv)

# Kept as the final expression so the first rows are displayed as the cell output
df_sentiment.head(n=5)

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [2]:
# Inspect sentiment dataset
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df_sentiment.info()
# Summary statistics of the numeric columns
print(df_sentiment.describe())
# Number of rows per sentiment label
print(df_sentiment['sentiment'].value_counts())

import re
import string

def preprocess_text(text):
    """Normalise a piece of text for bag-of-words vectorisation.

    The value is coerced to ``str``, lowercased, stripped of every character
    in ``string.punctuation``, and has runs of whitespace collapsed to a
    single space.

    Args:
        text: The raw text. Non-string values are converted with ``str()``;
            ``None`` and float NaN (missing cells) are treated as empty text.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    # Missing values would otherwise turn into the literal tokens "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Coerce before calling any str method so non-string cells do not raise.
    text = str(text).lower()
    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # parsed as an escaped bracket and the backslash itself is never removed.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Add a normalised copy of each row's text (relies on the 'text' and 'sentiment' columns)
df_sentiment['cleaned_text'] = df_sentiment['text'].apply(preprocess_text)

from sklearn.feature_extraction.text import CountVectorizer

# Labels: the sentiment column, used as-is
y = df_sentiment['sentiment']

# Features: fit the vocabulary and count tokens in a single call.
# NOTE(review): fit_transform sees every row and X is only split into
# train/test in a later cell, so test-set text contributes to the
# vocabulary - confirm this is acceptable.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_sentiment['cleaned_text'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13871 entries, 0 to 13870
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         13871 non-null  int64  
 1   candidate                  13775 non-null  object 
 2   candidate_confidence       13871 non-null  float64
 3   relevant_yn                13871 non-null  object 
 4   relevant_yn_confidence     13871 non-null  float64
 5   sentiment                  13871 non-null  object 
 6   sentiment_confidence       13871 non-null  float64
 7   subject_matter             13545 non-null  object 
 8   subject_matter_confidence  13871 non-null  float64
 9   candidate_gold             28 non-null     object 
 10  name                       13871 non-null  object 
 11  relevant_yn_gold           32 non-null     object 
 12  retweet_count              13871 non-null  int64  
 13  sentiment_gold             15 non-null     obj

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained
model = MultinomialNB().fit(X_train, y_train)

# Score the held-out rows and print the per-class metrics
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.73      0.85      0.79      1722
     Neutral       0.48      0.35      0.41       612
    Positive       0.54      0.40      0.46       441

    accuracy                           0.67      2775
   macro avg       0.58      0.54      0.55      2775
weighted avg       0.65      0.67      0.65      2775



In [4]:
# Path of the news-category file, given relative to the working directory
news_file = 'News_Category_Dataset_v3.json'

# lines=True tells pandas to read the file as one JSON object per line
df_news = pd.read_json(path_or_buf=news_file, lines=True)

# Kept as the final expression so the first rows are displayed as the cell output
df_news.head(n=5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [5]:
# Normalise the headlines with the same cleaner applied to the sentiment text
df_news['cleaned_text'] = df_news['headline'].apply(preprocess_text)

from sklearn.feature_extraction.text import CountVectorizer

# Labels: the article category. y is rebound here, replacing the sentiment
# labels assigned in an earlier cell.
y = df_news['category']

# A separate vectorizer, so the news vocabulary is learned independently of
# the sentiment one; X is likewise rebound to the news features.
vectorizer_news = CountVectorizer()
X = vectorizer_news.fit_transform(df_news['cleaned_text'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Naive Bayes baseline: fit() returns the estimator, so it is chained onto construction
model_nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)

from sklearn.linear_model import LogisticRegression

# Logistic regression on the same split; max_iter=1000 sets the solver's
# iteration cap. fit() returns the estimator, so it is chained onto construction.
model_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

In [6]:
# Print per-class precision/recall/F1 for both models on the held-out rows.
# zero_division=0 keeps the score at 0.0 for classes a model never predicts
# (the same value the default reports) without emitting the repeated
# ill-defined-metric warnings seen with the default setting.
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb, zero_division=0))

print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr, zero_division=0))

Naive Bayes Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                precision    recall  f1-score   support

          ARTS       0.56      0.02      0.03       293
ARTS & CULTURE       0.25      0.00      0.01       275
  BLACK VOICES       0.61      0.13      0.22       889
      BUSINESS       0.59      0.24      0.34      1216
       COLLEGE       0.50      0.00      0.01       202
        COMEDY       0.67      0.21      0.31      1022
         CRIME       0.54      0.48      0.51       713
CULTURE & ARTS       0.93      0.06      0.12       202
       DIVORCE       0.92      0.36      0.52       664
     EDUCATION       0.00      0.00      0.00       209
 ENTERTAINMENT       0.44      0.81      0.57      3419
   ENVIRONMENT       1.00      0.02      0.04       313
         FIFTY       0.00      0.00      0.00       263
  FOOD & DRINK       0.66      0.60      0.63      1270
     GOOD NEWS       0.44      0.01      0.03       270
         GREEN       0.41      0.05      0.08       532
HEALTHY LIVING       0.40      0.04      0.07  