In [1]:
import pandas as pd
import random

# Define sample tweets for each sentiment category
positive_tweets = [
    "Vaccines save lives! Get vaccinated and protect your community.",
    "Mental health matters. Take time to care for yourself and others.",
    "Regular exercise and a balanced diet lead to a healthier life.",
    "Public health campaigns are spreading awareness effectively!",
    "Grateful for healthcare workers who tirelessly help those in need.",
]

negative_tweets = [
    "The hospital wait times are getting worse. This is unacceptable!",
    "Fake news about medicine is spreading faster than facts. Scary!",
    "Pollution levels are rising, and nothing is being done about it!",
    "Public healthcare is underfunded. We need better policies now!",
    "People are still refusing vaccines, putting others at risk. Frustrating!",
]

neutral_tweets = [
    "Health officials are discussing the new policies for next year.",
    "More research is being done on nutrition and its effects on health.",
    "A study shows that exercise improves mental well-being.",
    "New statistics on public health funding have been released today.",
    "Experts are analyzing the effects of the flu season on hospitals.",
]




In [2]:
# Generate a balanced dataset
num_samples = 1000
tweets = []
sentiments = []

for _ in range(num_samples // 3):
    tweets.append(random.choice(positive_tweets))
    sentiments.append("Positive")

    tweets.append(random.choice(negative_tweets))
    sentiments.append("Negative")

    tweets.append(random.choice(neutral_tweets))
    sentiments.append("Neutral")



In [4]:
# Shuffle the dataset to ensure randomness
data = list(zip(tweets, sentiments))
random.shuffle(data)
tweets, sentiments = zip(*data)

# Create DataFrame
df = pd.DataFrame({"Tweet": tweets, "Sentiment": sentiments})

# Save as CSV file
df.to_csv("public_health_tweets.csv", index=False)

print("Balanced dataset saved as 'public_health_tweets.csv'")

Balanced dataset saved as 'public_health_tweets.csv'


In [5]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [6]:
# Load dataset
df = pd.read_csv("public_health_tweets.csv")
df.head(10)


Unnamed: 0,Tweet,Sentiment
0,Grateful for healthcare workers who tirelessly...,Positive
1,The hospital wait times are getting worse. Thi...,Negative
2,"Pollution levels are rising, and nothing is be...",Negative
3,Health officials are discussing the new polici...,Neutral
4,Mental health matters. Take time to care for y...,Positive
5,A study shows that exercise improves mental we...,Neutral
6,New statistics on public health funding have b...,Neutral
7,Mental health matters. Take time to care for y...,Positive
8,The hospital wait times are getting worse. Thi...,Negative
9,Mental health matters. Take time to care for y...,Positive


In [7]:
# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@w+|\#', '', text)  # Remove mentions and hashtags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    return text

In [8]:
# Apply cleaning
df['Tweet'] = df['Tweet'].apply(clean_text)

In [9]:
# Encode Sentiments (Convert categorical labels to numerical)
df['Sentiment'] = df['Sentiment'].map({'Positive': 1, 'Negative': 0, 'Neutral': 2})


In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['Tweet'], df['Sentiment'], test_size=0.2, random_state=42)


In [11]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)  # Limit vocab to 5000 words
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Naive Bayes": MultinomialNB()
}

In [13]:
# Train & Evaluate
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    print(f"\n{name} Model:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))


Logistic Regression Model:
Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        64
           1       1.00      1.00      1.00        68
           2       1.00      1.00      1.00        68

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Random Forest Model:
Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        64
           1       1.00      1.00      1.00        68
           2       1.00      1.00      1.00        68

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Naive Bayes Model:
Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score  