# NASA Climate Comments — Sentiment Analysis (Fixed Version)

This notebook analyzes public sentiment in NASA Climate Facebook comments (2020–2023). Comments are first labeled with VADER, and a TF-IDF + Logistic Regression classifier is then trained on those labels. This version expects the comment text to be in a column named `text` (lowercase).


In [3]:
# Install dependencies if missing
# !pip install pandas numpy matplotlib seaborn scikit-learn nltk vaderSentiment wordcloud tqdm joblib


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load local dataset
# The CSV is located by file name instead of a machine-specific absolute path:
# the working directory is checked first, then the current user's Downloads folder.
DATA_FILENAME = "climate_nasa (3).csv"
_candidate_paths = [
    DATA_FILENAME,
    os.path.join(os.path.expanduser("~"), "Downloads", DATA_FILENAME),
]
DATA_PATH = next((p for p in _candidate_paths if os.path.exists(p)), None)
if DATA_PATH is None:
    # Fail early with the list of searched locations rather than a bare pandas error.
    raise FileNotFoundError(
        f"Could not find {DATA_FILENAME!r}; looked in: {_candidate_paths}"
    )

df = pd.read_csv(DATA_PATH)
print('Loaded dataset:', df.shape)
print('Columns:', df.columns.tolist())
df.head()


TypeError: read_csv() got an unexpected keyword argument 'errors'

In [None]:
# Quick EDA
print('Missing values per column:')
print(df.isnull().sum())

# Comment length in characters. Missing comments count as length 0: casting NaN
# straight to str would produce the literal 'nan' and a misleading length of 3.
df['text_length'] = df['text'].fillna('').astype(str).str.len()
df['text_length'].describe()


In [None]:
# Clean text
import re, string

# Built once: translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def clean_text(s):
    """Normalize one raw comment for downstream text processing.

    Steps, in order: lowercase, drop URLs (http... / www....), drop @mentions
    and #hashtags (the whole token), delete ASCII punctuation, then collapse
    runs of whitespace and trim the ends.

    Parameters
    ----------
    s : object
        The raw comment. Non-string values are converted with ``str``.
        ``None`` and float NaN (how pandas represents a missing cell) are
        treated as empty text instead of becoming the words 'none' / 'nan'.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing input.
    """
    # `s != s` is only true for NaN, so this catches missing cells without extra imports.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    s = str(s).lower()
    s = re.sub(r'http\S+|www\.\S+', '', s)
    s = re.sub(r'@\w+|#\w+', '', s)
    s = s.translate(_PUNCTUATION_TABLE)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

# Run the cleaner over every comment (with a tqdm progress bar), store the
# result as a new column, and preview raw vs. cleaned text side by side.
cleaned_comments = df['text'].progress_map(clean_text)
df['clean_text'] = cleaned_comments
df[['text', 'clean_text']].head()


In [None]:
# VADER sentiment labeling
# NOTE(review): scores are computed on `clean_text` (lowercased, punctuation removed).
# VADER's documentation describes capitalization and punctuation as intensity cues,
# so scoring the raw `text` column may be worth evaluating — confirm before changing.
analyzer = SentimentIntensityAnalyzer()

# Compound-score cut-offs separating the three sentiment classes.
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05

def label_from_compound(comp):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'."""
    if comp >= POSITIVE_THRESHOLD:
        return 'positive'
    if comp <= NEGATIVE_THRESHOLD:
        return 'negative'
    return 'neutral'

def vader_label(text):
    """Score `text` with VADER and return its sentiment label.

    Returns 'positive' when the compound score is >= 0.05, 'negative' when it
    is <= -0.05, and 'neutral' otherwise.
    """
    return label_from_compound(analyzer.polarity_scores(text)['compound'])

# Each comment is scored once; the label is derived from the stored score
# instead of running the analyzer a second time per row.
df['vader_compound'] = df['clean_text'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
df['sentiment'] = df['vader_compound'].apply(label_from_compound)

print('Sentiment distribution:')
print(df['sentiment'].value_counts())


In [None]:
# Visualization
# Bar chart of how many comments fall into each sentiment class,
# drawn on an explicitly created axes object.
fig, ax = plt.subplots()
sns.countplot(
    x='sentiment',
    data=df,
    order=['positive', 'neutral', 'negative'],
    ax=ax,
)
ax.set_title('Sentiment Distribution')
plt.show()


In [None]:
# Wordclouds (optional)
# `wordcloud` is an optional dependency: if it cannot be imported the cell is
# skipped with a message instead of stopping the notebook.
try:
    from wordcloud import WordCloud
except Exception as e:
    print('Wordcloud skipped:', e)
else:
    for label in ['positive', 'neutral', 'negative']:
        text_data = ' '.join(df.loc[df['sentiment'] == label, 'clean_text'].astype(str))
        if not text_data.strip():
            continue  # no usable text for this class
        # Failures are isolated per label so that one problematic class does not
        # cancel the clouds for the remaining classes.
        try:
            wc = WordCloud(width=800, height=400, background_color='white').generate(text_data)
            plt.figure(figsize=(10, 5))
            plt.imshow(wc, interpolation='bilinear')
            plt.axis('off')
            plt.title(f'WordCloud for {label}')
            plt.show()
        except Exception as e:
            print(f'Wordcloud skipped for {label}:', e)


In [None]:
# Train simple ML classifier
# Inputs are the cleaned comments; targets are the VADER-derived labels.
X, y = df['clean_text'], df['sentiment']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The TF-IDF vocabulary (unigrams + bigrams) is learned from the training split
# only and then reused unchanged on the test split.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test_tfidf)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Persist both fitted objects.
for _fitted_object, _output_file in (
    (model, 'sentiment_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(_fitted_object, _output_file)
print('Model and vectorizer saved.')


In [None]:
# Confusion Matrix
# The same class order is used for the matrix rows/columns and both tick labels.
class_order = ['positive', 'neutral', 'negative']
cm = confusion_matrix(y_test, y_pred, labels=class_order)

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_order,
    yticklabels=class_order,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Save predictions
# Predictions are generated for every row, including the rows the model was
# trained on, so agreement in this file is not a held-out accuracy estimate.
df['predicted_sentiment'] = model.predict(vectorizer.transform(df['clean_text']))
df[['date','profileName','text','sentiment','predicted_sentiment']].to_csv('predictions.csv', index=False)
print('Predictions saved to predictions.csv')

# Preview a few rows. The sample size is capped at the number of rows (sample(10)
# raises on smaller frames) and seeded so the preview is the same on every run.
df[['text','sentiment','predicted_sentiment']].sample(n=min(10, len(df)), random_state=42)
