In [17]:
import nltk
nltk.download('punkt_tab')
nltk.data.find('tokenizers/punkt')
# Install required libraries
!pip install pandas matplotlib seaborn nltk wordcloud gensim pyLDAvis

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# ✅ Fix: Manually download missing resources
nltk.download('vader_lexicon')
nltk.download('stopwords')

try:
    nltk.download('punkt')
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("⚠️ Error: 'punkt' tokenizer not found. Trying to download again...")
    nltk.download('punkt')
    nltk.data.find('tokenizers/punkt')

# Load dataset
file_path = "climate_nasa.csv"
df = pd.read_csv(file_path)

# Display column names
print("\n📌 Columns in dataset:", df.columns)

# Detect text column dynamically
text_column = None
for col in df.columns:
    if df[col].dtype == "object" and df[col].str.len().mean() > 50 and df[col].str.isalpha().any():
        text_column = col
        break

if text_column:
    print(f"\n✅ Using '{text_column}' for Sentiment & Topic Analysis")
else:
    print("\n⚠️ No suitable text column found! Skipping Sentiment & Topic Analysis.")

# ----------------- FIXED TOPIC MODELING ----------------- #
if text_column:
    from nltk.corpus import stopwords

    # ✅ Fix: Ensure punkt is downloaded before tokenization
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    # Tokenization & Stopword Removal
    df['Processed_Text'] = df[text_column].apply(lambda x: word_tokenize(str(x).lower()))
    df['Processed_Text'] = df['Processed_Text'].apply(lambda x: [word for word in x if word.isalpha()])
    df['Processed_Text'] = df['Processed_Text'].apply(lambda x: [word for word in x if word not in stopwords.words('english')])

    # Create Dictionary and Corpus for LDA
    dictionary = corpora.Dictionary(df['Processed_Text'])
    corpus = [dictionary.doc2bow(text) for text in df['Processed_Text']]

    # Train LDA Model
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, passes=10, random_state=42)

    # Display Topics
    print("\n📌 Top Topics Identified:")
    for idx, topic in lda_model.print_topics(-1):
        print(f"🔹 Topic {idx+1}: {topic}")

    # Visualize Topics
    pyLDAvis.enable_notebook()
    lda_visualization = gensimvis.prepare(lda_model, corpus, dictionary)
    try:
    # Attempt to display within the notebook
            display(lda_visualization)
    except Exception as e:
    # If display fails, save to HTML
             print("⚠️ Displaying in the notebook failed. Saving to 'lda_visualization.html'")
             pyLDAvis.save_html(lda_visualization, 'lda_visualization.html')




    pyLDAvis.display(lda_visualization)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!




[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



📌 Columns in dataset: Index(['date', 'likesCount', 'profileName', 'commentsCount', 'text'], dtype='object')

✅ Using 'text' for Sentiment & Topic Analysis

📌 Top Topics Identified:
🔹 Topic 1: 0.012*"climate" + 0.011*"carbon" + 0.009*"change" + 0.009*"atmosphere" + 0.009*"dioxide" + 0.008*"years" + 0.008*"global" + 0.008*"water" + 0.008*"ice" + 0.008*"year"
🔹 Topic 2: 0.020*"climate" + 0.013*"change" + 0.009*"global" + 0.008*"planet" + 0.008*"warming" + 0.006*"earth" + 0.006*"people" + 0.005*"one" + 0.005*"human" + 0.005*"also"
🔹 Topic 3: 0.009*"years" + 0.009*"earth" + 0.008*"would" + 0.008*"climate" + 0.007*"change" + 0.006*"sun" + 0.006*"global" + 0.005*"like" + 0.005*"warming" + 0.005*"us"
🔹 Topic 4: 0.009*"earth" + 0.009*"nan" + 0.007*"nasa" + 0.007*"planet" + 0.006*"one" + 0.005*"energy" + 0.004*"need" + 0.004*"save" + 0.004*"time" + 0.004*"would"
