In [None]:
# Cell 1: Installations
pip install pandas
pip install langchain_community
pip install replicate
pip install tqdm

In [None]:
# Cell 2: API and Model Setup
import os
import pandas as pd
from google.colab import userdata
from langchain_community.llms import Replicate
from tqdm import tqdm

# Load the API key from Colab Secrets and export it as the
# REPLICATE_API_TOKEN environment variable (the variable named in the
# success message below).
secret_name = 'api_token'  # name of the Colab secret that is read
try:
    api_token = userdata.get(secret_name)
    if not api_token:
        # Fail loudly instead of exporting an empty credential.
        raise ValueError(f"Colab secret '{secret_name}' is empty.")
    # The environment variable needs a real name: the previous empty key ("")
    # made this assignment fail, so the token was never exported.
    os.environ["REPLICATE_API_TOKEN"] = api_token
    print("✅ REPLICATE_API_TOKEN loaded successfully.")
except Exception as e:
    print(f"🚨 Error loading API token: {e}")
    # Point the user at the secret name this cell actually reads.
    print(f"Please ensure you have a secret named '{secret_name}' in your Colab secrets.")

# Build the IBM Granite client (served through Replicate).
# Construction is attempted regardless of whether the token step above succeeded;
# any failure is reported here rather than raised.
try:
    llm = Replicate(
        model="ibm-granite/granite-3.3-8b-instruct",
        # Generation settings forwarded to the model: temperature and output cap.
        model_kwargs={"temperature": 0.1, "max_new_tokens": 100},
    )
except Exception as init_error:
    print(f"🚨 Error initializing the model: {init_error}")
else:
    print("✅ IBM Granite model initialized.")

In [None]:
# Cell 3: Load the Data
# Raw GitHub location of the training CSV
csv_url = 'https://raw.githubusercontent.com/nikjohn7/Disaster-Tweets-Kaggle/main/data/train.csv'

# Pull the CSV into a DataFrame; any failure inside this cell is reported by the
# except branch at the bottom instead of being raised.
try:
    df = pd.read_csv(csv_url)
    total_rows = df.shape[0]
    print(f"✅ Successfully loaded {total_rows} total tweets.")

    # --- CRITICAL STEP FOR SPEED ---
    # Only the first `sample_size` rows are kept for the rest of the notebook.
    sample_size = 100
    df_sample = df.iloc[:sample_size]
    sampled_rows = df_sample.shape[0]
    print(f"🔥 Created a sample of {sampled_rows} tweets to process.")

    # Show the top five rows of the sample as a quick sanity check
    print("\n--- Data Preview ---")
    preview_rows = df_sample.iloc[:5]
    print(preview_rows)

except Exception as load_error:
    print(f"🚨 Error loading data from URL: {load_error}")

In [None]:
# Cell 4: Part 1 - Data Classification
# Our goal: Assign a predefined category to each tweet to structure the chaos.

# Canonical category names. Model replies are mapped back onto these so that
# later exact-match filters and value counts are not split by stray quotes,
# casing differences, or extra words in the reply.
CATEGORIES = (
    'Urgent Plea for Help',
    'Infrastructure Damage Report',
    'Informational News/Update',
    'Irrelevant/Noise',
)

# Define the classification prompt template
prompt_template = """
Analyze the tweet below and classify it into ONE of the following categories:
- 'Urgent Plea for Help': A direct request for assistance from someone in danger.
- 'Infrastructure Damage Report': A report of damage to bridges, roads, power lines, etc.
- 'Informational News/Update': A news link, official update, or general information.
- 'Irrelevant/Noise': A prayer, joke, or comment not related to the disaster itself.

Provide only the category name as your answer.

Tweet: "{tweet_text}"

Classification:
"""


def normalize_classification(raw_response):
    """Map a raw model reply onto one of CATEGORIES when possible.

    The match is case-insensitive and substring-based, so replies such as
    "'Urgent Plea for Help'" or "Classification: urgent plea for help." resolve
    to the canonical name. If several category names occur in the reply, the one
    that appears first in the text wins.

    Args:
        raw_response: The value returned by the model call.

    Returns:
        The canonical category name, or the whitespace-stripped reply unchanged
        when no known category name is found in it.
    """
    cleaned = str(raw_response).strip()
    lowered = cleaned.casefold()
    # (position, name) for every category that occurs in the reply
    hits = [
        (lowered.find(category.casefold()), category)
        for category in CATEGORIES
    ]
    hits = [(position, category) for position, category in hits if position >= 0]
    if hits:
        return min(hits)[1]
    return cleaned


# Create an empty list to store our classification results
classification_results = []

# Use tqdm for a progress bar!
# Iterate the two needed columns directly instead of building a Series per row.
for tweet_id, tweet_text in tqdm(
    zip(df_sample['id'], df_sample['text']),
    total=len(df_sample),
    desc="Classifying Tweets",
):
    final_prompt = prompt_template.format(tweet_text=tweet_text)

    try:
        response = llm.invoke(final_prompt)
        classification = normalize_classification(response)
    except Exception as e:
        # Keep the row (flagged as ERROR) so one failed call does not drop a tweet.
        print(f"🚨 Error classifying tweet {tweet_id}: {e}")
        classification = 'ERROR'

    classification_results.append({
        'tweet_id': tweet_id,
        'tweet_text': tweet_text,
        'classification': classification
    })

# Convert the results list into a new DataFrame for the next steps
results_df = pd.DataFrame(classification_results)
print("\n✅ Classification phase complete.")

In [None]:
# Cell 5: Analyzing the Classification Results
import matplotlib.pyplot as plt
import seaborn as sns

# Headline for the classification breakdown
print("--- 📊 INSIGHT FROM CLASSIFICATION ---")
print("This breakdown shows us where to focus our attention.\n")

# Tally how many tweets fall under each classification label
classification_counts = results_df['classification'].value_counts()
print(classification_counts)

# --- Visualization ---
# One bar per label, drawn on an explicitly created axes object.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=classification_counts.index,
    y=classification_counts.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribution of Disaster Tweet Classifications', fontsize=16)
ax.set_ylabel('Number of Tweets', fontsize=12)
ax.set_xlabel('Classification Category', fontsize=12)
# Slant the category labels so long names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Cell 6: Part 2 - Data Summarization
# Our goal: Distill the most critical classified data into a high-level summary.

print("\n\n--- 📝 INITIATING SUMMARIZATION PHASE ---")

# First, isolate the tweets that are most important
# We'll focus on 'Urgent Plea for Help'
target_category = 'Urgent Plea for Help'

# Match the label case-insensitively as a literal substring, so replies that
# carry quotes, a trailing period, or a "Classification:" prefix are still
# selected. A strict `==` comparison would silently drop them.
is_urgent = (
    results_df['classification']
    .astype(str)
    .str.contains(target_category, case=False, regex=False)
)
urgent_pleas_df = results_df[is_urgent]

# Check if we have any urgent pleas to summarize
if not urgent_pleas_df.empty:
    print(f"Found {len(urgent_pleas_df)} urgent pleas. Compiling for summary...")

    # Combine all the urgent tweets into a single bulleted block of text.
    # Formatting via f-string also tolerates non-string values in the column.
    all_pleas_text = "\n".join(f"- {tweet}" for tweet in urgent_pleas_df['tweet_text'])

    # Create a powerful summarization prompt
    summarization_prompt = f"""
You are an expert emergency services analyst. Your job is to provide a quick intelligence briefing.
Read the following collection of urgent pleas for help from a disaster zone and provide a concise summary in 3-4 bullet points.
Focus on identifying recurring themes, types of emergencies (e.g., trapped people, fires, injuries), and any specific locations mentioned.

Urgent Tweets:
{all_pleas_text}

Intelligence Briefing:
"""

    print("\nSending compiled pleas to IBM Granite for summarization...")
    try:
        # Get the summary from the LLM
        # NOTE(review): the setup cell configures the model with
        # max_new_tokens=100, which may cut a 3-4 bullet briefing short —
        # confirm and raise the limit there if summaries come back truncated.
        summary_response = llm.invoke(summarization_prompt)

        # --- 🚨 FINAL KEY INSIGHT 🚨 ---
        print("\n\n--- EXECUTIVE SUMMARY OF URGENT PLEAS ---")
        print(summary_response)

    except Exception as e:
        print(f"🚨 Error during summarization: {e}")

else:
    print("\nNo tweets were classified as 'Urgent Plea for Help' in this sample. Skipping summarization.")