<a href="https://colab.research.google.com/github/TejasVijaya74/Project-Infy-Chimera/blob/main/ProjectChimera.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Phase 1 : Initial environment setup and data collection pipeline**

Step 1: Install Libraries & Mount Google Drive

In [4]:
# Install required Python libraries
# newsapi-python: A simple Python client for the News API
# tweepy: The official Python client for the X (formerly Twitter) API
# pandas: Useful for data handling and viewing (optional but recommended)
!pip install newsapi-python tweepy pandas

# Import necessary libraries
import os
import json
from datetime import datetime
from google.colab import drive
from google.colab import userdata # For securely accessing API keys

# Mount your Google Drive to the Colab environment at /content/drive.
# This will prompt you for authorization the first time you run it.
# The collection cells in this notebook save their JSON files under
# /content/drive/MyDrive/ProjectChimera/data.
drive.mount('/content/drive')

# Printed unconditionally once the statements above have finished.
print("Libraries installed and Google Drive mounted successfully.")

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7
Mounted at /content/drive
Libraries installed and Google Drive mounted successfully.


Step 2: Securely Handling Your API Keys

In [5]:
# Access the API keys from Colab's secret manager
def _read_colab_secret(name):
    """Return the Colab secret ``name``, or None when it cannot be read.

    NOTE(review): Colab's ``userdata.get`` is understood to raise
    (SecretNotFoundError / NotebookAccessError) instead of returning None when
    a secret is missing or notebook access is off -- confirm against the Colab
    docs. Catching here keeps the friendly check below reachable.
    """
    try:
        return userdata.get(name)
    except Exception as exc:  # best-effort boundary: report and carry on
        print(f"WARNING: could not read secret '{name}': {exc}")
        return None


news_api_key = _read_colab_secret('NEWS_API_KEY')
twitter_bearer_token = _read_colab_secret('TWITTER_BEARER_TOKEN')

# A quick check to ensure keys are loaded (both must be non-empty)
if news_api_key and twitter_bearer_token:
    print("API keys loaded successfully.")
else:
    print("ERROR: Could not find API keys. Please check your Colab Secrets settings.")

API keys loaded successfully.


Step 3: Data Collection from News API

In [6]:
from newsapi import NewsApiClient

def collect_google_news_data(api_key, query, folder_path):
    """
    Collects news data for a given query and saves it to a timestamped JSON file.

    Args:
        api_key: News API key handed to ``NewsApiClient``.
        query: Search phrase sent to the API. It is also used as the filename
            prefix after characters unsafe in filenames are replaced by ``_``.
        folder_path: Directory the JSON file is written to (created if missing).

    Returns:
        None. Progress and errors are printed; failures are not raised so a
        notebook run keeps going (best-effort collection).
    """
    import re  # local import: only needed to sanitize the filename

    try:
        # Initialize the client
        newsapi = NewsApiClient(api_key=api_key)

        # Fetch the articles
        all_articles = newsapi.get_everything(
            q=query,
            language='en',
            sort_by='relevancy'  # Options: relevancy, popularity, publishedAt
        )
        articles = all_articles['articles']

        # Create the directory in Google Drive if it doesn't exist
        os.makedirs(folder_path, exist_ok=True)

        # Create a unique, timestamped filename. Anything other than word
        # characters, '.' and '-' becomes '_' so a query such as "AI/ML" cannot
        # inject a path separator; plain queries keep the same name as before
        # (spaces -> underscores).
        safe_query = re.sub(r'[^\w.-]', '_', query)
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        filename = f"{safe_query}_{timestamp}.json"
        filepath = os.path.join(folder_path, filename)

        # Save the data to the file (explicit encoding: do not depend on the
        # platform default)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=4)

        print(f" Successfully collected {len(articles)} articles for '{query}'.")
        print(f"   Saved to: {filepath}")

    except Exception as e:
        # Deliberate catch-all: API, network and filesystem problems are
        # reported instead of aborting the notebook cell.
        print(f" An error occurred: {e}")

# --- Let's Run It! ---
# Define the path in your Google Drive where news data will be stored.
# This is the 'news' subfolder of the data directory that
# check_for_trends_and_alerts() scans later in the notebook.
news_save_path = '/content/drive/MyDrive/ProjectChimera/data/news'
# Define your search query (also becomes the prefix of the saved filename)
search_query = "Artificial Intelligence"

# Uses news_api_key loaded in the "API keys" cell above.
collect_google_news_data(news_api_key, search_query, news_save_path)

 Successfully collected 100 articles for 'Artificial Intelligence'.
   Saved to: /content/drive/MyDrive/ProjectChimera/data/news/Artificial_Intelligence_2025-09-04_05-59-14.json


Step 4: Data Collection from X (Twitter) API

In [9]:
import tweepy

def collect_twitter_data(bearer_token, query, folder_path):
    """
    Collects recent tweets for a given query and saves them to a timestamped JSON file.

    Args:
        bearer_token: X (Twitter) API bearer token handed to ``tweepy.Client``.
        query: Search phrase; ``-is:retweet`` is appended to exclude retweets.
            It is also used as the filename prefix after characters unsafe in
            filenames are replaced by ``_``.
        folder_path: Directory the JSON file is written to (created if missing).

    Returns:
        None. Progress and errors are printed; failures are not raised so a
        notebook run keeps going (best-effort collection). Nothing is written
        when the search returns no tweets.
    """
    import re  # local import: only needed to sanitize the filename

    try:
        # Initialize the client
        client = tweepy.Client(bearer_token)

        # Fetch recent tweets (free tier allows searching the last 7 days)
        response = client.search_recent_tweets(
            query=f"{query} -is:retweet",  # Search query, excluding retweets
            max_results=100,              # Max results per request (10-100)
            tweet_fields=["created_at", "public_metrics", "lang"]
        )

        if not response.data:
            print(f" No tweets found for the query: '{query}'")
            return

        # Prepare data for saving. Metrics are read with .get so one tweet
        # lacking a counter is stored with null for that field instead of
        # aborting the whole batch with a KeyError.
        tweets_to_save = []
        for tweet in response.data:
            metrics = tweet.public_metrics or {}
            tweets_to_save.append({
                'id': tweet.id,
                'text': tweet.text,
                'created_at': str(tweet.created_at),
                'retweet_count': metrics.get('retweet_count'),
                'reply_count': metrics.get('reply_count'),
                'like_count': metrics.get('like_count'),
                'impression_count': metrics.get('impression_count')
            })

        # Create the directory in Google Drive if it doesn't exist
        os.makedirs(folder_path, exist_ok=True)

        # Create a unique, timestamped filename. Anything other than word
        # characters, '.' and '-' becomes '_' so search operators such as
        # "from:user" or "a/b" cannot produce an invalid path; plain queries
        # keep the same name as before (spaces -> underscores).
        safe_query = re.sub(r'[^\w.-]', '_', query)
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        filename = f"{safe_query}_{timestamp}.json"
        filepath = os.path.join(folder_path, filename)

        # Save the data to the file (explicit encoding: do not depend on the
        # platform default)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(tweets_to_save, f, indent=4)

        print(f" Successfully collected {len(tweets_to_save)} tweets for '{query}'.")
        print(f"   Saved to: {filepath}")

    except Exception as e:
        # Deliberate catch-all: API errors (e.g. rate limits), network and
        # filesystem problems are reported instead of aborting the cell.
        print(f" An error occurred: {e}")

# --- Let's Run It! ---
# Define the path in your Google Drive where twitter data will be stored.
# This is the 'twitter' subfolder of the data directory that
# check_for_trends_and_alerts() scans later in the notebook.
twitter_save_path = '/content/drive/MyDrive/ProjectChimera/data/twitter'
# Define your search query (can be the same or different).
# Note: this rebinds the same `search_query` name used by the news cell.
search_query = "Artificial Intelligence"

# Uses twitter_bearer_token loaded in the "API keys" cell above.
collect_twitter_data(twitter_bearer_token, search_query, twitter_save_path)

 An error occurred: 429 Too Many Requests
Usage cap exceeded: Monthly product cap


## **Phase 2 : Building the Trend & Alert System**



 Step 1: Installations and Secure Setup

In [None]:
# Install VADER for fast, rule-based sentiment analysis
# It's great for this stage because it's simple and effective.
!pip install vaderSentiment

import os
import json
from datetime import datetime, timedelta
import requests
from google.colab import userdata
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# --- SECURE SETUP ---
# 1. Go to the key icon (🔑) in the Colab sidebar.
# 2. Add a new secret:
#    Name: SLACK_WEBHOOK_URL
#    Value: [Paste your Slack Webhook URL here]
# 3. Make sure "Notebook access" is toggled ON.

# Module-level webhook URL read by send_slack_alert(); that function refuses
# to send when this value is empty.
SLACK_URL = userdata.get('SLACK_WEBHOOK_URL')
# Printed unconditionally; it does not verify that the URL is non-empty.
print(" Setup complete. VADER installed and Slack URL loaded.")

 Setup complete. VADER installed and Slack URL loaded.


Next Step: Testing Your Alert System (optional)

In [None]:
# --- ALERTING PARAMETERS (Temporarily adjusted for testing!) ---
# NOTE: the "Step 2" cell assigns these same names with the regular values;
# whichever of the two cells ran last determines what the analysis uses.
# An alert fires when a keyword's count is strictly greater than its value.
KEYWORDS_TO_TRACK = {
    'funding': 3,      # Lowered from 5 to 3 so an alert fires more easily
    'partnership': 1,  # Lowered from 3 to 1 so an alert fires more easily
    'layoff': 0,       # Lowered from 2 to 0: any single mention alerts
}

# Raised from -0.1 to a positive value: the sentiment alert fires when the
# average score is BELOW this threshold, so a higher value is easier to hit.
SENTIMENT_THRESHOLD = 0.2
# --- HELPER FUNCTIONS ---

def send_slack_alert(message):
    """Sends a formatted message to our Slack channel.

    Posts ``{'text': message}`` as JSON to the webhook stored in the
    module-level ``SLACK_URL``. The call is best-effort: every outcome is
    printed and nothing is raised to the caller.

    Args:
        message: Text of the alert (Slack markdown is passed through as-is).

    Returns:
        None.
    """
    if not SLACK_URL:
        print(" Slack URL not found. Cannot send alert.")
        return
    payload = {'text': message}
    try:
        # A timeout keeps a stalled connection from hanging the notebook;
        # without one, requests waits indefinitely.
        response = requests.post(SLACK_URL, json=payload, timeout=10)
    except requests.RequestException as e:
        print(f" Error sending Slack alert: {e}")
        return

    if response.status_code == 200:
        print(" Slack alert sent successfully!")
    else:
        # Previously a rejected webhook call produced no output at all.
        print(f" Slack alert failed: HTTP {response.status_code} - {response.text}")

def analyze_sentiment_vader(text):
    """Analyzes sentiment of a text using VADER.

    Args:
        text: The string to score. Empty or ``None`` input is treated as
            neutral.

    Returns:
        The VADER 'compound' score, a single metric from -1 (most negative)
        to +1 (most positive); 0.0 for empty input.
    """
    if not text:
        return 0.0

    # Build the analyzer once and reuse it: constructing it on every call
    # repeats its setup work, and this function is called once per collected
    # item.
    analyzer = getattr(analyze_sentiment_vader, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        analyze_sentiment_vader._analyzer = analyzer

    return analyzer.polarity_scores(text)['compound']

Step 2: Define the Alerting Logic and Helper Functions

In [None]:
# --- ALERTING PARAMETERS (You can tune these!) ---
# Define keywords that signal important strategic events.
# Each value is the count a keyword must EXCEED (strictly greater than) for an
# alert. Counting is a substring match on the lower-cased text, so 'layoff'
# also counts occurrences inside 'layoffs'.
KEYWORDS_TO_TRACK = {
    'funding': 5,      # Alert if 'funding' is mentioned > 5 times
    'partnership': 3,  # Alert if 'partnership' is mentioned > 3 times
    'layoff': 2,       # Alert if 'layoff' is mentioned > 2 times
}

# Define a sentiment threshold
# We'll alert if the average sentiment drops below -0.1 (leaning negative).
# Scores are VADER compound values in the range -1 .. +1.
SENTIMENT_THRESHOLD = -0.1

# --- HELPER FUNCTIONS ---

def send_slack_alert(message):
    """Sends a formatted message to our Slack channel.

    Posts ``{'text': message}`` as JSON to the webhook stored in the
    module-level ``SLACK_URL``. The call is best-effort: every outcome is
    printed and nothing is raised to the caller.

    Args:
        message: Text of the alert (Slack markdown is passed through as-is).

    Returns:
        None.
    """
    if not SLACK_URL:
        print(" Slack URL not found. Cannot send alert.")
        return
    payload = {'text': message}
    try:
        # A timeout keeps a stalled connection from hanging the notebook;
        # without one, requests waits indefinitely.
        response = requests.post(SLACK_URL, json=payload, timeout=10)
    except requests.RequestException as e:
        print(f" Error sending Slack alert: {e}")
        return

    if response.status_code == 200:
        print(" Slack alert sent successfully!")
    else:
        # Previously a rejected webhook call produced no output at all.
        print(f" Slack alert failed: HTTP {response.status_code} - {response.text}")

def analyze_sentiment_vader(text):
    """Analyzes sentiment of a text using VADER.

    Args:
        text: The string to score. Empty or ``None`` input is treated as
            neutral.

    Returns:
        The VADER 'compound' score, a single metric from -1 (most negative)
        to +1 (most positive); 0.0 for empty input.
    """
    if not text:
        return 0.0

    # Build the analyzer once and reuse it: constructing it on every call
    # repeats its setup work, and this function is called once per collected
    # item.
    analyzer = getattr(analyze_sentiment_vader, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        analyze_sentiment_vader._analyzer = analyzer

    return analyzer.polarity_scores(text)['compound']

Step 3: The Main Function to Check for Trends and Send Alerts

In [None]:
def check_for_trends_and_alerts(data_root_path='/content/drive/MyDrive/ProjectChimera/data'):
    """
    Main function to process recent data and trigger alerts.

    Reads every JSON file under ``<data_root_path>/news`` and
    ``<data_root_path>/twitter`` whose filename timestamp falls within the
    last 24 hours, then:

    1. counts each keyword of ``KEYWORDS_TO_TRACK`` in the combined,
       lower-cased text and sends a Slack alert when a count exceeds its
       threshold;
    2. averages the VADER compound score over all items and sends a Slack
       alert when the average is below ``SENTIMENT_THRESHOLD``.

    Args:
        data_root_path: Directory containing the ``news`` and ``twitter``
            subfolders. Defaults to the project's Google Drive location.

    Returns:
        None. Results are printed and alerts are sent as side effects.
    """
    print("\n--- Starting Trend Analysis ---\n")

    # --- 1. Load Recent Data (from the last 24 hours) ---
    all_text = []
    yesterday = datetime.now() - timedelta(days=1)

    for data_type in ['news', 'twitter']:
        folder_path = os.path.join(data_root_path, data_type)
        if not os.path.exists(folder_path):
            continue

        for filename in os.listdir(folder_path):
            # Filenames look like "<query>_<YYYY-MM-DD>_<HH-MM-SS>.json";
            # the last two '_'-separated pieces carry the timestamp.
            parts = filename.split('_')
            if len(parts) < 2:
                continue
            file_timestamp_str = parts[-2] + "_" + parts[-1].split('.')[0]
            try:
                file_timestamp = datetime.strptime(file_timestamp_str, '%Y-%m-%d_%H-%M-%S')
            except ValueError:
                # Not one of our timestamped data files.
                continue

            # Only keep files created in the last 24 hours.
            if file_timestamp <= yesterday:
                continue

            filepath = os.path.join(folder_path, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers malformed JSON and undecodable bytes.
                print(f"  - Skipping unreadable file '{filename}': {e}")
                continue

            if not isinstance(data, list):
                print(f"  - Skipping '{filename}': expected a JSON list of items.")
                continue

            # Extract text from both news and twitter data structures.
            # Fields can be present but null (e.g. an article without a
            # description); `or ''` keeps such items instead of raising a
            # TypeError that used to drop the rest of the file silently.
            for item in data:
                if not isinstance(item, dict):
                    continue
                if data_type == 'news':
                    title = item.get('title') or ''
                    description = item.get('description') or ''
                    all_text.append(f"{title} {description}")
                else:
                    all_text.append(str(item.get('text') or ''))

    if not all_text:
        print("No new data from the last 24 hours to analyze.")
        return

    print(f"Found {len(all_text)} new items from the last 24 hours.")
    full_text_corpus = " ".join(all_text).lower()

    # --- 2. Analyze for Keyword Surges ---
    print("\n Analyzing for keyword surges...")
    for keyword, threshold in KEYWORDS_TO_TRACK.items():
        # Substring count on the lower-cased corpus (so 'layoff' also matches
        # inside 'layoffs').
        count = full_text_corpus.count(keyword)
        print(f"  - Found '{keyword}' {count} times (Threshold: {threshold})")
        if count > threshold:
            alert_message = (f"🚨 *Keyword Surge Alert!* 🚨\n"
                             f"> The keyword `*{keyword}*` was mentioned `{count}` times in the last 24 hours, "
                             f"exceeding the threshold of `{threshold}`.\n"
                             f"> This may indicate a significant market event.")
            send_slack_alert(alert_message)

    # --- 3. Analyze for Sentiment Shifts ---
    print("\n Analyzing for sentiment shifts...")
    total_sentiment_score = sum(analyze_sentiment_vader(text) for text in all_text)

    # all_text is guaranteed non-empty here because of the early return above.
    average_sentiment = total_sentiment_score / len(all_text)
    print(f"  - Average sentiment score: {average_sentiment:.4f} (Threshold: {SENTIMENT_THRESHOLD})")

    if average_sentiment < SENTIMENT_THRESHOLD:
        alert_message = (f" *Negative Sentiment Alert!* \n"
                         f"> The average sentiment score in the last 24 hours was `{average_sentiment:.4f}`, "
                         f"which is below the threshold of `{SENTIMENT_THRESHOLD}`.\n"
                         f"> This may indicate negative public perception or bad news.")
        send_slack_alert(alert_message)

    print("\n--- Analysis Complete ---")

# --- Let's Run It! ---
# This will perform the check on the data you've collected.
# It uses whichever KEYWORDS_TO_TRACK / SENTIMENT_THRESHOLD values were
# assigned by the most recently executed parameters cell.
check_for_trends_and_alerts()


--- Starting Trend Analysis ---

Found 397 new items from the last 24 hours.

 Analyzing for keyword surges...
  - Found 'funding' 4 times (Threshold: 3)
 Slack alert sent successfully!
  - Found 'partnership' 2 times (Threshold: 1)
 Slack alert sent successfully!
  - Found 'layoff' 0 times (Threshold: 0)

 Analyzing for sentiment shifts...
  - Average sentiment score: 0.2646 (Threshold: 0.2)

--- Analysis Complete ---
