<a href="https://colab.research.google.com/github/TejasVijaya74/Project-Infy-Chimera/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Phase 1: Initial environment setup and data collection pipeline**

Step 1: Install Libraries & Mount Google Drive

In [1]:
# Install required Python libraries
# newsapi-python: A simple Python client for the News API
# tweepy: A Python client for the X (formerly Twitter) API
# pandas: Useful for data handling and viewing (optional; not used by the cells shown here)
!pip install newsapi-python tweepy pandas

# Import necessary libraries
import os
import json
from datetime import datetime
from google.colab import drive
from google.colab import userdata # For securely accessing API keys

# Mount your Google Drive to the Colab environment
# This will prompt you for authorization the first time you run it.
# The save paths used later in this notebook live under /content/drive/MyDrive,
# so this cell has to run before any of the collection cells.
drive.mount('/content/drive')

# Confirmation message; it is printed unconditionally once mount() has returned.
print("Libraries installed and Google Drive mounted successfully.")

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: newsapi-python
Successfully installed newsapi-python-0.2.7
Mounted at /content/drive
Libraries installed and Google Drive mounted successfully.


Step 2: Securely Handling Your API Keys

In [2]:
# Access the API keys from Colab's secret manager.
# userdata.get() raises (instead of returning an empty value) when a secret
# does not exist or the notebook has not been granted access to it, so each
# lookup is guarded; otherwise the cell would crash before the check below
# could print its hint.
def _read_colab_secret(secret_name):
    """Return the named Colab secret, or None when it cannot be read."""
    try:
        return userdata.get(secret_name)
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as err:
        print(f"WARNING: could not read secret '{secret_name}': {err}")
        return None


news_api_key = _read_colab_secret('NEWS_API_KEY')
twitter_bearer_token = _read_colab_secret('TWITTER_BEARER_TOKEN')

# A quick check to ensure keys are loaded (also catches empty secret values)
if news_api_key and twitter_bearer_token:
    print("API keys loaded successfully.")
else:
    print("ERROR: Could not find API keys. Please check your Colab Secrets settings.")

API keys loaded successfully.


Step 3: Data Collection from News API

In [3]:
from newsapi import NewsApiClient

def collect_google_news_data(api_key, query, folder_path):
    """
    Collects news data for a given query and saves it to a timestamped JSON file.

    Args:
        api_key: Key passed to NewsApiClient.
        query: Search phrase sent to the API. A sanitized copy of it is used
            as the filename prefix.
        folder_path: Directory the JSON file is written to; created if missing.

    Returns:
        The path of the written JSON file, or None if any step failed
        (the error is printed rather than raised).
    """
    try:
        # Initialize the client
        newsapi = NewsApiClient(api_key=api_key)

        # Fetch the articles
        all_articles = newsapi.get_everything(
            q=query,
            language='en',
            sort_by='relevancy'  # Options: relevancy, popularity, publishedAt
        )
        articles = all_articles['articles']

        # Create the directory in Google Drive if it doesn't exist
        os.makedirs(folder_path, exist_ok=True)

        # Queries may contain characters that are unsafe in filenames
        # (e.g. '/', ':', quotes); replace everything except letters, digits,
        # '-' and '_' so the file always lands directly inside folder_path.
        # Plain queries such as "Artificial Intelligence" map to the same
        # name as before ("Artificial_Intelligence").
        safe_query = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in query
        )

        # Create a unique, timestamped filename
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        filename = f"{safe_query}_{timestamp}.json"
        filepath = os.path.join(folder_path, filename)

        # Save the data to the file (explicit encoding so the result does not
        # depend on the platform default)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=4)

        print(f" Successfully collected {len(articles)} articles for '{query}'.")
        print(f"   Saved to: {filepath}")
        return filepath

    except Exception as e:
        # Notebook-level boundary: report the problem instead of aborting the cell.
        print(f" An error occurred: {e}")
        return None

# --- Let's Run It! ---
# Define the path in your Google Drive where news data will be stored
news_save_path = '/content/drive/MyDrive/ProjectChimera/data/news'
# Define your search query
search_query = "Artificial Intelligence"

collect_google_news_data(news_api_key, search_query, news_save_path)

 Successfully collected 100 articles for 'Artificial Intelligence'.
   Saved to: /content/drive/MyDrive/ProjectChimera/data/news/Artificial_Intelligence_2025-09-08_13-31-20.json


Step 4: Data Collection from X (Twitter) API

In [4]:
import tweepy

def collect_twitter_data(bearer_token, query, folder_path):
    """
    Collects recent tweets for a given query and saves them to a timestamped JSON file.

    Args:
        bearer_token: Bearer token passed to tweepy.Client.
        query: Search phrase; "-is:retweet" is appended to exclude retweets.
            A sanitized copy of it is used as the filename prefix.
        folder_path: Directory the JSON file is written to; created if missing.

    Returns:
        The path of the written JSON file, or None when no tweets were found
        or any step failed (the error is printed rather than raised).
    """
    try:
        # Initialize the client
        client = tweepy.Client(bearer_token)

        # Fetch recent tweets (the recent-search endpoint only looks back a few days)
        response = client.search_recent_tweets(
            query=f"{query} -is:retweet",  # Search query, excluding retweets
            max_results=100,              # Max results per request (10-100)
            tweet_fields=["created_at", "public_metrics", "lang"]
        )

        if not response.data:
            print(f" No tweets found for the query: '{query}'")
            return None

        # Prepare data for saving
        tweets_to_save = []
        for tweet in response.data:
            # .get() keeps one missing metric from aborting the whole batch;
            # an absent value is stored as null in the JSON output.
            metrics = tweet.public_metrics or {}
            tweets_to_save.append({
                'id': tweet.id,
                'text': tweet.text,
                'created_at': str(tweet.created_at),
                # 'lang' is requested in tweet_fields above, so persist it too.
                'lang': tweet.lang,
                'retweet_count': metrics.get('retweet_count'),
                'reply_count': metrics.get('reply_count'),
                'like_count': metrics.get('like_count'),
                'impression_count': metrics.get('impression_count'),
            })

        # Create the directory in Google Drive if it doesn't exist
        os.makedirs(folder_path, exist_ok=True)

        # Search queries often contain operators and punctuation (':', '"',
        # '#', '/') that are unsafe in filenames; keep only letters, digits,
        # '-' and '_'. Plain queries map to the same name as before.
        safe_query = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in query
        )

        # Create a unique, timestamped filename
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        filename = f"{safe_query}_{timestamp}.json"
        filepath = os.path.join(folder_path, filename)

        # Save the data to the file (explicit encoding so the result does not
        # depend on the platform default)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(tweets_to_save, f, indent=4)

        print(f" Successfully collected {len(tweets_to_save)} tweets for '{query}'.")
        print(f"   Saved to: {filepath}")
        return filepath

    except Exception as e:
        # Notebook-level boundary: report the problem (e.g. an HTTP 429 rate
        # limit response) instead of aborting the cell.
        print(f" An error occurred: {e}")
        return None

# --- Let's Run It! ---
# Define the path in your Google Drive where twitter data will be stored
# (collect_twitter_data creates this folder if it does not exist yet)
twitter_save_path = '/content/drive/MyDrive/ProjectChimera/data/twitter'
# Define your search query (can be the same or different)
# NOTE: this rebinds the same notebook-level name used by the news cell.
search_query = "Artificial Intelligence"

# Search recent tweets and write them to a timestamped JSON file under twitter_save_path
collect_twitter_data(twitter_bearer_token, search_query, twitter_save_path)

 An error occurred: 429 Too Many Requests
Usage cap exceeded: Monthly product cap
