In [1]:
import requests
import json
import time
from datetime import datetime, timedelta
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from typing import Dict, List, Optional
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class AuraDataCollector:
    def __init__(self):
        # API Keys - Replace with your actual keys
        self.news_api_key = "47e7597b3e51433a81fcc3d908b1a792"
        self.weather_api_key = "5a5c9571ba254b1c90b152042243103"
        self.twitter_api_key = "984d78e1femsh2af122397b07360p15059bjsn900a6a3b90fa"
        
        # Initialize sentiment analyzer
        self.analyzer = SentimentIntensityAnalyzer()
        
        # Indian cities configuration
        self.indian_cities = {
            "Mumbai": {"lat": 19.0760, "lon": 72.8777, "keywords": ["mumbai", "bombay"]},
            "Delhi": {"lat": 28.7041, "lon": 77.1025, "keywords": ["delhi", "new delhi"]},
            "Bangalore": {"lat": 12.9716, "lon": 77.5946, "keywords": ["bangalore", "bengaluru"]},
            "Chennai": {"lat": 13.0827, "lon": 80.2707, "keywords": ["chennai", "madras"]},
            "Kolkata": {"lat": 22.5726, "lon": 88.3639, "keywords": ["kolkata", "calcutta"]},
            "Pune": {"lat": 18.5204, "lon": 73.8567, "keywords": ["pune"]},
            "Hyderabad": {"lat": 17.3850, "lon": 78.4867, "keywords": ["hyderabad"]},
            "Ahmedabad": {"lat": 23.0225, "lon": 72.5714, "keywords": ["ahmedabad"]}
        }
        
        # Initialize data storage
        self.collected_data = []

    def get_weather_data(self, city: str) -> Dict:
        """Fetch current weather data for a city"""
        try:
            city_info = self.indian_cities[city]
            url = f"http://api.weatherapi.com/v1/current.json"
            params = {
                'key': self.weather_api_key,
                'q': f"{city_info['lat']},{city_info['lon']}",
                'aqi': 'yes'
            }
            
            response = requests.get(url, params=params)
            response.raise_for_status()
            
            data = response.json()
            
            # Extract relevant weather info
            weather_info = {
                'city': city,
                'temperature_c': data['current']['temp_c'],
                'condition': data['current']['condition']['text'],
                'humidity': data['current']['humidity'],
                'wind_kph': data['current']['wind_kph'],
                'air_quality_pm25': data['current'].get('air_quality', {}).get('pm2_5', 0),
                'timestamp': datetime.now().isoformat()
            }
            
            logger.info(f"Weather data collected for {city}")
            return weather_info
            
        except Exception as e:
            logger.error(f"Error fetching weather for {city}: {e}")
            return {}

    def get_news_data(self, city: str) -> List[Dict]:
        """Fetch news data related to a city"""
        try:
            news_items = []
            city_keywords = self.indian_cities[city]["keywords"]
            
            # Search for city-specific news
            for keyword in city_keywords:
                url = "https://newsapi.org/v2/everything"
                params = {
                    'apiKey': self.news_api_key,
                    'q': f"{keyword} AND india",
                    'language': 'en',
                    'sortBy': 'publishedAt',
                    'pageSize': 20,
                    'from': (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
                }
                
                response = requests.get(url, params=params)
                response.raise_for_status()
                
                data = response.json()
                
                for article in data.get('articles', []):
                    # Analyze sentiment of headline and description
                    text_to_analyze = f"{article.get('title', '')} {article.get('description', '')}"
                    sentiment_scores = self.analyzer.polarity_scores(text_to_analyze)
                    
                    news_item = {
                        'city': city,
                        'source': 'news',
                        'title': article.get('title', ''),
                        'description': article.get('description', ''),
                        'url': article.get('url', ''),
                        'published_at': article.get('publishedAt', ''),
                        'sentiment_compound': sentiment_scores['compound'],
                        'sentiment_positive': sentiment_scores['pos'],
                        'sentiment_negative': sentiment_scores['neg'],
                        'sentiment_neutral': sentiment_scores['neu'],
                        'timestamp': datetime.now().isoformat()
                    }
                    news_items.append(news_item)
                
                # Rate limiting
                time.sleep(1)
            
            logger.info(f"News data collected for {city}: {len(news_items)} articles")
            return news_items
            
        except Exception as e:
            logger.error(f"Error fetching news for {city}: {e}")
            return []

    def get_twitter_data(self, city: str) -> List[Dict]:
        """Fetch Twitter data for a city using RapidAPI"""
        try:
            tweets = []
            city_keywords = self.indian_cities[city]["keywords"]
            
            # Search for tweets related to the city
            for keyword in city_keywords:
                url = "https://twitter-api45.p.rapidapi.com/search.php"
                querystring = {
                    "query": f"{keyword} india",
                    "search_type": "Top"
                }
                
                headers = {
                    'x-rapidapi-key': self.twitter_api_key,
                    'x-rapidapi-host': "twitter-api45.p.rapidapi.com"
                }
                
                response = requests.get(url, headers=headers, params=querystring)
                response.raise_for_status()
                
                data = response.json()
                
                # Process tweets (structure may vary based on API response)
                if 'timeline' in data:
                    for tweet in data['timeline'][:20]:  # Limit to 20 tweets per keyword
                        tweet_text = tweet.get('text', '')
                        
                        # Clean tweet text
                        cleaned_text = self.clean_text(tweet_text)
                        
                        # Analyze sentiment
                        sentiment_scores = self.analyzer.polarity_scores(cleaned_text)
                        
                        tweet_item = {
                            'city': city,
                            'source': 'twitter',
                            'text': cleaned_text,
                            'user_followers': tweet.get('user', {}).get('followers_count', 0),
                            'retweet_count': tweet.get('retweet_count', 0),
                            'like_count': tweet.get('favorite_count', 0),
                            'created_at': tweet.get('created_at', ''),
                            'sentiment_compound': sentiment_scores['compound'],
                            'sentiment_positive': sentiment_scores['pos'],
                            'sentiment_negative': sentiment_scores['neg'],
                            'sentiment_neutral': sentiment_scores['neu'],
                            'timestamp': datetime.now().isoformat()
                        }
                        tweets.append(tweet_item)
                
                # Rate limiting for API calls
                time.sleep(2)
            
            logger.info(f"Twitter data collected for {city}: {len(tweets)} tweets")
            return tweets
            
        except Exception as e:
            logger.error(f"Error fetching Twitter data for {city}: {e}")
            return []

    def clean_text(self, text: str) -> str:
        """Clean text for sentiment analysis"""
        if not text:
            return ""
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # Remove user mentions and hashtags (keep the text after #)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#(\w+)', r'\1', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        return text.strip()

    def calculate_city_mood(self, city_data: List[Dict]) -> Dict:
        """Calculate overall mood metrics for a city"""
        if not city_data:
            return {}
        
        sentiments = [item['sentiment_compound'] for item in city_data if 'sentiment_compound' in item]
        
        if not sentiments:
            return {}
        
        # Calculate mood metrics
        avg_sentiment = sum(sentiments) / len(sentiments)
        positive_ratio = len([s for s in sentiments if s > 0.1]) / len(sentiments)
        negative_ratio = len([s for s in sentiments if s < -0.1]) / len(sentiments)
        neutral_ratio = len([s for s in sentiments if -0.1 <= s <= 0.1]) / len(sentiments)
        
        # Determine overall mood
        if avg_sentiment > 0.3:
            mood = "Very Positive"
        elif avg_sentiment > 0.1:
            mood = "Positive" 
        elif avg_sentiment > -0.1:
            mood = "Neutral"
        elif avg_sentiment > -0.3:
            mood = "Negative"
        else:
            mood = "Very Negative"
        
        return {
            'avg_sentiment': round(avg_sentiment, 3),
            'mood_label': mood,
            'positive_ratio': round(positive_ratio, 3),
            'negative_ratio': round(negative_ratio, 3),
            'neutral_ratio': round(neutral_ratio, 3),
            'total_items': len(city_data),
            'confidence_score': min(len(city_data) / 50, 1.0)  # Confidence based on data volume
        }

    def collect_daily_data(self, cities: List[str] = None) -> Dict:
        """Collect data for specified cities for the current day"""
        if cities is None:
            cities = list(self.indian_cities.keys())[:5]  # Default to top 5 cities
        
        daily_data = {}
        
        for city in cities:
            logger.info(f"Collecting data for {city}...")
            
            city_data = []
            
            # Collect weather data
            weather_data = self.get_weather_data(city)
            if weather_data:
                city_data.append(weather_data)
            
            # Collect news data
            news_data = self.get_news_data(city)
            city_data.extend(news_data)
            
            # Collect Twitter data
            twitter_data = self.get_twitter_data(city)
            city_data.extend(twitter_data)
            
            # Calculate city mood
            mood_metrics = self.calculate_city_mood([item for item in city_data if 'sentiment_compound' in item])
            
            daily_data[city] = {
                'weather': weather_data,
                'news_items': len(news_data),
                'twitter_items': len(twitter_data),
                'mood_metrics': mood_metrics,
                'raw_data': city_data,
                'collection_timestamp': datetime.now().isoformat()
            }
            
            # Add delay between cities to respect rate limits
            time.sleep(3)
        
        return daily_data

    def save_data(self, data: Dict, filename: str = None):
        """Save collected data to JSON file"""
        if filename is None:
            filename = f"aura_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        
        logger.info(f"Data saved to {filename}")

    def generate_summary_report(self, data: Dict) -> str:
        """Generate a human-readable summary of collected data"""
        report = []
        report.append("=== AURA.AI DAILY MOOD REPORT ===")
        report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append("")
        
        for city, city_data in data.items():
            mood = city_data.get('mood_metrics', {})
            weather = city_data.get('weather', {})
            
            report.append(f"🏙️ {city.upper()}")
            report.append(f"   Mood: {mood.get('mood_label', 'Unknown')} ({mood.get('avg_sentiment', 0):.3f})")
            report.append(f"   Weather: {weather.get('condition', 'N/A')} - {weather.get('temperature_c', 'N/A')}°C")
            report.append(f"   Data Points: {city_data.get('news_items', 0)} news + {city_data.get('twitter_items', 0)} tweets")
            report.append(f"   Confidence: {mood.get('confidence_score', 0):.0%}")
            report.append("")
        
        return "\n".join(report)

# Example usage
if __name__ == "__main__":
    # Initialize collector
    collector = AuraDataCollector()
    
    # Collect data for top 3 cities (to start small)
    cities_to_collect = ["Mumbai", "Delhi", "Bangalore"]
    
    print("Starting Aura.AI data collection...")
    daily_data = collector.collect_daily_data(cities_to_collect)
    
    # Save data
    collector.save_data(daily_data)
    
    # Generate and print summary
    summary = collector.generate_summary_report(daily_data)
    print(summary)
    
    # Save summary
    with open(f"daily_summary_{datetime.now().strftime('%Y%m%d')}.txt", 'w') as f:
        f.write(summary)

INFO:__main__:Collecting data for Mumbai...


Starting Aura.AI data collection...


INFO:__main__:Weather data collected for Mumbai
INFO:__main__:News data collected for Mumbai: 34 articles
INFO:__main__:Twitter data collected for Mumbai: 35 tweets
INFO:__main__:Collecting data for Delhi...
INFO:__main__:Weather data collected for Delhi
INFO:__main__:News data collected for Delhi: 40 articles
INFO:__main__:Twitter data collected for Delhi: 32 tweets
INFO:__main__:Collecting data for Bangalore...
INFO:__main__:Weather data collected for Bangalore
INFO:__main__:News data collected for Bangalore: 27 articles
INFO:__main__:Twitter data collected for Bangalore: 35 tweets
INFO:__main__:Data saved to aura_data_20250813_234214.json


=== AURA.AI DAILY MOOD REPORT ===
Generated: 2025-08-13 23:42:14

🏙️ MUMBAI
   Mood: Very Positive (0.312)
   Weather: Mist - 26.2°C
   Data Points: 34 news + 35 tweets
   Confidence: 100%

🏙️ DELHI
   Mood: Positive (0.212)
   Weather: Mist - 31.3°C
   Data Points: 40 news + 32 tweets
   Confidence: 100%

🏙️ BANGALORE
   Mood: Positive (0.239)
   Weather: Partly cloudy - 21.4°C
   Data Points: 27 news + 35 tweets
   Confidence: 100%



UnicodeEncodeError: 'charmap' codec can't encode characters in position 69-70: character maps to <undefined>

In [2]:
import requests
import json
import time
from datetime import datetime, timedelta
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from typing import Dict, List, Optional
import logging
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class AuraDataCollector:
    def __init__(self):
        # API Keys - Replace with your actual keys
        self.news_api_key = "47e7597b3e51433a81fcc3d908b1a792"
        self.weather_api_key = "5a5c9571ba254b1c90b152042243103"
        self.twitter_api_key = "984d78e1femsh2af122397b07360p15059bjsn900a6a3b90fa"
        
        # Initialize sentiment analyzer
        self.analyzer = SentimentIntensityAnalyzer()
        
        # Top 5 Indian cities for focused data collection
        self.indian_cities = {
            "Mumbai": {
                "lat": 19.0760, "lon": 72.8777, 
                "keywords": ["mumbai", "bombay", "maharashtra"],
                "population": 12442373,
                "region": "Western India"
            },
            "Delhi": {
                "lat": 28.7041, "lon": 77.1025, 
                "keywords": ["delhi", "new delhi", "ncr"],
                "population": 11007835,
                "region": "Northern India"
            },
            "Bangalore": {
                "lat": 12.9716, "lon": 77.5946, 
                "keywords": ["bangalore", "bengaluru", "karnataka"],
                "population": 8443675,
                "region": "Southern India"
            },
            "Chennai": {
                "lat": 13.0827, "lon": 80.2707, 
                "keywords": ["chennai", "madras", "tamil nadu"],
                "population": 4646732,
                "region": "Southern India"
            },
            "Kolkata": {
                "lat": 22.5726, "lon": 88.3639, 
                "keywords": ["kolkata", "calcutta", "west bengal"],
                "population": 4496694,
                "region": "Eastern India"
            }
        }
        
        # Initialize data storage
        self.collected_data = []

    def get_weather_data(self, city: str) -> Dict:
        """Fetch current weather data for a city"""
        try:
            city_info = self.indian_cities[city]
            url = f"http://api.weatherapi.com/v1/current.json"
            params = {
                'key': self.weather_api_key,
                'q': f"{city_info['lat']},{city_info['lon']}",
                'aqi': 'yes'
            }
            
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            
            data = response.json()
            
            # Extract relevant weather info
            weather_info = {
                'city': city,
                'temperature_c': data['current']['temp_c'],
                'condition': data['current']['condition']['text'],
                'humidity': data['current']['humidity'],
                'wind_kph': data['current']['wind_kph'],
                'feels_like_c': data['current']['feelslike_c'],
                'uv_index': data['current']['uv'],
                'air_quality_pm25': data['current'].get('air_quality', {}).get('pm2_5', 0),
                'air_quality_pm10': data['current'].get('air_quality', {}).get('pm10', 0),
                'timestamp': datetime.now().isoformat(),
                'data_source': 'weather_api'
            }
            
            logger.info(f"✅ Weather data collected for {city}: {weather_info['condition']}, {weather_info['temperature_c']}°C")
            return weather_info
            
        except requests.exceptions.RequestException as e:
            logger.error(f"❌ Network error fetching weather for {city}: {e}")
            return {}
        except Exception as e:
            logger.error(f"❌ Error fetching weather for {city}: {e}")
            return {}

    def get_news_data(self, city: str) -> List[Dict]:
        """Fetch news data related to a city"""
        try:
            news_items = []
            city_keywords = self.indian_cities[city]["keywords"]
            
            # Search for city-specific news
            for keyword in city_keywords:
                url = "https://newsapi.org/v2/everything"
                params = {
                    'apiKey': self.news_api_key,
                    'q': f"{keyword} AND india",
                    'language': 'en',
                    'sortBy': 'publishedAt',
                    'pageSize': 15,  # Reduced to stay within limits
                    'from': (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
                }
                
                response = requests.get(url, params=params, timeout=15)
                response.raise_for_status()
                
                data = response.json()
                
                for article in data.get('articles', []):
                    # Skip articles with null/empty content
                    title = article.get('title', '') or ''
                    description = article.get('description', '') or ''
                    
                    if not title and not description:
                        continue
                    
                    # Analyze sentiment of headline and description
                    text_to_analyze = f"{title} {description}"
                    sentiment_scores = self.analyzer.polarity_scores(text_to_analyze)
                    
                    news_item = {
                        'city': city,
                        'source': 'news',
                        'title': title,
                        'description': description,
                        'url': article.get('url', ''),
                        'source_name': article.get('source', {}).get('name', ''),
                        'published_at': article.get('publishedAt', ''),
                        'sentiment_compound': sentiment_scores['compound'],
                        'sentiment_positive': sentiment_scores['pos'],
                        'sentiment_negative': sentiment_scores['neg'],
                        'sentiment_neutral': sentiment_scores['neu'],
                        'timestamp': datetime.now().isoformat(),
                        'data_source': 'news_api'
                    }
                    news_items.append(news_item)
                
                # Rate limiting - respect API limits
                time.sleep(2)
            
            logger.info(f"✅ News data collected for {city}: {len(news_items)} articles")
            return news_items
            
        except requests.exceptions.RequestException as e:
            logger.error(f"❌ Network error fetching news for {city}: {e}")
            return []
        except Exception as e:
            logger.error(f"❌ Error fetching news for {city}: {e}")
            return []

    def get_twitter_data(self, city: str) -> List[Dict]:
        """Fetch Twitter data for a city using RapidAPI"""
        try:
            tweets = []
            city_keywords = self.indian_cities[city]["keywords"]
            
            # Search for tweets related to the city
            for keyword in city_keywords[:2]:  # Limit to 2 keywords to reduce API calls
                url = "https://twitter-api45.p.rapidapi.com/search.php"
                querystring = {
                    "query": f"{keyword} india mood weather life",
                    "search_type": "Top"
                }
                
                headers = {
                    'x-rapidapi-key': self.twitter_api_key,
                    'x-rapidapi-host': "twitter-api45.p.rapidapi.com"
                }
                
                response = requests.get(url, headers=headers, params=querystring, timeout=15)
                response.raise_for_status()
                
                data = response.json()
                
                # Process tweets (structure may vary based on API response)
                if isinstance(data, dict) and 'timeline' in data:
                    for tweet in data['timeline'][:10]:  # Limit to 10 tweets per keyword
                        tweet_text = tweet.get('text', '') or ''
                        
                        if not tweet_text:
                            continue
                        
                        # Clean tweet text
                        cleaned_text = self.clean_text(tweet_text)
                        
                        if len(cleaned_text) < 10:  # Skip very short tweets
                            continue
                        
                        # Analyze sentiment
                        sentiment_scores = self.analyzer.polarity_scores(cleaned_text)
                        
                        tweet_item = {
                            'city': city,
                            'source': 'twitter',
                            'text': cleaned_text,
                            'user_followers': tweet.get('user', {}).get('followers_count', 0) if tweet.get('user') else 0,
                            'retweet_count': tweet.get('retweet_count', 0),
                            'like_count': tweet.get('favorite_count', 0),
                            'created_at': tweet.get('created_at', ''),
                            'sentiment_compound': sentiment_scores['compound'],
                            'sentiment_positive': sentiment_scores['pos'],
                            'sentiment_negative': sentiment_scores['neg'],
                            'sentiment_neutral': sentiment_scores['neu'],
                            'timestamp': datetime.now().isoformat(),
                            'data_source': 'twitter_api'
                        }
                        tweets.append(tweet_item)
                
                # Rate limiting for API calls
                time.sleep(3)
            
            logger.info(f"✅ Twitter data collected for {city}: {len(tweets)} tweets")
            return tweets
            
        except requests.exceptions.RequestException as e:
            logger.error(f"❌ Network error fetching Twitter data for {city}: {e}")
            return []
        except Exception as e:
            logger.error(f"❌ Error fetching Twitter data for {city}: {e}")
            return []

    def clean_text(self, text: str) -> str:
        """Clean text for sentiment analysis"""
        if not text:
            return ""
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # Remove user mentions and hashtags (keep the text after #)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#(\w+)', r'\1', text)
        
        # Remove extra whitespace and special characters
        text = re.sub(r'\n+', ' ', text)
        text = ' '.join(text.split())
        
        return text.strip()

    def calculate_city_mood(self, city_data: List[Dict]) -> Dict:
        """Calculate overall mood metrics for a city"""
        if not city_data:
            return {
                'avg_sentiment': 0,
                'mood_label': 'No Data',
                'positive_ratio': 0,
                'negative_ratio': 0,
                'neutral_ratio': 0,
                'total_items': 0,
                'confidence_score': 0
            }
        
        sentiments = [item['sentiment_compound'] for item in city_data if 'sentiment_compound' in item]
        
        if not sentiments:
            return {
                'avg_sentiment': 0,
                'mood_label': 'No Sentiment Data',
                'positive_ratio': 0,
                'negative_ratio': 0,
                'neutral_ratio': 0,
                'total_items': len(city_data),
                'confidence_score': 0
            }
        
        # Calculate mood metrics
        avg_sentiment = sum(sentiments) / len(sentiments)
        positive_ratio = len([s for s in sentiments if s > 0.1]) / len(sentiments)
        negative_ratio = len([s for s in sentiments if s < -0.1]) / len(sentiments)
        neutral_ratio = len([s for s in sentiments if -0.1 <= s <= 0.1]) / len(sentiments)
        
        # Determine overall mood
        if avg_sentiment > 0.3:
            mood = "Very Positive"
        elif avg_sentiment > 0.1:
            mood = "Positive" 
        elif avg_sentiment > -0.1:
            mood = "Neutral"
        elif avg_sentiment > -0.3:
            mood = "Negative"
        else:
            mood = "Very Negative"
        
        return {
            'avg_sentiment': round(avg_sentiment, 3),
            'mood_label': mood,
            'positive_ratio': round(positive_ratio, 3),
            'negative_ratio': round(negative_ratio, 3),
            'neutral_ratio': round(neutral_ratio, 3),
            'total_items': len(city_data),
            'confidence_score': round(min(len(sentiments) / 30, 1.0), 2)  # Confidence based on data volume
        }

    def collect_daily_data(self, cities: List[str] = None) -> Dict:
        """Collect data for all 5 major Indian cities"""
        if cities is None:
            cities = list(self.indian_cities.keys())  # All 5 cities
        
        daily_data = {}
        total_start_time = time.time()
        
        logger.info(f"🚀 Starting data collection for {len(cities)} cities: {', '.join(cities)}")
        
        for i, city in enumerate(cities, 1):
            city_start_time = time.time()
            logger.info(f"📍 [{i}/{len(cities)}] Collecting data for {city}...")
            
            city_data = []
            
            # Collect weather data
            weather_data = self.get_weather_data(city)
            if weather_data:
                city_data.append(weather_data)
            
            # Collect news data
            news_data = self.get_news_data(city)
            city_data.extend(news_data)
            
            # Collect Twitter data
            twitter_data = self.get_twitter_data(city)
            city_data.extend(twitter_data)
            
            # Calculate city mood
            mood_metrics = self.calculate_city_mood([item for item in city_data if 'sentiment_compound' in item])
            
            city_elapsed = time.time() - city_start_time
            
            daily_data[city] = {
                'city_info': self.indian_cities[city],
                'weather': weather_data,
                'news_count': len(news_data),
                'twitter_count': len(twitter_data),
                'mood_metrics': mood_metrics,
                'raw_data': city_data,
                'collection_time_seconds': round(city_elapsed, 2),
                'collection_timestamp': datetime.now().isoformat()
            }
            
            logger.info(f"✅ {city} completed in {city_elapsed:.1f}s - Mood: {mood_metrics['mood_label']} ({mood_metrics['avg_sentiment']})")
            
            # Add delay between cities to respect rate limits
            if i < len(cities):  # Don't wait after the last city
                logger.info("⏳ Waiting 5 seconds before next city...")
                time.sleep(5)
        
        total_elapsed = time.time() - total_start_time
        logger.info(f"🎉 Data collection completed for all cities in {total_elapsed:.1f}s")
        
        return daily_data

    def save_data(self, data: Dict, filename: str = None):
        """Save collected data to JSON file with proper UTF-8 encoding"""
        if filename is None:
            filename = f"aura_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        
        try:
            # Ensure directory exists
            Path(filename).parent.mkdir(parents=True, exist_ok=True)
            
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            
            logger.info(f"💾 Data saved to {filename}")
            return filename
        except Exception as e:
            logger.error(f"❌ Error saving data: {e}")
            return None

    def save_csv_summary(self, data: Dict, filename: str = None):
        """Save a CSV summary of the collected data"""
        if filename is None:
            filename = f"aura_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        
        try:
            summary_data = []
            for city, city_data in data.items():
                mood = city_data.get('mood_metrics', {})
                weather = city_data.get('weather', {})
                
                summary_data.append({
                    'city': city,
                    'region': city_data.get('city_info', {}).get('region', ''),
                    'mood_label': mood.get('mood_label', ''),
                    'avg_sentiment': mood.get('avg_sentiment', 0),
                    'positive_ratio': mood.get('positive_ratio', 0),
                    'negative_ratio': mood.get('negative_ratio', 0),
                    'temperature_c': weather.get('temperature_c', 0),
                    'weather_condition': weather.get('condition', ''),
                    'humidity': weather.get('humidity', 0),
                    'air_quality_pm25': weather.get('air_quality_pm25', 0),
                    'news_count': city_data.get('news_count', 0),
                    'twitter_count': city_data.get('twitter_count', 0),
                    'confidence_score': mood.get('confidence_score', 0),
                    'collection_time': city_data.get('collection_timestamp', '')
                })
            
            df = pd.DataFrame(summary_data)
            df.to_csv(filename, index=False, encoding='utf-8')
            logger.info(f"📊 CSV summary saved to {filename}")
            return filename
        except Exception as e:
            logger.error(f"❌ Error saving CSV: {e}")
            return None

    def generate_summary_report(self, data: Dict) -> str:
        """Generate a human-readable summary of collected data"""
        report = []
        report.append("=" * 50)
        report.append("    AURA.AI DAILY MOOD REPORT - INDIA")
        report.append("=" * 50)
        report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S IST')}")
        report.append("")
        
        # Overall statistics
        total_news = sum(city_data.get('news_count', 0) for city_data in data.values())
        total_tweets = sum(city_data.get('twitter_count', 0) for city_data in data.values())
        
        report.append(f"📊 OVERALL STATISTICS:")
        report.append(f"   Total Cities Analyzed: {len(data)}")
        report.append(f"   Total News Articles: {total_news}")
        report.append(f"   Total Tweets: {total_tweets}")
        report.append("")
        
        # City-wise breakdown
        report.append("🏙️ CITY-WISE MOOD BREAKDOWN:")
        report.append("-" * 50)
        
        for city, city_data in data.items():
            mood = city_data.get('mood_metrics', {})
            weather = city_data.get('weather', {})
            
            report.append(f"📍 {city.upper()} ({city_data.get('city_info', {}).get('region', '')})")
            report.append(f"   Mood: {mood.get('mood_label', 'Unknown')} (Score: {mood.get('avg_sentiment', 0):.3f})")
            report.append(f"   Weather: {weather.get('condition', 'N/A')} - {weather.get('temperature_c', 'N/A')}°C")
            report.append(f"   Feels Like: {weather.get('feels_like_c', 'N/A')}°C | Humidity: {weather.get('humidity', 'N/A')}%")
            report.append(f"   Air Quality PM2.5: {weather.get('air_quality_pm25', 'N/A')}")
            report.append(f"   Data: {city_data.get('news_count', 0)} news + {city_data.get('twitter_count', 0)} tweets")
            report.append(f"   Confidence: {mood.get('confidence_score', 0):.0%}")
            report.append("")
        
        return "\n".join(report)

# Example usage and main execution
if __name__ == "__main__":
    # Initialize collector
    collector = AuraDataCollector()
    
    print("🚀 Starting Aura.AI Data Collection for Top 5 Indian Cities")
    print("=" * 60)
    
    try:
        # Collect data for all 5 cities
        daily_data = collector.collect_daily_data()
        
        # Save data in multiple formats
        json_file = collector.save_data(daily_data)
        csv_file = collector.save_csv_summary(daily_data)
        
        # Generate and print summary
        summary = collector.generate_summary_report(daily_data)
        print("\n" + summary)
        
        # Save summary with UTF-8 encoding
        summary_file = f"daily_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
        try:
            with open(summary_file, 'w', encoding='utf-8') as f:
                f.write(summary)
            print(f"📄 Summary report saved to: {summary_file}")
        except Exception as e:
            print(f"❌ Error saving summary: {e}")
        
        print(f"\n✅ Collection complete! Files generated:")
        if json_file:
            print(f"   📁 JSON Data: {json_file}")
        if csv_file:
            print(f"   📊 CSV Summary: {csv_file}")
        print(f"   📄 Text Summary: {summary_file}")
        
    except KeyboardInterrupt:
        print("\n⏹️  Data collection interrupted by user")
    except Exception as e:
        print(f"\n❌ Error during data collection: {e}")
        logger.error(f"Fatal error: {e}", exc_info=True)

2025-09-04 23:52:01,320 - INFO - 🚀 Starting data collection for 5 cities: Mumbai, Delhi, Bangalore, Chennai, Kolkata
2025-09-04 23:52:01,321 - INFO - 📍 [1/5] Collecting data for Mumbai...


🚀 Starting Aura.AI Data Collection for Top 5 Indian Cities


2025-09-04 23:52:01,655 - INFO - ✅ Weather data collected for Mumbai: Mist, 27.2°C
2025-09-04 23:52:02,638 - ERROR - ❌ Network error fetching news for Mumbai: 429 Client Error: Too Many Requests for url: https://newsapi.org/v2/everything?apiKey=47e7597b3e51433a81fcc3d908b1a792&q=mumbai+AND+india&language=en&sortBy=publishedAt&pageSize=15&from=2025-09-03



⏹️  Data collection interrupted by user


In [2]:
import requests
import json

def test_newsdata_io_debug():
    """
    Debug function to test NewsData.io API and identify the 422 error.
    """
    # Your NewsData.io API Key 
    API_KEY = "pub_1e11c85ce5a947b881f961784d8f3ea0"  # Double-check this key!
    
    print("🔍 Debugging NewsData.io API Request")
    print("="*50)
    
    # Test with minimal parameters first
    url = "https://newsdata.io/api/1/latest"
    params = {
        'apikey': API_KEY,
        'q': 'india',  # Simple query first
        'size': 3
    }
    
    print(f"API Key: {API_KEY}")
    print(f"Request URL: {url}")
    print(f"Params: {params}")
    print("-"*30)
    
    try:
        print("📡 Making API request...")
        response = requests.get(url, params=params, timeout=15)
        
        print(f"Status Code: {response.status_code}")
        
        # Try to get the error details
        try:
            error_data = response.json()
            print(f"Error Response: {json.dumps(error_data, indent=2)}")
            
            if 'message' in error_data:
                print(f"🔴 Error Message: {error_data['message']}")
            if 'code' in error_data:
                print(f"🔴 Error Code: {error_data['code']}")
                
        except json.JSONDecodeError:
            print(f"Raw Response: {response.text}")
            
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")

# Let's also test just the API key validity
def test_api_key_only():
    """
    Test if the API key is valid by making a simple request
    """
    API_KEY = "pub_1e11c85ce5a947b881f961784d8f3ea0"
    
    url = "https://newsdata.io/api/1/latest"
    params = {'apikey': API_KEY, 'q': 'test'}
    
    print("\n🔑 Testing API Key Validity")
    print("="*30)
    
    try:
        response = requests.get(url, params=params, timeout=10)
        print(f"Status Code: {response.status_code}")
        
        if response.status_code == 200:
            data = response.json()
            print(f"✅ API Key appears valid! Status: {data.get('status')}")
            print(f"Total Results: {data.get('totalResults', 0)}")
        else:
            print(f"❌ API Key may be invalid or expired")
            print(f"Response: {response.text}")
            
    except Exception as e:
        print(f"Error testing API key: {e}")

# Run both tests
if __name__ == "__main__":
    test_api_key_only()
    test_newsdata_io_debug()


🔑 Testing API Key Validity
Status Code: 200
✅ API Key appears valid! Status: success
Total Results: 19696
🔍 Debugging NewsData.io API Request
API Key: pub_1e11c85ce5a947b881f961784d8f3ea0
Request URL: https://newsdata.io/api/1/latest
Params: {'apikey': 'pub_1e11c85ce5a947b881f961784d8f3ea0', 'q': 'india', 'size': 3}
------------------------------
📡 Making API request...
Status Code: 200
Error Response: {
  "status": "success",
  "totalResults": 21643,
  "results": [
    {
      "article_id": "9c39b731ef93c1120cfb683d8ace039c",
      "title": "VB arrests man on bribe charge",
      "link": "https://timesofindia.indiatimes.com/city/ludhiana/vb-arrests-man-on-bribe-charge/articleshow/123705230.cms",
      "keywords": [
        "rajat sharma",
        "arrest",
        "punjab vigilance bureau",
        "bribe charge",
        "anti-corruption"
      ],
      "creator": [
        "Text Size"
      ],
      "description": null,
      "content": "ONLY AVAILABLE IN PAID PLANS",
      "pubDat

In [3]:
import requests
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def test_newsdata_io_working():
    """
    Working NewsData.io API test with sentiment analysis
    """
    API_KEY = "pub_1e11c85ce5a947b881f961784d8f3ea0"  # Your working key
    city = "Mumbai"
    keyword = "mumbai"
    
    analyzer = SentimentIntensityAnalyzer()
    
    print(f"🧪 Testing NewsData.io for {city} with keyword: '{keyword}'")
    print("="*60)
    
    try:
        url = "https://newsdata.io/api/1/latest"
        params = {
            'apikey': API_KEY,
            'q': keyword,
            'language': 'en',
            'size': 5  # Get 5 articles
        }

        print("📡 Making API request...")
        response = requests.get(url, params=params, timeout=15)
        response.raise_for_status()

        data = response.json()
        
        if data.get('status') != 'success':
            print(f"❌ API Error: {data.get('message')}")
            return
        
        articles = data.get('results', [])
        print(f"✅ Found {len(articles)} articles for '{keyword}'")
        print("="*60)
        
        for i, article in enumerate(articles, 1):
            title = article.get('title', 'No Title')
            description = article.get('description', '') or ''
            
            # Analyze sentiment
            text_to_analyze = f"{title} {description}"
            sentiment_scores = analyzer.polarity_scores(text_to_analyze)
            
            # Determine sentiment label
            compound_score = sentiment_scores['compound']
            if compound_score >= 0.05:
                sentiment_label = "👍 POSITIVE"
            elif compound_score <= -0.05:
                sentiment_label = "👎 NEGATIVE"
            else:
                sentiment_label = "😐 NEUTRAL"
            
            print(f"\n📰 Article #{i}:")
            print(f"   Title: {title}")
            print(f"   Source: {article.get('source_id', 'Unknown')}")
            print(f"   Published: {article.get('pubDate', 'Unknown')}")
            print(f"   Sentiment: {sentiment_label} ({compound_score:.3f})")
            if description:
                print(f"   Description: {description[:100]}...")
            print(f"   URL: {article.get('link', 'No URL')}")
            print("-" * 50)

    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")

# Run the working test
if __name__ == "__main__":
    test_newsdata_io_working()

🧪 Testing NewsData.io for Mumbai with keyword: 'mumbai'
📡 Making API request...
✅ Found 5 articles for 'mumbai'

📰 Article #1:
   Title: ‘Bigg Boss 19’: Housemates Plan Sweetest Surprise for Birthday Girl Neelam Giri, Contestants Dance Together on Bhojpuri Song (Watch Videos)
   Source: latestly
   Published: 2025-09-04 18:27:37
   Sentiment: 👍 POSITIVE (0.921)
   Description: Big Boss contestants of the house planned a beautiful surprise for Neelam Giri and were seen dancing...
   URL: https://www.latestly.com/entertainment/tv/bigg-boss-19-housemates-plan-sweetest-surprise-for-birthday-girl-neelam-giri-contestants-dance-together-on-bhojpuri-song-watch-videos-7094505.html
--------------------------------------------------

📰 Article #2:
   Title: Entertainment News | Kartik Aaryan Announces Wrap of 'Tu Meri Main Tera Main Tera Tu Meri'
   Source: latestly
   Published: 2025-09-04 18:26:23
   Sentiment: 👍 POSITIVE (0.681)
   Description: Get latest articles and stories on Entertainment 