In [1]:
!pip install pytrends

Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: pytrends
Successfully installed pytrends-4.9.2




In [2]:
import pandas as pd

# Sentiment / social-media qualifiers that are combined with the sector keywords
social_media_terms = [
    "Twitter",
    "Reddit",
    "sentiment",
    "prediction",
    "volatility",
    "crash",
    "rally",
]

# Base search keywords for each NIFTY sector, keyed by sector name
nifty_sectors_social_media = dict([
    ("General", ["NIFTY", "NSE", "BSE"]),
    ("IT", ["NIFTY IT", "tech stocks"]),
    ("Banking", ["NIFTY Bank", "bank stocks", "banking sector"]),
    ("Pharma", ["NIFTY Pharma", "pharma stocks", "healthcare sector"]),
    ("FMCG", ["NIFTY FMCG", "consumer goods"]),
    ("Auto", ["NIFTY Auto", "auto sector"]),
    ("Energy", ["NIFTY Energy", "energy stocks", "renewable energy"]),
    ("Metals", ["NIFTY Metal", "mining stocks"]),
    ("Financial Services", ["NIFTY Financial", "finance stocks"]),
    ("Realty", ["NIFTY Realty", "real estate"]),
    ("Consumer Durables", ["NIFTY Consumer Durables", "consumer durables"]),
])

def generate_social_media_combinations(sector_keywords, social_terms):
    """Pair every sector keyword with every social term.

    Args:
        sector_keywords: iterable of base keyword strings.
        social_terms: iterable of qualifier strings to prefix to each keyword.

    Returns:
        A list of "<term> <keyword>" strings. All terms for the first keyword
        come first, then all terms for the second keyword, and so on.
    """
    combinations = []
    for keyword in sector_keywords:
        for term in social_terms:
            combinations.append(f"{term} {keyword}")
    return combinations

# Per-sector keyword lists: the bare sector keywords first, followed by every
# "<social term> <sector keyword>" combination for that sector.
keywords_by_category = {
    sector: sector_keywords
    + generate_social_media_combinations(sector_keywords, social_media_terms)
    for sector, sector_keywords in nifty_sectors_social_media.items()
}

# Single flat list of every keyword across all sectors (used for the total count)
final_keywords_list = []
for category_keywords in keywords_by_category.values():
    final_keywords_list.extend(category_keywords)

# One CSV row per category.
# NOTE(review): the 'Keywords' cell holds the whole Python list, so it is written
# to the CSV as a stringified list rather than one keyword per row.
category_rows = list(keywords_by_category.items())
keywords_df = pd.DataFrame(category_rows, columns=['Category', 'Keywords'])
keywords_df.to_csv('nifty_social_media_volatility_keywords_full.csv', index=False)

# Report the total and list the keywords of each category
total_keywords = len(final_keywords_list)
print(f"Total number of keywords generated: {total_keywords}")
for category, keywords in keywords_by_category.items():
    print(f"\nCategory: {category} | Keywords: {keywords}")


Total number of keywords generated: 208

Category: General | Keywords: ['NIFTY', 'NSE', 'BSE', 'Twitter NIFTY', 'Reddit NIFTY', 'sentiment NIFTY', 'prediction NIFTY', 'volatility NIFTY', 'crash NIFTY', 'rally NIFTY', 'Twitter NSE', 'Reddit NSE', 'sentiment NSE', 'prediction NSE', 'volatility NSE', 'crash NSE', 'rally NSE', 'Twitter BSE', 'Reddit BSE', 'sentiment BSE', 'prediction BSE', 'volatility BSE', 'crash BSE', 'rally BSE']

Category: IT | Keywords: ['NIFTY IT', 'tech stocks', 'Twitter NIFTY IT', 'Reddit NIFTY IT', 'sentiment NIFTY IT', 'prediction NIFTY IT', 'volatility NIFTY IT', 'crash NIFTY IT', 'rally NIFTY IT', 'Twitter tech stocks', 'Reddit tech stocks', 'sentiment tech stocks', 'prediction tech stocks', 'volatility tech stocks', 'crash tech stocks', 'rally tech stocks']

Category: Banking | Keywords: ['NIFTY Bank', 'bank stocks', 'banking sector', 'Twitter NIFTY Bank', 'Reddit NIFTY Bank', 'sentiment NIFTY Bank', 'prediction NIFTY Bank', 'volatility NIFTY Bank', 'crash NIF

In [4]:
from pytrends.request import TrendReq
import pandas as pd
import time

# Initialize pytrends
# NOTE(review): pytrends documents `tz` as an offset in minutes and gives US CST
# as 360, which suggests IST may need to be -330 rather than 330 — confirm.
pytrends = TrendReq(hl='en-US', tz=330)

# Parameters for Google Trends
geo = 'IN'
timeframe = 'all'
category = 7  # Finance category

# Collection-loop settings
min_nonzero_points = 5     # series with fewer non-zero observations are logged as missing
request_delay_seconds = 2  # pause after every request to avoid being rate-limited

# Data storage
data = {}          # keyword -> interest-over-time Series
missing_data = {}  # keyword -> reason the keyword was skipped

# Loop through each category and its keywords
for category_name, keywords in keywords_by_category.items():
    for keyword in keywords:
        try:
            # Build the payload with the keyword and fetch its interest over time
            pytrends.build_payload([keyword], cat=category, timeframe=timeframe, geo=geo, gprop='')
            df = pytrends.interest_over_time()

            if df.empty:
                missing_data[keyword] = "No data available"
                continue

            series_data = df[keyword]
            if (series_data != 0).sum() < min_nonzero_points:
                missing_data[keyword] = f"Fewer than {min_nonzero_points} non-zero data points"
            else:
                data[keyword] = series_data  # Store time series data as a Series
        except Exception as e:
            # Best effort: record the failure and move on to the next keyword
            missing_data[keyword] = str(e)
        finally:
            # Sleep on failures too; otherwise an error (e.g. a rate-limit
            # response) is followed immediately by the next request.
            time.sleep(request_delay_seconds)

# Align every Series to the first day of each month from 2004-01 through today.
# Months a keyword has no value for are filled with 0.
month_starts = pd.date_range(start='2004-01-01', end=pd.Timestamp.today(), freq='MS')
aligned_data = pd.DataFrame(
    {k: v.reindex(month_starts, fill_value=0) for k, v in data.items()},
    index=month_starts,  # keeps a DatetimeIndex even when `data` is empty
)

# Save only if there are any valid data points
if not aligned_data.empty:
    aligned_data.to_csv("nifty_social_media_trends_monthly.csv", index=True)

# Save missing data information
missing_df = pd.DataFrame(list(missing_data.items()), columns=['Keyword', 'Issue'])
missing_df.to_csv("missing_data_log.csv", index=False)

# Output total keywords collected and where the results were written
print(f"Total keywords processed: {len(data)}")
if aligned_data.empty:
    print("Data collection complete. No keyword returned usable data, so no trends file was written; see missing data log in 'missing_data_log.csv'.")
else:
    print("Data collection complete. Monthly time series data saved in 'nifty_social_media_trends_monthly.csv' and missing data log in 'missing_data_log.csv'.")


  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna

Total keywords processed: 46
Data collection complete. Monthly time series data saved in 'nifty_social_media_trends_monthly.csv' and missing data log in 'missing_data_log.csv'.
