In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.express as px
import time
from pytrends.request import TrendReq

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import plotly.offline as pyo

pyo.init_notebook_mode(connected=True)  # Initialize Plotly's offline mode so figures render in Jupyter Notebook

import warnings
# Blanket filter: no category or module is given, so every warning is silenced
# from this point on.
# NOTE(review): this also hides deprecation/runtime warnings that may matter
# while debugging - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')  # Suppress warnings for clean output


In [3]:
import pandas as pd
import time
from pytrends.request import TrendReq
import random

# Opt in to pandas' future "no silent downcasting" behaviour to avoid warnings.
# NOTE(review): presumably aimed at the downcasting FutureWarning raised by calls
# such as fillna - confirm the installed pandas version recognises this option.
pd.set_option('future.no_silent_downcasting', True)

def get_trends_data(keywords, timeframe='now 7-d', geo='', retries=3, backoff_factor=1.5):
    """
    Fetch Google Trends interest-over-time data for a set of keywords.

    Keywords are queried in batches of five (Google Trends limit). A batch that
    raises an error or returns no rows is retried with a randomized, growing
    wait; a batch that still fails after ``retries`` attempts is reported and
    skipped. A 60-70 second pause is inserted between consecutive batches to
    avoid rate limiting; no pause is made after the final batch.

    Parameters:
    keywords (list or str): Keywords to query. A single string is treated as
        one keyword rather than being sliced into characters.
    timeframe (str): The time range for the trends data.
    geo (str): Geographic location (default is global).
    retries (int): Number of attempts per batch before giving up on it.
    backoff_factor (float): Factor to increase wait time between retries.

    Returns:
    pd.DataFrame: Outer-merged (on the date index) trends data for every batch
        that succeeded, with missing values filled with 0. Empty when no
        keywords were given or no batch returned data.
    """
    # Normalise the input first so an empty request never opens a session.
    if isinstance(keywords, str):
        keywords = [keywords]
    else:
        keywords = list(keywords)
    if not keywords:
        return pd.DataFrame()

    batch_size = 5  # Google Trends limit
    pytrends = TrendReq(hl='en-US', tz=360)
    all_data = pd.DataFrame()

    batch_starts = range(0, len(keywords), batch_size)
    total_batches = len(batch_starts)

    for batch_number, start in enumerate(batch_starts, start=1):
        keyword_batch = keywords[start:start + batch_size]
        is_last_batch = batch_number == total_batches
        batch_success = False
        attempt = 0
        wait_time = 60  # Initial wait time in seconds

        print(f"Processing batch {batch_number}/{total_batches}: {keyword_batch}")

        # Retry logic
        while not batch_success and attempt < retries:
            try:
                pytrends.build_payload(keyword_batch, timeframe=timeframe, geo=geo)
                data = pytrends.interest_over_time()

                if not data.empty:
                    # Drop isPartial column if it exists
                    if 'isPartial' in data.columns:
                        data = data.drop(columns='isPartial')

                    if all_data.empty:
                        # First successful batch becomes the base frame
                        all_data = data
                    else:
                        # Merge with existing data on date index
                        all_data = pd.merge(all_data, data, left_index=True, right_index=True, how='outer')

                    batch_success = True
                    print(f"✓ Successfully retrieved data for: {keyword_batch}")
                else:
                    # An empty response counts as a failed attempt
                    print(f"✗ No data returned for: {keyword_batch}")
                    attempt += 1

            except Exception as e:
                attempt += 1
                print(f"✗ Error on attempt {attempt}/{retries} for {keyword_batch}: {str(e)}")

            if not batch_success and attempt < retries:
                # Retry: grow the wait and add up to 50% random jitter
                wait_time = wait_time * backoff_factor * (1 + random.random() * 0.5)
                print(f"  Waiting {wait_time:.1f} seconds before retry...")
                time.sleep(wait_time)
            else:
                # The batch is finished, either successfully or out of attempts
                if not batch_success:
                    print(f"✗ Giving up on {keyword_batch} after {attempt} attempts")
                if not is_last_batch:
                    # Pause only when another batch follows; sleeping after the
                    # final batch just delays returning the result.
                    jitter = random.random() * 10 + 60  # Between 60-70 seconds
                    print(f"  Waiting {jitter:.1f} seconds before next batch...")
                    time.sleep(jitter)

    # Fill NaN values with 0 (for dates that weren't present in all batches)
    all_data = all_data.fillna(0)

    return all_data

# Example usage
if __name__ == "__main__":
    # Query window and region for the collection run.
    geo = ''  # Empty for worldwide, 'US' for United States, etc.
    timeframe = '2025-01-01 2025-03-31'

    # Digital-marketing terms to track, one per line.
    keywords = [
        'digital marketing',
        'social media',
        'content marketing',
        'SEO',
        'PPC',
        'email marketing',
        'influencer marketing',
        'ecommerce',
        'video marketing',
        'affiliate marketing',
        'online advertising',
        'SEM',
        'Google Ads',
        'Facebook Ads',
        'content creation',
        'brand strategy',
        'customer engagement',
        'market research',
        'analytics',
        'data-driven marketing',
        'user experience',
    ]

    # Collect the data, then persist it as CSV for the later cells to read.
    trends_df = get_trends_data(keywords, timeframe, geo)
    trends_df.to_csv('digital_marketing_trends_q1_2025.csv')

    print(f"Data collection complete! DataFrame shape: {trends_df.shape}")

Processing batch 1/5: ['digital marketing', 'social media', 'content marketing', 'SEO', 'PPC']
✓ Successfully retrieved data for: ['digital marketing', 'social media', 'content marketing', 'SEO', 'PPC']
  Waiting 62.0 seconds before next batch...
Processing batch 2/5: ['email marketing', 'influencer marketing', 'ecommerce', 'video marketing', 'affiliate marketing']
✓ Successfully retrieved data for: ['email marketing', 'influencer marketing', 'ecommerce', 'video marketing', 'affiliate marketing']
  Waiting 69.8 seconds before next batch...
Processing batch 3/5: ['online advertising', 'SEM', 'Google Ads', 'Facebook Ads', 'content creation']
✗ Error on attempt 1/3 for ['online advertising', 'SEM', 'Google Ads', 'Facebook Ads', 'content creation']: The request failed: Google returned a response with code 429
  Waiting 125.4 seconds before retry...
✓ Successfully retrieved data for: ['online advertising', 'SEM', 'Google Ads', 'Facebook Ads', 'content creation']
  Waiting 66.1 seconds befor

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.ticker import MaxNLocator
import matplotlib.dates as mdates

def _format_date_axis():
    """Apply the shared date formatting to the x-axis of the current axes."""
    x_axis = plt.gca().xaxis
    x_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    x_axis.set_major_locator(mdates.WeekdayLocator(interval=2))
    plt.xticks(rotation=45)


def visualize_trends_data(trends_df, output_folder="trends_visualizations"):
    """
    Create and save a set of visualizations for Google Trends data.

    Five PNG files are written into ``output_folder`` (created if missing):
    a line plot of every keyword, a correlation heatmap, a line plot of the
    five keywords with the highest mean interest, a seasonal decomposition of
    the keyword with the highest mean interest, and a bar chart of mean
    interest per keyword. The seasonal decomposition is best-effort: when it
    cannot be produced a message is printed and the remaining plots are still
    generated.

    Note: this function changes global plotting state (seaborn style and
    matplotlib rcParams for font size and default figure size).

    Parameters:
    trends_df (pd.DataFrame): DataFrame containing Google Trends data, one
        column per keyword. Dates are expected on the index (the x-axis is
        formatted as dates).
    output_folder (str): Folder to save visualization images

    Raises:
    ValueError: If ``trends_df`` is empty; nothing is written in that case.
    """
    import os

    # Fail fast with a clear message instead of an obscure error deep inside
    # plotting, and before any folder or file is created.
    if trends_df.empty:
        raise ValueError("trends_df is empty - nothing to visualize")

    os.makedirs(output_folder, exist_ok=True)

    # Set the style
    sns.set(style="whitegrid")
    plt.rcParams.update({'font.size': 12, 'figure.figsize': (14, 8)})

    # The per-keyword mean drives plots 3, 4 and 5, so compute it once.
    mean_interest = trends_df.mean()

    # 1. Line plot of all trends over time
    plt.figure(figsize=(16, 10))
    plt.title('Google Trends for Digital Marketing Keywords (Q1 2025)', fontsize=16)

    for column in trends_df.columns:
        plt.plot(trends_df.index, trends_df[column], linewidth=2, label=column)

    # Legend is anchored outside the axes, to the right of the plot
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xlabel('Date')
    plt.ylabel('Search Interest')
    plt.grid(True, alpha=0.3)
    _format_date_axis()

    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, "all_trends_line_plot.png"), dpi=300)
    plt.close()

    # 2. Heatmap of correlation between trends
    plt.figure(figsize=(14, 12))
    correlation = trends_df.corr()
    # Mask the upper triangle (including the diagonal) so each pair shows once
    mask = np.triu(np.ones_like(correlation, dtype=bool))

    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    sns.heatmap(correlation, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
                square=True, linewidths=.5, annot=True, fmt=".2f", cbar_kws={"shrink": .8})

    plt.title('Correlation Between Digital Marketing Trends', fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, "correlation_heatmap.png"), dpi=300)
    plt.close()

    # 3. Top 5 trending topics over time
    top_5_keywords = mean_interest.nlargest(5).index
    plt.figure(figsize=(14, 8))

    for keyword in top_5_keywords:
        plt.plot(trends_df.index, trends_df[keyword], linewidth=3, label=keyword)

    plt.title('Top 5 Digital Marketing Trends (Q1 2025)', fontsize=16)
    plt.xlabel('Date')
    plt.ylabel('Search Interest')
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    _format_date_axis()

    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, "top_5_trends.png"), dpi=300)
    plt.close()

    # 4. Seasonal decomposition for top trend (best-effort)
    top_trend = mean_interest.idxmax()
    decomposition_fig = None
    try:
        from statsmodels.tsa.seasonal import seasonal_decompose

        result = seasonal_decompose(trends_df[top_trend], model='additive', period=7)
        # result.plot() builds its own figure, so size and save THAT figure.
        # Opening a separate plt.figure() first would leave an empty figure
        # behind that is never closed.
        decomposition_fig = result.plot()
        decomposition_fig.set_size_inches(14, 12)
        decomposition_fig.suptitle(f'Seasonal Decomposition of "{top_trend}" Trend', fontsize=16)
        decomposition_fig.tight_layout()
        decomposition_fig.subplots_adjust(top=0.9)
        decomposition_fig.savefig(os.path.join(output_folder, "seasonal_decomposition.png"), dpi=300)
    except ImportError:
        print("Could not perform seasonal decomposition - statsmodels is not installed")
    except Exception as exc:
        # Keep going with the remaining plots, but report the actual reason.
        print(f"Could not perform seasonal decomposition - may need more data points ({exc})")
    finally:
        if decomposition_fig is not None:
            plt.close(decomposition_fig)

    # 5. Bar chart of average interest over the period
    avg_interest = mean_interest.sort_values(ascending=False)

    plt.figure(figsize=(16, 10))
    bars = plt.bar(avg_interest.index, avg_interest.values, color=sns.color_palette("viridis", len(avg_interest)))

    plt.title('Average Search Interest by Keyword (Q1 2025)', fontsize=16)
    plt.xlabel('Keyword')
    plt.ylabel('Average Interest')
    plt.xticks(rotation=90)
    plt.grid(axis='y', alpha=0.3)

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.5,
                f'{height:.1f}', ha='center', va='bottom', rotation=0)

    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, "average_interest_bar.png"), dpi=300)
    plt.close()

    print(f"✓ All visualizations saved to {output_folder}/")

# Usage example
if __name__ == "__main__":
    # Only the CSV load is guarded: a FileNotFoundError raised later, while
    # saving plots, must not be misreported as "the CSV is missing".
    try:
        trends_df = pd.read_csv('digital_marketing_trends_q1_2025.csv', index_col=0, parse_dates=True)
    except FileNotFoundError:
        print("Please run the data collection script first to generate the CSV file.")
    else:
        visualize_trends_data(trends_df)

✓ All visualizations saved to trends_visualizations/


<Figure size 1400x1200 with 0 Axes>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_marketing_trends(trends_df, correlation_threshold=0.7, output_folder='trends_visualizations'):
    """
    Perform simplified advanced analysis on Google Trends data for marketing insights.

    Uses only numpy/pandas for the analysis itself: keywords are grouped with a
    simple correlation threshold instead of K-means clustering.

    Parameters:
    trends_df (pd.DataFrame): DataFrame containing Google Trends data, one
        numeric column per keyword. Weekly patterns are only computed when the
        index is a DatetimeIndex.
    correlation_threshold (float): Keywords whose correlation with a group's
        seed keyword is strictly above this value join that group.
    output_folder (str or None): Folder in which the correlation heatmap
        ``keyword_correlation_analysis.png`` is saved (created if missing).
        Pass None to skip the heatmap and run the analysis without plotting
        or writing any file.

    Returns:
    dict: Analysis results with the keys
        'basic_stats'      - keywords with highest/lowest mean and std,
        'trend_direction'  - linear-fit slopes split into 'trending_up'
                             (sorted descending) and 'trending_down'
                             (sorted ascending); zero slopes are in neither,
        'weekly_patterns'  - per-keyword peak/low weekday and weekend-to-weekday
                             ratio ({} when the index is not a DatetimeIndex),
        'keyword_groups'   - group id -> list of keywords; every keyword
                             belongs to exactly one group,
        'related_keywords' - per keyword, its up to three most correlated
                             other keywords.

    Raises:
    ValueError: If ``trends_df`` is empty.
    """
    if trends_df.empty:
        raise ValueError("trends_df is empty - nothing to analyze")

    results = {}

    # 1. Basic statistics
    column_means = trends_df.mean()
    column_stds = trends_df.std()
    results['basic_stats'] = {
        'highest_avg_interest': column_means.idxmax(),
        'lowest_avg_interest': column_means.idxmin(),
        'most_volatile': column_stds.idxmax(),
        'most_stable': column_stds.idxmin(),
    }

    # 2. Identify trending topics: slope of a degree-1 fit against row position
    positions = np.arange(len(trends_df))
    slopes = {
        column: np.polyfit(positions, trends_df[column].values, 1)[0]
        for column in trends_df.columns
    }

    trending_up = {k: v for k, v in slopes.items() if v > 0}
    trending_down = {k: v for k, v in slopes.items() if v < 0}

    results['trend_direction'] = {
        'trending_up': dict(sorted(trending_up.items(), key=lambda x: x[1], reverse=True)),
        'trending_down': dict(sorted(trending_down.items(), key=lambda x: x[1]))
    }

    # 3. Weekly patterns analysis (needs real dates on the index)
    if isinstance(trends_df.index, pd.DatetimeIndex):
        day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        # Both arrays are the same for every column, so compute them once
        day_of_week = np.asarray(trends_df.index.dayofweek)
        weekend_mask = np.isin(day_of_week, [5, 6])  # 5, 6 are Saturday and Sunday
        weekly_patterns = {}

        for column in trends_df.columns:
            series = trends_df[column]
            day_avg = series.groupby(day_of_week).mean()

            # Weekend vs weekday ratio; 0 when it cannot be computed (weekday
            # mean not positive, or no weekend rows at all, e.g. weekly data)
            weekend_avg = series[weekend_mask].mean()
            weekday_avg = series[~weekend_mask].mean()
            if weekday_avg > 0 and pd.notna(weekend_avg):
                weekend_ratio = weekend_avg / weekday_avg
            else:
                weekend_ratio = 0

            weekly_patterns[column] = {
                'peak_day': day_names[day_avg.idxmax()],
                'peak_value': day_avg.max(),
                'low_day': day_names[day_avg.idxmin()],
                'low_value': day_avg.min(),
                'weekend_vs_weekday': weekend_ratio
            }

        results['weekly_patterns'] = weekly_patterns
    else:
        results['weekly_patterns'] = {}
        print("Warning: DataFrame index is not DatetimeIndex, skipping weekly patterns analysis")

    # 4. Simple keyword grouping based on correlation (instead of K-means clustering)
    corr_matrix = trends_df.corr()

    groups = {}
    visited = set()

    for keyword in trends_df.columns:
        if keyword in visited:
            continue

        keyword_corr = corr_matrix[keyword]
        strongly_correlated = keyword_corr[keyword_corr > correlation_threshold].index
        # Skip the keyword itself AND anything already placed in an earlier
        # group; otherwise a keyword correlated with two seeds would be listed
        # in two groups.
        related = [
            other for other in strongly_correlated
            if other != keyword and other not in visited
        ]

        # A keyword without unassigned partners forms a group of its own
        groups[len(groups)] = [keyword] + related
        visited.add(keyword)
        visited.update(related)

    results['keyword_groups'] = groups

    # 5. Correlation-based recommendations: top three other keywords per keyword
    results['related_keywords'] = {
        keyword: corr_matrix[keyword].drop(keyword).nlargest(3).to_dict()
        for keyword in trends_df.columns
    }

    # 6. Create a heatmap visualization of keyword correlations (optional)
    if output_folder is not None:
        import os
        os.makedirs(output_folder, exist_ok=True)

        heatmap_fig = plt.figure(figsize=(14, 12))
        try:
            # Mask the upper triangle (including the diagonal) so each pair shows once
            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
            cmap = sns.diverging_palette(230, 20, as_cmap=True)

            sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
                        square=True, linewidths=.5, annot=False, cbar_kws={"shrink": .8})

            plt.title('Correlation Between Digital Marketing Trends', fontsize=16)
            plt.tight_layout()
            plt.savefig(os.path.join(output_folder, 'keyword_correlation_analysis.png'), dpi=300)
        finally:
            # Release the figure even when plotting or saving fails
            plt.close(heatmap_fig)

    return results

def generate_insights_report(analysis_results, output_file="digital_marketing_trends_insights.md"):
    """
    Generate a markdown report from the analysis results.

    The whole report is assembled in memory first and written in one go, so a
    missing key in ``analysis_results`` raises before the output file is
    created or truncated. Recommendations are numbered consecutively, so
    omitting one (no rising topics, no weekly data, no multi-keyword group)
    leaves no gap in the numbering.

    Parameters:
    analysis_results (dict): Results from the analyze_marketing_trends function.
        Must contain 'basic_stats', 'trend_direction', 'weekly_patterns',
        'keyword_groups' and 'related_keywords'.
    output_file (str): Path to save the markdown report (written as UTF-8)

    Raises:
    KeyError: If a required section is missing from ``analysis_results``.
    """
    stats = analysis_results['basic_stats']
    trending_up = analysis_results['trend_direction']['trending_up']
    trending_down = analysis_results['trend_direction']['trending_down']
    weekly_patterns = analysis_results['weekly_patterns']
    keyword_groups = analysis_results['keyword_groups']
    related_keywords = analysis_results['related_keywords']

    chunks = ["# Digital Marketing Trends Analysis (Q1 2025)\n\n"]

    # Basic stats section
    chunks.append("## Key Findings\n\n")
    chunks.append(f"- **Highest Average Interest:** {stats['highest_avg_interest']}\n")
    chunks.append(f"- **Lowest Average Interest:** {stats['lowest_avg_interest']}\n")
    chunks.append(f"- **Most Volatile Topic:** {stats['most_volatile']}\n")
    chunks.append(f"- **Most Stable Topic:** {stats['most_stable']}\n\n")

    # Trend direction: the first five entries of each (already sorted) mapping
    chunks.append("## Trending Topics\n\n")
    chunks.append("### Topics Gaining Popularity\n\n")
    for topic, slope in list(trending_up.items())[:5]:
        chunks.append(f"- **{topic}** (slope: {slope:.4f})\n")

    chunks.append("\n### Topics Declining in Interest\n\n")
    for topic, slope in list(trending_down.items())[:5]:
        chunks.append(f"- **{topic}** (slope: {slope:.4f})\n")

    # Weekly patterns (section omitted when no weekly data is available)
    if weekly_patterns:
        chunks.append("\n## Weekly Search Patterns\n\n")
        chunks.append("| Keyword | Peak Day | Weekend/Weekday Ratio |\n")
        chunks.append("|---------|----------|----------------------|\n")

        for keyword, data in weekly_patterns.items():
            chunks.append(f"| {keyword} | {data['peak_day']} | {data['weekend_vs_weekday']:.2f} |\n")

    # Keyword groups
    chunks.append("\n## Keyword Groups\n\n")
    chunks.append("These groups represent terms that show similar search patterns:\n\n")

    for group_id, keywords in keyword_groups.items():
        chunks.append(f"### Group {group_id+1}\n\n")
        for keyword in keywords:
            chunks.append(f"- {keyword}\n")
        chunks.append("\n")

    # Related keywords
    chunks.append("## Related Keywords\n\n")
    chunks.append("For each keyword, here are the most closely related terms based on search patterns:\n\n")

    for keyword, related in related_keywords.items():
        chunks.append(f"### {keyword}\n\n")
        for related_keyword, correlation in related.items():
            chunks.append(f"- {related_keyword} (correlation: {correlation:.2f})\n")
        chunks.append("\n")

    chunks.append("\n## Marketing Recommendations\n\n")
    chunks.append("Based on this analysis, consider the following strategies:\n\n")

    # Collect the applicable recommendations first, then number them in order
    recommendations = []

    trending_topics = list(trending_up.keys())[:3]
    if trending_topics:
        recommendations.append(
            f"**Focus on rising trends**: Create content around {', '.join(trending_topics)}"
        )

    # Topics with the strongest weekend presence (if available)
    if weekly_patterns:
        weekend_topics = sorted(
            [(k, v['weekend_vs_weekday']) for k, v in weekly_patterns.items()],
            key=lambda x: x[1], reverse=True
        )[:3]
        recommendations.append(
            f"**Weekend campaigns**: Target {', '.join([t[0] for t in weekend_topics])} on weekends"
        )

    # First group with at least two keywords supplies the bundling suggestion
    for keywords in keyword_groups.values():
        if len(keywords) >= 2:
            recommendations.append(
                f"**Bundle related topics**: Create integrated campaigns that combine {' and '.join(keywords[:2])}"
            )
            break

    recommendations.append(
        "**Counter-cyclical opportunity**: Consider investing in declining topics with lower competition"
    )

    for number, text in enumerate(recommendations, start=1):
        chunks.append(f"{number}. {text}\n")

    # Explicit encoding so non-ASCII keywords are written the same on every platform
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(''.join(chunks))

    print(f"✓ Insights report generated: {output_file}")

# Usage example
if __name__ == "__main__":
    # Only the CSV load is guarded: a FileNotFoundError raised later, while
    # saving the heatmap or the report, must not be misreported as
    # "the CSV is missing".
    try:
        trends_df = pd.read_csv('digital_marketing_trends_q1_2025.csv', index_col=0, parse_dates=True)
    except FileNotFoundError:
        print("Please run the data collection script first to generate the CSV file.")
    else:
        analysis_results = analyze_marketing_trends(trends_df)
        generate_insights_report(analysis_results)

✓ Insights report generated: digital_marketing_trends_insights.md
