# Migration Data Analysis - Comprehensive Guide

This notebook provides a complete workflow for analyzing migration data using Python and data science libraries. It covers data loading, exploration, visualization, statistical analysis, and machine learning approaches.

## Table of Contents
1. [Import Required Libraries](#import-libraries)
2. [Data Loading and Initial Exploration](#data-loading)
3. [Data Preprocessing](#preprocessing)
4. [Exploratory Data Analysis](#eda)
5. [Statistical Analysis](#statistical-analysis)
6. [Machine Learning for Migration Prediction](#ml-analysis)
7. [Results and Conclusions](#conclusions)

---

## 1. Import Required Libraries {#import-libraries}

We'll start by importing all the necessary libraries for data manipulation, visualization, and analysis.

In [None]:
# Core data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats
from statsmodels.tsa.seasonal import seasonal_decompose
import pingouin as pg

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Geospatial analysis
import geopandas as gpd
import folium
from shapely.geometry import Point

# Utilities
import warnings
import os
from pathlib import Path
from datetime import datetime, timedelta

# Notebook-wide configuration.
# Every warning is silenced to keep cell output clean; note that this also
# hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Reset matplotlib to its default style first, then apply the seaborn palette.
plt.style.use('default')
sns.set_palette("husl")

# Default figure size and font size for the matplotlib figures that follow.
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Echo the versions of the core libraries as a quick environment check.
print("✅ All libraries imported successfully!")
print(f"📊 Pandas version: {pd.__version__}")
print(f"🔢 NumPy version: {np.__version__}")
print(f"📈 Matplotlib version: {plt.matplotlib.__version__}")
print(f"🎨 Seaborn version: {sns.__version__}")

## 2. Data Loading and Initial Exploration {#data-loading}

In this section, we'll generate a synthetic sample migration dataset (a stand-in for real data loaded from a file) and perform an initial exploration to understand its structure and quality.

In [None]:
# For this demonstration, we'll create sample migration data
# In a real project, you would load your actual data using:
# df = pd.read_csv('../data/raw/migration_data.csv')

def create_sample_migration_data(seed=42):
    """Create sample migration data for demonstration purposes.

    One row is generated for every combination of month-end date
    (2020-01-31 through 2023-12-31), origin country and destination region.

    Args:
        seed: Seed passed to ``np.random.seed`` so that repeated calls return
            identical data. Defaults to 42.

    Returns:
        pandas.DataFrame with the columns ``date``, ``origin_country``,
        ``destination_region``, ``migration_count``, ``economic_index``,
        ``political_stability``, ``distance_km``, ``unemployment_rate`` and
        ``gdp_per_capita``.
    """
    # NOTE: this reseeds NumPy's global random state as a side effect.
    np.random.seed(seed)

    # Month-end dates. An offset object is used instead of the 'M' string
    # alias: recent pandas releases deprecate that alias in favour of 'ME',
    # while older releases do not accept 'ME'. The object works on both.
    dates = pd.date_range(start='2020-01-01', end='2023-12-31',
                          freq=pd.offsets.MonthEnd())

    # Countries and regions
    origins = ['United States', 'Mexico', 'Canada', 'Germany', 'France',
               'Italy', 'Spain', 'Brazil', 'Argentina', 'Japan']
    destinations = ['California', 'Texas', 'Florida', 'New York', 'Ontario',
                    'Bavaria', 'Île-de-France', 'São Paulo', 'Tokyo', 'Berlin']

    data = []
    for date in dates:
        # The seasonal multiplier depends only on the calendar month, so it is
        # computed once per date rather than once per origin/destination pair.
        seasonal_factor = 1 + 0.3 * np.sin(2 * np.pi * date.month / 12)
        for origin in origins:
            for dest in destinations:
                # Poisson base volume scaled by the seasonal multiplier and a
                # random noise factor centred on 1.
                base_migration = np.random.poisson(100)
                economic_factor = np.random.normal(1, 0.2)

                migration_count = int(base_migration * seasonal_factor * economic_factor)

                # The covariates below are drawn independently of
                # migration_count; they carry no built-in relationship to it.
                data.append({
                    'date': date,
                    'origin_country': origin,
                    'destination_region': dest,
                    # A negative noise factor could push the count below zero.
                    'migration_count': max(0, migration_count),
                    'economic_index': np.random.normal(100, 15),
                    'political_stability': np.random.uniform(1, 10),
                    'distance_km': np.random.uniform(500, 15000),
                    'unemployment_rate': np.random.uniform(2, 15),
                    'gdp_per_capita': np.random.uniform(15000, 80000)
                })

    return pd.DataFrame(data)

# Build the demonstration dataset used by every later cell.
df = create_sample_migration_data()

print(f"📊 Dataset shape: {df.shape}")
print(f"📅 Date range: {df['date'].min()} to {df['date'].max()}")
print(f"🌍 Countries: {df['origin_country'].nunique()}")
print(f"📍 Destinations: {df['destination_region'].nunique()}")

# Display basic information
print("\n" + "="*50)
print("DATASET OVERVIEW")
print("="*50)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()

print("\n" + "="*50)
print("FIRST 5 ROWS")
print("="*50)
# Last expression of the cell, so the notebook renders the preview table.
df.head()

## 3. Data Preprocessing {#preprocessing}

Before analysis, we need to check and prepare our data. This includes checking for missing values and duplicate rows, data type conversions, and feature engineering.

In [None]:
# Data quality checks: missing values, duplicate rows and memory footprint.
print("🔍 DATA QUALITY ASSESSMENT")
print("="*50)

# Per-column count of null cells.
missing_data = df.isnull().sum()
if not missing_data.any():
    print("✅ No missing values found")
else:
    # Only list the columns that actually contain nulls.
    print("❌ Missing values found:")
    print(missing_data[missing_data > 0])

# Number of rows that exactly repeat an earlier row.
duplicates = df.duplicated().sum()
print(f"\n🔄 Duplicate rows: {duplicates}")

# Deep memory usage (object columns included), converted from bytes to MB.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"\n💾 Memory usage before optimization: {memory_mb:.2f} MB")

# Make sure the date column has a datetime dtype before the .dt accessors
# are used for feature engineering.
df['date'] = pd.to_datetime(df['date'])

# Derive calendar, seasonal and ratio features from the raw columns.
print("\n🔧 FEATURE ENGINEERING")
print("="*50)

# Calendar components taken from the date column.
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['quarter'] = date_parts.quarter
df['day_of_year'] = date_parts.dayofyear

# Encode the month as a point on the unit circle so that December and
# January end up next to each other.
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# Migration count per 1000 km of distance.
distance_thousand_km = df['distance_km'] / 1000
df['migration_per_1000km'] = df['migration_count'] / distance_thousand_km

# Economic index rescaled around 1 and weighted by political stability.
economic_ratio = df['economic_index'] / 100
df['economic_stability'] = economic_ratio * df['political_stability']

# Categorize migration volume
def categorize_migration(count):
    """Bucket a migration count into 'Low', 'Medium' or 'High'.

    Counts below 50 are 'Low', counts from 50 up to (but excluding) 150 are
    'Medium', and everything else is 'High'.
    """
    return 'Low' if count < 50 else 'Medium' if count < 150 else 'High'

# Label every row with its Low/Medium/High volume bucket.
df['migration_category'] = df['migration_count'].apply(categorize_migration)

# Create regional aggregations
# Total migration per (year, month), and per origin country sorted descending.
monthly_migration = df.groupby(['year', 'month'])['migration_count'].sum().reset_index()
country_totals = df.groupby('origin_country')['migration_count'].sum().sort_values(ascending=False)

# This cell appends nine columns in total: year, month, quarter, day_of_year,
# month_sin, month_cos, migration_per_1000km, economic_stability and
# migration_category. Slicing the last eight would drop 'year' from the list.
n_new_features = 9

print("✅ Feature engineering completed!")
print(f"📊 New dataset shape: {df.shape}")
print(f"📈 New features created: {df.columns.tolist()[-n_new_features:]}")

# Display summary statistics
print("\n📈 SUMMARY STATISTICS")
print("="*50)
# Last expression of the cell, so the notebook renders the statistics table.
df.describe()

## 4. Exploratory Data Analysis {#eda}

Now let's explore the data through various visualizations to understand migration patterns, trends, and relationships.

In [None]:
# 1. Migration Trends Over Time
# A 2x2 grid: time trend, totals by origin, monthly averages, category shares.
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
(ax_trend, ax_countries), (ax_seasonal, ax_categories) = axes

# Top-left: total migration per date as a line chart.
monthly_trend = df.groupby('date')['migration_count'].sum()
ax_trend.plot(monthly_trend.index, monthly_trend.values, linewidth=2)
ax_trend.set_title('📈 Total Migration Over Time', fontsize=14, fontweight='bold')
ax_trend.set_xlabel('Date')
ax_trend.set_ylabel('Migration Count')
ax_trend.grid(True, alpha=0.3)

# Top-right: the (up to) ten origin countries with the largest totals.
top_countries = df.groupby('origin_country')['migration_count'].sum().nlargest(10)
ax_countries.barh(top_countries.index, top_countries.values)
ax_countries.set_title('🌍 Top Origin Countries', fontsize=14, fontweight='bold')
ax_countries.set_xlabel('Total Migration Count')

# Bottom-left: mean migration count for each calendar month.
seasonal_data = df.groupby('month')['migration_count'].mean()
ax_seasonal.bar(seasonal_data.index, seasonal_data.values, color='skyblue')
ax_seasonal.set_title('📅 Seasonal Migration Patterns', fontsize=14, fontweight='bold')
ax_seasonal.set_xlabel('Month')
ax_seasonal.set_ylabel('Average Migration Count')

# Bottom-right: share of rows in each Low/Medium/High volume category.
category_counts = df['migration_category'].value_counts()
ax_categories.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
ax_categories.set_title('📊 Migration Volume Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# 2. Interactive Migration Dashboard
print("\n🎯 INTERACTIVE VISUALIZATIONS")
print("="*50)

# Interactive line chart: one line per year across the months, built from
# the monthly_migration aggregate.
fig = px.line(
    monthly_migration,
    x='month',
    y='migration_count',
    color='year',
    title='Migration Trends by Year and Month',
)
fig.update_layout(height=500)
fig.show()

# Origin x destination matrix of summed migration counts; pairs that never
# occur are shown as 0.
flow_totals = df.pivot_table(
    values='migration_count',
    index='origin_country',
    columns='destination_region',
    aggfunc='sum',
)
migration_matrix = flow_totals.fillna(0)

plt.figure(figsize=(15, 10))
sns.heatmap(
    migration_matrix,
    annot=False,
    cmap='YlOrRd',
    cbar_kws={'label': 'Migration Count'},
)
plt.title('🗺️ Migration Flow Heatmap: Origin Countries vs Destination Regions',
          fontsize=16, fontweight='bold')
plt.xlabel('Destination Region')
plt.ylabel('Origin Country')
# Tilt the destination labels and keep the origin labels horizontal.
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# 3. Economic Factors Analysis
# Four scatter plots of migration count against one explanatory factor each.
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# (column, title, x-axis label, extra scatter kwargs), in row-major panel
# order. The first panel passes no colour and so uses the default.
scatter_panels = [
    ('economic_index', '💰 Economic Index vs Migration Count',
     'Economic Index', {}),
    ('political_stability', '🏛️ Political Stability vs Migration',
     'Political Stability Score', {'color': 'orange'}),
    ('distance_km', '📏 Distance vs Migration Count',
     'Distance (km)', {'color': 'green'}),
    ('unemployment_rate', '💼 Unemployment Rate vs Migration',
     'Unemployment Rate (%)', {'color': 'red'}),
]

for ax, (column, title, xlabel, style) in zip(axes.flat, scatter_panels):
    ax.scatter(df[column], df['migration_count'], alpha=0.6, **style)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Migration Count')

plt.tight_layout()
plt.show()

print("✅ Exploratory Data Analysis completed!")

## 5. Statistical Analysis {#statistical-analysis}

Let's perform statistical tests to understand relationships and patterns in the migration data.

In [None]:
# 1. Correlation Analysis
print("🔗 CORRELATION ANALYSIS")
print("="*50)

# Pairwise correlations between the target and the numeric factors.
numeric_cols = ['migration_count', 'economic_index', 'political_stability',
                'distance_km', 'unemployment_rate', 'gdp_per_capita']
correlation_matrix = df[numeric_cols].corr()

# Heatmap of the matrix; the upper triangle (diagonal included) is masked
# because it only mirrors the lower triangle.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.3f',
    cbar_kws={"shrink": .8},
)
plt.title('🔗 Correlation Matrix of Migration Factors', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# List each factor's correlation with migration_count, largest magnitude
# first, labelled by sign and by a coarse strength band.
print("\n📊 Key Correlations with Migration Count:")
migration_corr = (
    correlation_matrix['migration_count']
    .drop('migration_count')
    .sort_values(key=abs, ascending=False)
)
for factor, corr in migration_corr.items():
    if corr > 0:
        direction = "📈 Positive"
    else:
        direction = "📉 Negative"

    if abs(corr) > 0.5:
        strength = "Strong"
    elif abs(corr) > 0.3:
        strength = "Moderate"
    else:
        strength = "Weak"

    print(f"  {factor}: {corr:.3f} ({direction}, {strength})")

# 2. Statistical Tests
print(f"\n🧪 STATISTICAL TESTS")
print("="*50)

# Test for normality of migration data.
# The sample is capped at 5000 observations for large datasets, but must never
# exceed the number of rows available: Series.sample() without replacement
# raises ValueError when asked for more rows than exist (the sample dataset
# has fewer than 5000). A fixed random_state keeps the result reproducible.
shapiro_n = min(5000, len(df))
shapiro_sample = df['migration_count'].sample(n=shapiro_n, random_state=42)
statistic, p_value = stats.shapiro(shapiro_sample)
print(f"📊 Shapiro-Wilk Normality Test:")
print(f"  Statistic: {statistic:.4f}, p-value: {p_value:.4f}")
print(f"  Migration data is {'normally' if p_value > 0.05 else 'not normally'} distributed")

# ANOVA test - Migration differences across countries.
# One array of migration counts per origin country is passed to f_oneway.
print(f"\n🌍 ANOVA Test - Migration differences across countries:")
country_groups = [group['migration_count'].values for _, group in df.groupby('origin_country')]
f_stat, p_val = stats.f_oneway(*country_groups)
print(f"  F-statistic: {f_stat:.4f}, p-value: {p_val:.4f}")
print(f"  {'Significant' if p_val < 0.05 else 'No significant'} differences between countries")

# 3. Time Series Analysis
print(f"\n📈 TIME SERIES ANALYSIS")
print("="*50)

# Total migration per date, in chronological order.
monthly_ts = df.groupby('date')['migration_count'].sum().sort_index()

# Additive decomposition with a 12-observation seasonal period.
decomposition = seasonal_decompose(monthly_ts, model='additive', period=12)

# One stacked panel per component of the decomposition.
fig, axes = plt.subplots(4, 1, figsize=(15, 12))
component_panels = [
    (decomposition.observed, '📊 Original Time Series'),
    (decomposition.trend, '📈 Trend Component'),
    (decomposition.seasonal, '🔄 Seasonal Component'),
    (decomposition.resid, '🎲 Residual Component'),
]
for ax, (component, title) in zip(axes, component_panels):
    component.plot(ax=ax, title=title)

plt.suptitle('Time Series Decomposition of Migration Data', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Slope of a straight line fitted through the series against its position
# index (0, 1, 2, ...), i.e. the average change per observation.
trend_slope, _intercept = np.polyfit(range(len(monthly_ts)), monthly_ts.values, 1)
if trend_slope > 0:
    trend_direction = 'Increasing'
else:
    trend_direction = 'Decreasing'
print(f"📈 Overall trend: {trend_slope:.2f} migrations per month")
print(f"   Trend direction: {trend_direction}")

# 4. Outlier Detection
print(f"\n🎯 OUTLIER ANALYSIS")
print("="*50)

# Tukey fences: anything more than 1.5 * IQR outside the quartiles.
counts = df['migration_count']
Q1 = counts.quantile(0.25)
Q3 = counts.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

too_low = counts < lower_bound
too_high = counts > upper_bound
outliers = df[too_low | too_high]

outlier_share = len(outliers) / len(df) * 100
print(f"📊 Outliers detected: {len(outliers)} ({outlier_share:.1f}% of data)")
print(f"📏 IQR bounds: [{lower_bound:.0f}, {upper_bound:.0f}]")

# Show the five origin countries contributing the most outlier rows.
if not outliers.empty:
    print(f"🔍 Top outlier countries:")
    outlier_countries = outliers.groupby('origin_country')['migration_count'].count().nlargest(5)
    for country, count in outlier_countries.items():
        print(f"  {country}: {count} outlier records")

print("\n✅ Statistical analysis completed!")

## 6. Machine Learning for Migration Prediction {#ml-analysis}

Now we'll build machine learning models to predict migration patterns based on various factors.

In [None]:
# 1. Data Preparation for ML
print("🤖 MACHINE LEARNING ANALYSIS")
print("="*50)

# Work on a copy so the encoded columns do not leak into the main dataframe.
df_ml = df.copy()

# Integer-encode the two categorical columns, one encoder per column.
le_country = LabelEncoder()
le_destination = LabelEncoder()
for column, encoder in (('origin_country', le_country),
                        ('destination_region', le_destination)):
    df_ml[f'{column}_encoded'] = encoder.fit_transform(df_ml[column])

# Numeric and cyclical-month features followed by the encoded categoricals.
feature_columns = [
    'economic_index', 'political_stability', 'distance_km',
    'unemployment_rate', 'gdp_per_capita', 'month_sin', 'month_cos',
    'origin_country_encoded', 'destination_region_encoded',
]

X = df_ml[feature_columns]
y = df_ml['migration_count']

# Hold out 20% of the rows for testing (random split with a fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"📊 Training set size: {X_train.shape}")
print(f"📊 Test set size: {X_test.shape}")
print(f"🎯 Features used: {feature_columns}")

# 2. Model Training and Evaluation
# Candidate regressors, keyed by the display name used in all later output.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Per-model metrics, fitted estimator and test-set predictions.
results = {}

print(f"\n🎯 MODEL TRAINING AND EVALUATION")
print("="*50)

for name, model in models.items():
    print(f"\n🔄 Training {name}...")

    # The linear model is trained on the standardised features; the
    # tree-based models use the features on their original scale.
    if name == 'Linear Regression':
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    # Error and goodness-of-fit metrics on the held-out rows.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = dict([
        ('MSE', mse),
        ('RMSE', rmse),
        ('MAE', mae),
        ('R²', r2),
        ('Model', model),
        ('Predictions', y_pred),
    ])

    print(f"  📈 R² Score: {r2:.4f}")
    print(f"  📏 RMSE: {rmse:.2f}")
    print(f"  📊 MAE: {mae:.2f}")

# 3. Model Comparison
print(f"\n🏆 MODEL COMPARISON")
print("="*50)

# One row per model with its headline metrics.
comparison_df = pd.DataFrame([
    {
        'Model': model_name,
        'R² Score': model_result['R²'],
        'RMSE': model_result['RMSE'],
        'MAE': model_result['MAE'],
    }
    for model_name, model_result in results.items()
])

print(comparison_df.round(4))

# The best model is the one with the highest R² on the test set.
best_row = comparison_df['R² Score'].idxmax()
best_model_name = comparison_df.loc[best_row, 'Model']
best_model = results[best_model_name]['Model']
print(f"\n🥇 Best performing model: {best_model_name}")

# 4. Feature Importance (for tree-based models)
# Only the two tree ensembles expose feature_importances_, so this section is
# skipped when the linear model wins.
if best_model_name in ('Random Forest', 'Gradient Boosting'):
    print(f"\n🎯 FEATURE IMPORTANCE ({best_model_name})")
    print("="*50)

    # Pair each feature name with its importance and rank from most to least.
    importance_table = pd.DataFrame({
        'Feature': feature_columns,
        'Importance': best_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    print(feature_importance)

    # Horizontal bar chart of the ranked importances.
    plt.figure(figsize=(12, 8))
    sns.barplot(data=feature_importance, x='Importance', y='Feature')
    plt.title(f'Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()

# 5. Prediction vs Actual Plot
# One panel per trained model. The grid width follows the number of entries in
# `results` instead of a hard-coded 3, so adding or removing a model does not
# break the subplot layout; with three models the figure is still 15x5.
n_models = len(results)
plt.figure(figsize=(5 * n_models, 5))

# End points of the "perfect prediction" diagonal, shared by every panel.
actual_min, actual_max = y_test.min(), y_test.max()

for i, (name, result) in enumerate(results.items(), 1):
    plt.subplot(1, n_models, i)
    plt.scatter(y_test, result['Predictions'], alpha=0.6)
    # Dashed red line where predicted == actual.
    plt.plot([actual_min, actual_max], [actual_min, actual_max], 'r--', lw=2)
    plt.xlabel('Actual Migration Count')
    plt.ylabel('Predicted Migration Count')
    plt.title(f'{name}\nR² = {result["R²"]:.3f}')
    plt.grid(True, alpha=0.3)

plt.suptitle('Prediction vs Actual Migration Counts', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# 6. Clustering Analysis
print(f"\n🎯 CLUSTERING ANALYSIS")
print("="*50)

# Prepare data for clustering (using scaled features).
# A dedicated scaler is fitted here: calling fit_transform on the shared
# `scaler` would overwrite the statistics it learned from X_train and leave it
# unusable for transforming the regression features afterwards.
cluster_features = ['economic_index', 'political_stability', 'unemployment_rate', 'gdp_per_capita']
cluster_scaler = StandardScaler()
X_cluster = cluster_scaler.fit_transform(df[cluster_features])

# Determine optimal number of clusters: fit KMeans for k = 2..10 and record
# the inertia of each fit for the elbow plot.
inertias = []
K_range = range(2, 11)

for k in K_range:
    # n_init is set explicitly because its default differs between
    # scikit-learn releases; pinning it keeps results comparable.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_cluster)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(K_range, inertias, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.show()

# Perform clustering with optimal k (let's use k=4)
optimal_k = 4
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the cluster assignment comparable across versions.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_cluster)

# Add cluster labels to a copy of the dataframe.
df_clustered = df.copy()
df_clustered['cluster'] = cluster_labels

# Analyze clusters: mean of each clustering feature and of the migration count
# within every cluster.
print(f"🔍 Cluster Analysis (k={optimal_k}):")
cluster_summary = df_clustered.groupby('cluster')[cluster_features + ['migration_count']].mean()
print(cluster_summary.round(2))

# Visualize clusters
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: the scaled clustering features projected onto their first two
# principal components, coloured by cluster label.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster)

scatter = axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.6)
axes[0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
axes[0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
axes[0].set_title('Migration Clusters (PCA Visualization)')
plt.colorbar(scatter, ax=axes[0])

# Right panel: mean migration count per cluster. Bar colours are taken from
# the scatter plot's colour mapping so that each cluster has the same colour
# in both panels and the palette follows optimal_k instead of a fixed list of
# four colours.
cluster_migration = df_clustered.groupby('cluster')['migration_count'].mean()
bar_colors = scatter.to_rgba(cluster_migration.index.to_numpy())
axes[1].bar(cluster_migration.index, cluster_migration.values, color=bar_colors)
axes[1].set_xlabel('Cluster')
axes[1].set_ylabel('Average Migration Count')
axes[1].set_title('Average Migration by Cluster')

plt.tight_layout()
plt.show()

print(f"\n✅ Machine Learning analysis completed!")
print(f"🏆 Best model: {best_model_name} (R² = {results[best_model_name]['R²']:.4f})")

## 7. Results and Conclusions {#conclusions}

Let's summarize our key findings from the migration data analysis.

In [None]:
# Generate Final Summary Report
print("📋 MIGRATION ANALYSIS SUMMARY REPORT")
print("="*60)

# Headline figures: grand total, mean of the per-date totals, and the origin
# and destination with the largest summed migration counts.
total_migrations = df['migration_count'].sum()
per_date_totals = df.groupby('date')['migration_count'].sum()
avg_monthly_migration = per_date_totals.mean()
origin_totals = df.groupby('origin_country')['migration_count'].sum()
destination_totals = df.groupby('destination_region')['migration_count'].sum()
top_origin = origin_totals.idxmax()
top_destination = destination_totals.idxmax()

first_date = df['date'].min()
last_date = df['date'].max()

print(f"📊 OVERALL STATISTICS")
print(f"   Total migrations analyzed: {total_migrations:,}")
print(f"   Average monthly migration: {avg_monthly_migration:.0f}")
print(f"   Date range: {first_date.strftime('%Y-%m-%d')} to {last_date.strftime('%Y-%m-%d')}")
print(f"   Countries analyzed: {df['origin_country'].nunique()}")
print(f"   Destination regions: {df['destination_region'].nunique()}")

print(f"\n🏆 TOP PERFORMERS")
print(f"   Highest origin country: {top_origin}")
print(f"   Most popular destination: {top_destination}")

# Correlation of three explanatory factors with the migration count.
migration_target = df['migration_count']
economic_corr = df['economic_index'].corr(migration_target)
political_corr = df['political_stability'].corr(migration_target)
distance_corr = df['distance_km'].corr(migration_target)

print(f"\n💼 KEY CORRELATIONS")
print(f"   Economic Index: {economic_corr:.3f}")
print(f"   Political Stability: {political_corr:.3f}")
print(f"   Distance: {distance_corr:.3f}")

# Model performance is reported only when the modelling cell has been run
# and therefore defined `results`.
if 'results' in locals():
    best_r2 = max(model_result['R²'] for model_result in results.values())
    print(f"\n🤖 MACHINE LEARNING RESULTS")
    print(f"   Best model R² score: {best_r2:.4f}")
    print(f"   Best model: {best_model_name}")

# Calendar months with the highest and lowest mean migration count.
monthly_average = df.groupby('month')['migration_count'].mean()
peak_month = monthly_average.idxmax()
low_month = monthly_average.idxmin()

print(f"\n📅 SEASONAL PATTERNS")
print(f"   Peak migration month: {peak_month}")
print(f"   Lowest migration month: {low_month}")

# NOTE: the insight and recommendation lines below are fixed text; they are
# not derived from the statistics computed in this notebook.
print(f"\n🎯 KEY INSIGHTS")
print("="*60)
key_insights = (
    "1. 📈 Migration patterns show clear seasonal variations",
    "2. 💰 Economic factors have moderate correlation with migration",
    "3. 🌍 Geographic distance influences migration decisions",
    "4. 🏛️ Political stability affects migration flows",
    "5. 🤖 Machine learning models can predict migration with reasonable accuracy",
)
for insight in key_insights:
    print(insight)

print(f"\n🔮 RECOMMENDATIONS FOR FUTURE ANALYSIS")
print("="*60)
recommendations = (
    "1. 📊 Collect more granular temporal data (daily/weekly)",
    "2. 🌐 Include additional geographic and demographic factors",
    "3. 📰 Incorporate news sentiment and political events",
    "4. 🏢 Add economic policy changes as features",
    "5. 🔄 Implement real-time prediction systems",
    "6. 📱 Develop interactive dashboards for stakeholders",
)
for recommendation in recommendations:
    print(recommendation)

print(f"\n📁 DATA EXPORT")
print("="*60)

import json

# Everything is written to ../reports/, which is created if it is missing.
output_dir = Path('../reports/')
output_dir.mkdir(exist_ok=True)
dataset_path = output_dir / 'migration_analysis_dataset.csv'
summary_path = output_dir / 'analysis_summary.json'

# Export the full dataframe, engineered columns included, without the index.
df.to_csv(dataset_path, index=False)
print(f"✅ Dataset exported to: {dataset_path}")

# Headline statistics; NumPy scalars are converted to plain Python numbers
# so that json can serialise them.
correlation_summary = {
    'economic_index': float(economic_corr),
    'political_stability': float(political_corr),
    'distance_km': float(distance_corr)
}
summary_stats = {
    'total_migrations': int(total_migrations),
    'avg_monthly_migration': float(avg_monthly_migration),
    'top_origin_country': top_origin,
    'top_destination': top_destination,
    'analysis_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'correlations': correlation_summary
}

with summary_path.open('w') as f:
    json.dump(summary_stats, f, indent=2)
print(f"✅ Summary exported to: {summary_path}")

# Model metrics are exported only when the modelling cell has been run and
# therefore defined `results`.
if 'results' in locals():
    model_results = {}
    for name, result in results.items():
        model_results[name] = {
            'R2': float(result['R²']),
            'RMSE': float(result['RMSE']),
            'MAE': float(result['MAE'])
        }

    model_results_path = output_dir / 'model_results.json'
    with model_results_path.open('w') as f:
        json.dump(model_results, f, indent=2)
    print(f"✅ Model results exported to: {model_results_path}")

print(f"\n🎉 ANALYSIS COMPLETE!")
print("="*60)
print("Thank you for using the Migration Data Analysis notebook!")
print("For questions or improvements, please refer to the project documentation.")