# Weather Impact on Food Delivery - Free API Data Collection

This notebook uses completely free APIs and public datasets:
1. Open-Meteo API for weather data (no API key required)
2. Yelp Academic Dataset for restaurant data
3. Public food delivery datasets from Kaggle
4. Synthetic data generation for missing components

In [1]:
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
import json
import time
from tqdm import tqdm

## 1. Weather Data Collection using Open-Meteo API

Open-Meteo is a free weather API that doesn't require an API key

In [6]:
def get_weather_data(latitude, longitude, start_date, end_date):
    """Collect weather data from National Weather Service API"""
    # First, get the grid endpoint for the location
    point_url = f"https://api.weather.gov/points/{latitude},{longitude}"
    
    try:
        # Get grid endpoint
        response = requests.get(point_url, headers={'User-Agent': 'WeatherStudyProject'})
        response.raise_for_status()
        grid_data = response.json()
        
        # Get the forecast grid endpoint
        forecast_url = grid_data['properties']['forecastGridData']
        
        # Get weather data
        response = requests.get(forecast_url, headers={'User-Agent': 'WeatherStudyProject'})
        response.raise_for_status()
        weather_data = response.json()
        
        # Extract relevant weather properties
        properties = weather_data['properties']
        
        # Create a DataFrame with hourly data
        data = {
            'temperature': properties['temperature']['values'],
            'precipitation': properties['probabilityOfPrecipitation']['values'],
            'humidity': properties['relativeHumidity']['values'],
            'windSpeed': properties['windSpeed']['values']
        }
        
        df = pd.DataFrame()
        
        # Process each weather parameter
        for param, values in data.items():
            temp_df = pd.DataFrame(values)
            temp_df['parameter'] = param
            temp_df['validTime'] = pd.to_datetime(temp_df['validTime'].str.split('/').str[0])
            temp_df = temp_df.pivot(index='validTime', columns='parameter', values='value')
            
            if df.empty:
                df = temp_df
            else:
                df = df.join(temp_df)
        
        return df
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching weather data for coordinates {latitude},{longitude}: {e}")
        return None

# Define major US cities with their coordinates
cities = {
    'New York': {'lat': 40.7128, 'lon': -74.0060},
    'Los Angeles': {'lat': 34.0522, 'lon': -118.2437},
    'Chicago': {'lat': 41.8781, 'lon': -87.6298},
    'Houston': {'lat': 29.7604, 'lon': -95.3698},
    'Phoenix': {'lat': 33.4484, 'lon': -112.0740}
}

# Collect weather data for each city
weather_data = []
for city, coords in tqdm(cities.items()):
    print(f"\nCollecting data for {city}")
    df = get_weather_data(coords['lat'], coords['lon'], None, None)
    
    if df is not None:
        df['city'] = city
        weather_data.append(df)
    
    time.sleep(2)  

if weather_data:
    weather_df = pd.concat(weather_data, ignore_index=True)
    print("\nSaving weather data...")
    weather_df.to_csv('../data/weather_data.csv', index=False)
    print("Weather data saved successfully!")

  0%|          | 0/5 [00:00<?, ?it/s]


Collecting data for New York


 20%|██        | 1/5 [00:02<00:11,  2.92s/it]


Collecting data for Los Angeles


 40%|████      | 2/5 [00:05<00:07,  2.62s/it]


Collecting data for Chicago


 60%|██████    | 3/5 [00:07<00:04,  2.48s/it]


Collecting data for Houston


 80%|████████  | 4/5 [00:10<00:02,  2.63s/it]


Collecting data for Phoenix


100%|██████████| 5/5 [00:13<00:00,  2.63s/it]


Saving weather data...
Weather data saved successfully!





In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data and examine structure
print("Loading data and examining structure...")
weather_df = pd.read_csv('../data/weather_data.csv')
restaurant_df = pd.read_csv('../data/restaurant_data.csv')

print("\nWeather Data Columns:")
print(weather_df.columns.tolist())
print("\nFirst few rows of weather data:")
print(weather_df.head())

print("\nRestaurant Data Columns:")
print(restaurant_df.columns.tolist())
print("\nFirst few rows of restaurant data:")
print(restaurant_df.head())



### check data quality 

Loading data and examining structure...

Weather Data Columns:
['temperature', 'precipitation', 'humidity', 'windSpeed', 'city']

First few rows of weather data:
   temperature  precipitation  humidity  windSpeed      city
0     1.666667            0.0      56.0     22.224  New York
1     2.222222            NaN      59.0     24.076  New York
2     2.777778            NaN      61.0     25.928  New York
3     3.333333            NaN      57.0        NaN  New York
4     3.888889            NaN      54.0        NaN  New York

Restaurant Data Columns:
['restaurant_id', 'name', 'city', 'cuisine', 'latitude', 'longitude', 'takeaway', 'delivery', 'opening_hours']

First few rows of restaurant data:
   restaurant_id               name      city  \
0      296568074     The Brass Rail  New York   
1      305499273       Court Street  New York   
2      357620442          Sam Sunny  New York   
3      380044344            Pedro's  New York   
4      410235438  Trudy's Ice Cream  New York   

    

## 2. Restaurant Data Collection

Using publicly available restaurant data and generating synthetic delivery metrics

In [7]:
# Define major US cities with their coordinates
cities = {
    'New York': {'lat': 40.7128, 'lon': -74.0060},
    'Los Angeles': {'lat': 34.0522, 'lon': -118.2437},
    'Chicago': {'lat': 41.8781, 'lon': -87.6298},
    'Houston': {'lat': 29.7604, 'lon': -95.3698},
    'Phoenix': {'lat': 33.4484, 'lon': -112.0740}
}

def get_restaurant_data(cities):
    """Collect restaurant data using OpenStreetMap Overpass API (no key needed)"""
    import requests
    import time
    from tqdm import tqdm
    import pandas as pd
    
    # Overpass API endpoint
    overpass_url = "https://overpass-api.de/api/interpreter"
    restaurants = []
    
    for city, coords in tqdm(cities.items()):
        # Create a bounding box around the city coordinates (roughly 10km radius)
        lat, lon = coords['lat'], coords['lon']
        radius = 0.1  # roughly 10km in degrees
        bbox = f"{lat-radius},{lon-radius},{lat+radius},{lon+radius}"
        
        # Overpass query to get restaurants
        overpass_query = f"""
        [out:json][timeout:25];
        (
          node["amenity"="restaurant"]({bbox});
          way["amenity"="restaurant"]({bbox});
          relation["amenity"="restaurant"]({bbox});
        );
        out body;
        >;
        out skel qt;
        """
        
        try:
            response = requests.post(overpass_url, data={"data": overpass_query})
            response.raise_for_status()
            data = response.json()
            
            for element in data.get('elements', []):
                if element.get('type') == 'node':  # Only process node elements for simplicity
                    tags = element.get('tags', {})
                    restaurant = {
                        'restaurant_id': str(element['id']),
                        'name': tags.get('name', 'Unknown'),
                        'city': city,
                        'cuisine': tags.get('cuisine', 'Unknown'),
                        'latitude': element['lat'],
                        'longitude': element['lon'],
                        'takeaway': tags.get('takeaway', 'Unknown'),
                        'delivery': tags.get('delivery', 'Unknown'),
                        'opening_hours': tags.get('opening_hours', 'Unknown')
                    }
                    restaurants.append(restaurant)
            
            # Be nice to the API
            time.sleep(2)
            
        except Exception as e:
            print(f"Error fetching data for {city}: {e}")
            continue
    
    return pd.DataFrame(restaurants)

# Get restaurant data
print("Collecting restaurant data from OpenStreetMap...")
restaurant_df = get_restaurant_data(cities)

# Clean up the data
restaurant_df = restaurant_df[restaurant_df['name'] != 'Unknown']  # Remove unnamed restaurants

# Save to CSV
print("\nSaving restaurant data...")
restaurant_df.to_csv('../data/restaurant_data.csv', index=False)
print("Restaurant data saved successfully!")

# Display some statistics
print("\nRestaurant Data Summary:")
print(f"Total restaurants: {len(restaurant_df)}")
print("\nRestaurants per city:")
print(restaurant_df['city'].value_counts())
print("\nCuisine types:")
print(restaurant_df['cuisine'].value_counts().head())

Collecting restaurant data from OpenStreetMap...


100%|██████████| 5/5 [00:21<00:00,  4.20s/it]


Saving restaurant data...
Restaurant data saved successfully!

Restaurant Data Summary:
Total restaurants: 9022

Restaurants per city:
city
New York       5475
Chicago        1721
Los Angeles    1065
Houston         455
Phoenix         306
Name: count, dtype: int64

Cuisine types:
cuisine
Unknown    2181
mexican     653
pizza       566
italian     482
chinese     471
Name: count, dtype: int64





## Merge Data

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# 1. Load data
print("Loading data...")
weather_df = pd.read_csv('../data/weather_data.csv')
restaurant_df = pd.read_csv('../data/restaurant_data.csv')

# 2. Clean and prepare data
def clean_restaurant_data(df):
    df = df.copy()
    
    # Convert delivery/takeaway to binary
    df['has_delivery'] = df['delivery'].map({'yes': 1, 'no': 0, 'Unknown': 0})
    df['has_takeaway'] = df['takeaway'].map({'yes': 1, 'no': 0, 'Unknown': 0})
    
    # Clean cuisine
    df['cuisine'] = df['cuisine'].fillna('Unknown')
    
    # Extract opening hours where available
    df['has_opening_hours'] = df['opening_hours'].ne('Unknown').astype(int)
    
    return df

def clean_weather_data(df):
    df = df.copy()
    
    # Convert temperature to Fahrenheit if it's in Celsius
    if df['temperature'].mean() < 10:  # Likely Celsius
        df['temperature_F'] = df['temperature'] * 9/5 + 32
    else:
        df['temperature_F'] = df['temperature']
    
    # Fill missing values
    df['precipitation'] = df['precipitation'].fillna(0)
    df['windSpeed'] = df['windSpeed'].fillna(df['windSpeed'].mean())
    df['humidity'] = df['humidity'].fillna(df['humidity'].mean())
    
    return df

# Clean the data
print("\nCleaning data...")
weather_clean = clean_weather_data(weather_df)
restaurant_clean = clean_restaurant_data(restaurant_df)

# 3. Create master dataset
def create_master_dataset(weather_df, restaurant_df):
    """Merge weather and restaurant data based on city"""
    
    # Create master data
    master_data = []
    
    # Get current timestamp for reference
    current_time = datetime.now()
    
    for city in weather_df['city'].unique():
        # Get weather data for this city
        city_weather = weather_df[weather_df['city'] == city]
        # Get restaurants in this city
        city_restaurants = restaurant_df[restaurant_df['city'] == city]
        
        # For each weather record
        for _, weather in city_weather.iterrows():
            # For each restaurant in the city
            for _, restaurant in city_restaurants.iterrows():
                master_data.append({
                    # Restaurant info
                    'restaurant_id': restaurant['restaurant_id'],
                    'restaurant_name': restaurant['name'],
                    'city': city,
                    'cuisine': restaurant['cuisine'],
                    'latitude': restaurant['latitude'],
                    'longitude': restaurant['longitude'],
                    'has_delivery': restaurant['has_delivery'],
                    'has_takeaway': restaurant['has_takeaway'],
                    'has_opening_hours': restaurant['has_opening_hours'],
                    
                    # Weather info
                    'temperature_F': weather['temperature_F'],
                    'precipitation': weather['precipitation'],
                    'humidity': weather['humidity'],
                    'wind_speed': weather['windSpeed']
                })
    
    return pd.DataFrame(master_data)

# Create master dataset
print("Creating master dataset...")
master_df = create_master_dataset(weather_clean, restaurant_clean)

# Save master dataset
print("\nSaving master dataset...")
master_df.to_csv('../data/master_delivery_data.csv', index=False)

# 4. Basic Analysis
print("\nGenerating basic analysis...")

# Restaurant distribution
plt.figure(figsize=(10, 6))
master_df['city'].value_counts().plot(kind='bar')
plt.title('Number of Restaurants by City')
plt.xlabel('City')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../data/restaurant_distribution.png')
plt.close()

# Temperature distribution
plt.figure(figsize=(10, 6))
sns.boxplot(x='city', y='temperature_F', data=master_df)
plt.title('Temperature Distribution by City')
plt.xlabel('City')
plt.ylabel('Temperature (°F)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../data/temperature_distribution.png')
plt.close()

# Cuisine distribution
plt.figure(figsize=(12, 6))
master_df['cuisine'].value_counts().head(10).plot(kind='bar')
plt.title('Top 10 Cuisine Types')
plt.xlabel('Cuisine')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../data/cuisine_distribution.png')
plt.close()

# Generate summary statistics
summary_stats = {
    'Total Records': len(master_df),
    'Unique Restaurants': master_df['restaurant_id'].nunique(),
    'Cities Covered': master_df['city'].nunique(),
    'Average Temperature': f"{master_df['temperature_F'].mean():.1f}°F",
    'Restaurants with Delivery': f"{(master_df['has_delivery'] == 1).sum()} ({(master_df['has_delivery'] == 1).mean()*100:.1f}%)",
    'Most Common Cuisine': master_df['cuisine'].mode()[0]
}

# Save summary
with open('../data/master_data_summary.txt', 'w') as f:
    f.write("Master Dataset Summary\n")
    f.write("====================\n\n")
    for key, value in summary_stats.items():
        f.write(f"{key}: {value}\n")

print("\nMaster dataset created successfully!")
print("Files generated:")
print("1. master_delivery_data.csv - Complete merged dataset")
print("2. restaurant_distribution.png - Restaurant distribution visualization")
print("3. temperature_distribution.png - Temperature distribution visualization")
print("4. cuisine_distribution.png - Cuisine distribution visualization")
print("5. master_data_summary.txt - Summary statistics")

# Display key statistics
print("\nKey Statistics:")
for key, value in summary_stats.items():
    print(f"{key}: {value}")

Loading data...

Cleaning data...
Creating master dataset...

Saving master dataset...

Generating basic analysis...

Master dataset created successfully!
Files generated:
1. master_delivery_data.csv - Complete merged dataset
2. restaurant_distribution.png - Restaurant distribution visualization
3. temperature_distribution.png - Temperature distribution visualization
4. cuisine_distribution.png - Cuisine distribution visualization
5. master_data_summary.txt - Summary statistics

Key Statistics:
Total Records: 1063620
Unique Restaurants: 9022
Cities Covered: 5
Average Temperature: 39.7°F
Restaurants with Delivery: 124865 (11.7%)
Most Common Cuisine: Unknown


In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the master dataset
print("Loading master dataset...")
master_df = pd.read_csv('../data/master_delivery_data.csv')

# Create weather-based delivery analysis directory
import os
if not os.path.exists('../weather_delivery_analysis'):
    os.makedirs('../weather_delivery_analysis')

# Set figure style
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 100
plt.rcParams['axes.grid'] = True

# 1. Temperature Impact on Delivery
plt.figure()
master_df['temp_category'] = pd.cut(
    master_df['temperature_F'],
    bins=[0, 32, 50, 68, 86, 100],
    labels=['Very Cold', 'Cold', 'Moderate', 'Warm', 'Hot']
)
sns.boxplot(data=master_df, x='temp_category', y='has_delivery')
plt.title('Temperature Impact on Delivery Service Usage', pad=20)
plt.xlabel('Temperature Category')
plt.ylabel('Delivery Service Usage Rate')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../weather_delivery_analysis/1_temp_impact.png')
plt.close()

# 2. Precipitation vs Delivery
plt.figure()
master_df['rain_category'] = pd.cut(
    master_df['precipitation'],
    bins=[-np.inf, 0, 0.1, 0.3, np.inf],
    labels=['No Rain', 'Light Rain', 'Moderate Rain', 'Heavy Rain']
)
sns.barplot(data=master_df, x='rain_category', y='has_delivery')
plt.title('Precipitation Impact on Delivery Service', pad=20)
plt.xlabel('Precipitation Level')
plt.ylabel('Delivery Service Usage Rate')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../weather_delivery_analysis/2_rain_impact.png')
plt.close()

# 3. Wind Speed Effect
plt.figure()
master_df['wind_category'] = pd.cut(
    master_df['wind_speed'],
    bins=[0, 5, 10, 15, np.inf],
    labels=['Light', 'Moderate', 'Strong', 'Very Strong']
)
sns.barplot(data=master_df, x='wind_category', y='has_delivery')
plt.title('Wind Speed Impact on Delivery Service', pad=20)
plt.xlabel('Wind Speed Category')
plt.ylabel('Delivery Service Usage Rate')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../weather_delivery_analysis/3_wind_impact.png')
plt.close()

# 4. Weather Conditions Matrix
plt.figure(figsize=(15, 10))
weather_delivery_matrix = pd.crosstab(
    master_df['temp_category'],
    master_df['rain_category'],
    values=master_df['has_delivery'],
    aggfunc='mean'
)
sns.heatmap(weather_delivery_matrix, annot=True, cmap='YlOrRd', fmt='.2f')
plt.title('Delivery Service Usage by Temperature and Precipitation', pad=20)
plt.tight_layout()
plt.savefig('../weather_delivery_analysis/4_weather_matrix.png')
plt.close()

# 5. Cuisine-specific Weather Impact
plt.figure(figsize=(15, 8))
top_cuisines = master_df['cuisine'].value_counts().head(5).index
cuisine_weather = master_df[master_df['cuisine'].isin(top_cuisines)]
sns.boxplot(data=cuisine_weather, x='cuisine', y='has_delivery', hue='temp_category')
plt.title('Weather Impact on Delivery by Cuisine Type', pad=20)
plt.xlabel('Cuisine Type')
plt.ylabel('Delivery Service Usage Rate')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.savefig('../weather_delivery_analysis/5_cuisine_weather.png')
plt.close()

# 6. Weather Impact by City
plt.figure(figsize=(15, 8))
sns.boxplot(data=master_df, x='city', y='has_delivery', hue='temp_category')
plt.title('Weather Impact on Delivery by City', pad=20)
plt.xlabel('City')
plt.ylabel('Delivery Service Usage Rate')
plt.xticks(rotation=45)
plt.legend(bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.savefig('../weather_delivery_analysis/6_city_weather.png')
plt.close()

# 7. Combined Weather Effects
plt.figure(figsize=(12, 8))
pivot_table = pd.pivot_table(
    master_df, 
    values='has_delivery',
    index='temp_category',
    columns='rain_category',
    aggfunc='mean'
)
sns.heatmap(pivot_table, annot=True, cmap='coolwarm', center=0.5, fmt='.2f')
plt.title('Combined Weather Effects on Delivery Service', pad=20)
plt.tight_layout()
plt.savefig('../weather_delivery_analysis/7_combined_weather.png')
plt.close()

# 8. Weather Correlation Analysis
plt.figure(figsize=(10, 8))
weather_cols = ['temperature_F', 'precipitation', 'humidity', 'wind_speed', 'has_delivery']
correlation = master_df[weather_cols].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Weather Factors Correlation with Delivery', pad=20)
plt.tight_layout()
plt.savefig('../weather_delivery_analysis/8_correlation.png')
plt.close()

# Generate statistical analysis
weather_stats = {
    'Temperature Effect': {
        'correlation': master_df['temperature_F'].corr(master_df['has_delivery']),
        'highest_delivery_temp': master_df.groupby('temp_category')['has_delivery'].mean().idxmax(),
        'lowest_delivery_temp': master_df.groupby('temp_category')['has_delivery'].mean().idxmin()
    },
    'Precipitation Effect': {
        'correlation': master_df['precipitation'].corr(master_df['has_delivery']),
        'rain_vs_no_rain': (
            master_df[master_df['precipitation'] > 0]['has_delivery'].mean() /
            master_df[master_df['precipitation'] == 0]['has_delivery'].mean()
        )
    },
    'Wind Effect': {
        'correlation': master_df['wind_speed'].corr(master_df['has_delivery']),
        'optimal_wind_speed': master_df.groupby('wind_category')['has_delivery'].mean().idxmax()
    }
}

# Save analysis results
with open('../weather_delivery_analysis/weather_impact_analysis.txt', 'w') as f:
    f.write("Weather Impact on Food Delivery Analysis\n")
    f.write("=====================================\n\n")
    
    f.write("Temperature Effects:\n")
    f.write(f"- Correlation with delivery: {weather_stats['Temperature Effect']['correlation']:.3f}\n")
    f.write(f"- Highest delivery rate: {weather_stats['Temperature Effect']['highest_delivery_temp']}\n")
    f.write(f"- Lowest delivery rate: {weather_stats['Temperature Effect']['lowest_delivery_temp']}\n\n")
    
    f.write("Precipitation Effects:\n")
    f.write(f"- Correlation with delivery: {weather_stats['Precipitation Effect']['correlation']:.3f}\n")
    f.write(f"- Rain vs No Rain delivery ratio: {weather_stats['Precipitation Effect']['rain_vs_no_rain']:.2f}\n\n")
    
    f.write("Wind Effects:\n")
    f.write(f"- Correlation with delivery: {weather_stats['Wind Effect']['correlation']:.3f}\n")
    f.write(f"- Optimal wind conditions: {weather_stats['Wind Effect']['optimal_wind_speed']}\n")

print("Weather-based delivery analysis complete! Check the 'weather_delivery_analysis' folder for:")
print("1. Temperature impact visualization")
print("2. Precipitation impact visualization")
print("3. Wind speed impact visualization")
print("4. Weather conditions matrix")
print("5. Cuisine-specific weather impact")
print("6. City-specific weather impact")
print("7. Combined weather effects")
print("8. Weather correlation analysis")
print("9. Detailed statistical analysis in 'weather_impact_analysis.txt'")

Loading master dataset...


  pivot_table = pd.pivot_table(
  'highest_delivery_temp': master_df.groupby('temp_category')['has_delivery'].mean().idxmax(),
  'lowest_delivery_temp': master_df.groupby('temp_category')['has_delivery'].mean().idxmin()


Weather-based delivery analysis complete! Check the 'weather_delivery_analysis' folder for:
1. Temperature impact visualization
2. Precipitation impact visualization
3. Wind speed impact visualization
4. Weather conditions matrix
5. Cuisine-specific weather impact
6. City-specific weather impact
7. Combined weather effects
8. Weather correlation analysis
9. Detailed statistical analysis in 'weather_impact_analysis.txt'


  'optimal_wind_speed': master_df.groupby('wind_category')['has_delivery'].mean().idxmax()
