In [23]:
import numpy as np
import pandas as pd

In [24]:
# Load the per-location count records (start_time, location, count) for inspection.
df=pd.read_csv('location_count_with_start_time.csv')

In [25]:
# Preview the loaded frame; the rendered output is truncated to its first and last rows.
df

Unnamed: 0,start_time,location,count
0,2025-08-27 00:00:00,Admin Lobby,2
1,2025-08-27 00:00:00,Auditorium,3
2,2025-08-27 00:00:00,Hostel,362
3,2025-08-27 00:00:00,LAB_102,1
4,2025-08-27 00:00:00,Lab,2
...,...,...,...
44942,2025-09-25 23:45:00,Auditorium,13
44943,2025-09-25 23:45:00,Admin Lobby,685
44944,2025-09-25 23:45:00,Seminar Room,5
44945,2025-09-25 23:45:00,LAB_101,5


In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

def prepare_features(df):
    """
    Derive calendar and time-of-day features from the ``start_time`` column.

    No location encoding is produced because a separate model is trained
    per location.

    Parameters:
    - df: DataFrame with a ``start_time`` column that ``pd.to_datetime``
      can parse. The input frame is NOT modified.

    Returns:
    - A new DataFrame holding the original columns (with ``start_time``
      converted to datetime) plus: year, month, day, hour, minute,
      day_of_week, day_of_month, is_weekend, week_of_year, time_in_minutes,
      time_period, and one ``period_<label>`` indicator column per label
      (Night, Morning, Afternoon, Evening).
    """
    print("  Preparing features...")

    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()
    df['start_time'] = pd.to_datetime(df['start_time'])

    # Time features
    df['year'] = df['start_time'].dt.year
    df['month'] = df['start_time'].dt.month
    df['day'] = df['start_time'].dt.day
    df['hour'] = df['start_time'].dt.hour
    df['minute'] = df['start_time'].dt.minute
    df['day_of_week'] = df['start_time'].dt.dayofweek
    df['day_of_month'] = df['start_time'].dt.day
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['week_of_year'] = df['start_time'].dt.isocalendar().week
    df['time_in_minutes'] = df['hour'] * 60 + df['minute']

    # Time period categories.
    # Left-closed bins: [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon,
    # [18, 24) Evening. The default right-closed bins would put hours 6, 12
    # and 18 into the *earlier* period, which disagrees with the
    # `hour < 6 / 12 / 18` thresholds used when building prediction features.
    df['time_period'] = pd.cut(df['hour'],
                               bins=[0, 6, 12, 18, 24],
                               labels=['Night', 'Morning', 'Afternoon', 'Evening'],
                               right=False)

    # One-hot encode time_period (categorical input, so all four columns
    # are emitted even when a period has no rows).
    time_period_dummies = pd.get_dummies(df['time_period'], prefix='period')
    df = pd.concat([df, time_period_dummies], axis=1)

    return df


def train_location_specific_models(csv_file, min_samples=50,
                                   model_file='location_specific_models.pkl',
                                   random_state=42):
    """
    Train a separate Random Forest model for EACH location using only that
    location's data, report hold-out metrics, and pickle the result.

    Parameters:
    - csv_file: Path to a CSV file with `start_time`, `location` and `count`
      columns.
    - min_samples: Minimum samples needed to train a model for a location;
      locations with fewer rows are skipped.
    - model_file: Path the dictionary of models is pickled to.
    - random_state: Seed used for both the train/test split and the forest.

    Returns:
    - Dictionary keyed by location. Each value holds the fitted `model`,
      the `feature_cols` it was trained on, the number of `samples`, and the
      hold-out metrics `mae`, `rmse`, `r2` and `acc_10` (percentage of test
      rows predicted within ±10 of the true count).
    """
    banner = "=" * 80

    print(banner)
    print("LOCATION-SPECIFIC RANDOM FOREST MODELS")
    print(banner)
    print("Training separate model for EACH location using only its own data\n")

    # Load data
    df = pd.read_csv(csv_file)
    print(f"✓ Loaded {len(df):,} total records")

    # Prepare features
    df = prepare_features(df)

    # Feature columns (no location dummies needed!)
    feature_cols = ['year', 'month', 'day', 'hour', 'minute', 'day_of_week',
                    'day_of_month', 'is_weekend', 'week_of_year', 'time_in_minutes']
    feature_cols.extend(col for col in df.columns if col.startswith('period_'))

    print(f"✓ Using {len(feature_cols)} features per model\n")

    # Get all locations
    all_locations = df['location'].unique()
    print(f"✓ Found {len(all_locations)} locations\n")

    # Train separate model for each location
    models = {}
    location_stats = []

    print(banner)
    print("TRAINING MODELS...")
    print(banner)

    for location in sorted(all_locations):
        print(f"\n📍 {location}")
        print("-" * 40)

        # Filter data for THIS location only
        location_df = df[df['location'] == location].copy()
        n_samples = len(location_df)

        if n_samples < min_samples:
            print(f"  ⚠️  Skipped: Only {n_samples} samples (need {min_samples}+)")
            continue

        print(f"  ✓ Training with {n_samples:,} samples from {location}")

        # Prepare data
        X = location_df[feature_cols]
        y = location_df['count']

        # Smaller datasets get a larger hold-out share so the test metrics
        # are not computed on a handful of rows.
        test_size = 0.3 if n_samples < 100 else 0.2

        # NOTE(review): this is a shuffled split on time-ordered data, so
        # neighbouring time slots can land on both sides of the split and the
        # reported metrics are probably optimistic for true forecasting.
        # Consider a chronological hold-out — confirm with the model owner.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Train model
        model = RandomForestRegressor(
            n_estimators=100,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=random_state,
            n_jobs=-1,
            verbose=0
        )
        model.fit(X_train, y_train)

        # Evaluate
        y_pred_test = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        r2 = r2_score(y_test, y_pred_test)

        # Accuracy metrics: share of test rows within ±5 / ±10 / ±20 of truth.
        abs_error = np.abs(y_test - y_pred_test)
        acc_5, acc_10, acc_20 = (
            (abs_error <= tolerance).mean() * 100 for tolerance in (5, 10, 20)
        )

        print(f"  ✓ Test MAE: {mae:.1f} | RMSE: {rmse:.1f} | R²: {r2:.3f}")
        print(f"  ✓ Accuracy: ±5: {acc_5:.1f}% | ±10: {acc_10:.1f}% | ±20: {acc_20:.1f}%")

        # Store model (each entry gets its own copy of the column list so
        # editing one entry cannot silently change the others).
        models[location] = {
            'model': model,
            'feature_cols': list(feature_cols),
            'samples': n_samples,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'acc_10': acc_10
        }

        location_stats.append({
            'Location': location,
            'Samples': n_samples,
            'MAE': f"{mae:.1f}",
            'RMSE': f"{rmse:.1f}",
            'R²': f"{r2:.3f}",
            'Acc ±10': f"{acc_10:.1f}%"
        })

    # Summary
    print("\n" + banner)
    print("MODEL TRAINING SUMMARY")
    print(banner)

    if location_stats:
        stats_df = pd.DataFrame(location_stats)
        print("\n" + stats_df.to_string(index=False))
    else:
        # Avoid printing an "Empty DataFrame" repr when nothing was trained.
        print(f"\n⚠️  No location had at least {min_samples} samples; no models were trained")

    print(f"\n✓ Successfully trained {len(models)} location-specific models")

    # Save models
    with open(model_file, 'wb') as f:
        pickle.dump(models, f)

    print(f"✓ All models saved to: {model_file}")

    return models


def predict_count(models, location, start_time):
    """
    Estimate the count at one location for one timestamp, using the model
    stored for that location.

    Parameters:
    - models: Dictionary of trained models keyed by location; each entry
      provides 'model' and 'feature_cols'
    - location: Location name (e.g., 'Hostel')
    - start_time: Datetime string (e.g., '2025-09-26 14:00:00')

    Returns:
    - Predicted count, rounded and floored at 0 (or None, after printing a
      message, if the location has no model)
    """
    if location not in models:
        print(f"❌ No model found for location: {location}")
        return None

    entry = models[location]
    regressor, columns = entry['model'], entry['feature_cols']

    when = pd.to_datetime(start_time)
    hour_of_day = when.hour
    minute_of_hour = when.minute
    weekday = when.dayofweek

    # Calendar / clock features, named exactly as the training columns.
    row = dict(
        year=when.year,
        month=when.month,
        day=when.day,
        hour=hour_of_day,
        minute=minute_of_hour,
        day_of_week=weekday,
        day_of_month=when.day,
        is_weekend=int(weekday >= 5),
        week_of_year=when.isocalendar().week,
        time_in_minutes=hour_of_day * 60 + minute_of_hour,
    )

    # First threshold the hour falls below decides the period; anything
    # from 18 onwards is 'Evening'.
    upper_bounds = ((6, 'Night'), (12, 'Morning'), (18, 'Afternoon'))
    active = next(
        (label for bound, label in upper_bounds if hour_of_day < bound),
        'Evening',
    )
    row.update({
        f'period_{label}': int(label == active)
        for label in ('Night', 'Morning', 'Afternoon', 'Evening')
    })

    # Single-row frame, restricted and ordered to the model's feature list.
    frame = pd.DataFrame([row])[columns]
    estimate = regressor.predict(frame)[0]

    return max(0, round(estimate))


def batch_predict(models, predictions_list):
    """
    Run predict_count over a list of location/time requests.

    Parameters:
    - models: Dictionary of trained models
    - predictions_list: List of dicts with 'location' and 'start_time'

    Returns:
    - DataFrame with one row per request that had a model, holding the
      location, the requested time, the predicted count and that model's
      stored ±10 accuracy. Requests for unknown locations are left out.
    """
    rows = []

    for request in predictions_list:
        place, when = request['location'], request['start_time']
        estimate = predict_count(models, place, when)

        # predict_count signals "no model for this location" with None.
        if estimate is None:
            continue

        accuracy = models[place]['acc_10']
        rows.append({
            'Location': place,
            'Time': when,
            'Predicted Count': estimate,
            'Model Accuracy (±10)': f"{accuracy:.1f}%"
        })

    return pd.DataFrame(rows)


def load_models(model_file='location_specific_models.pkl'):
    """
    Read back a pickled models dictionary.

    Parameters:
    - model_file: Path of the pickle file to read

    Returns:
    - The unpickled object stored in that file
    """
    # Unpickling can execute arbitrary code: only load files you trust.
    with open(model_file, mode='rb') as handle:
        restored = pickle.load(handle)
    return restored


def get_model_info(models, location):
    """
    Print a report for one location's model: training size, feature count,
    stored metrics and a one-line reading of the ±10 accuracy.

    Prints a "not found" message and returns if the location has no entry.
    Always returns None.
    """
    if location not in models:
        print(f"❌ No model found for location: {location}")
        return

    info = models[location]
    rule = "="*80

    print("\n" + rule)
    print(f"MODEL INFO: {location}")
    print(rule)

    print(f"\nTraining Data:")
    print(f"  Samples Used: {info['samples']:,}")
    print(f"  Features: {len(info['feature_cols'])}")

    print(f"\nModel Performance:")
    print(f"  MAE:  {info['mae']:.2f} people")
    print(f"  RMSE: {info['rmse']:.2f} people")
    print(f"  R² Score: {info['r2']:.4f}")
    print(f"  Accuracy (±10): {info['acc_10']:.2f}%")

    print(f"\nInterpretation:")
    # Highest tier whose floor the ±10 accuracy reaches wins; otherwise
    # fall through to the cautionary message.
    tiers = (
        (70, "  ✅ Highly accurate predictions for this location"),
        (50, "  ✅ Good predictions for this location"),
    )
    verdict = next(
        (text for floor, text in tiers if info['acc_10'] >= floor),
        "  ⚠️  Moderate accuracy - predictions may vary",
    )
    print(verdict)


# Example usage
# Demo driver: train, predict, inspect, then reload the saved models
if __name__ == "__main__":

    # Step 1: fit one model per location from the CSV
    print("\n🔧 TRAINING LOCATION-SPECIFIC MODELS...")
    models = train_location_specific_models(
        min_samples=50,  # Need at least 50 samples per location
        csv_file='location_count_with_start_time.csv',
    )

    # Step 2: batch predictions for a few location/time pairs
    print("\n\n" + "="*80, "MAKING PREDICTIONS", "="*80, sep="\n")

    predictions = [
        {'location': place, 'start_time': when}
        for place, when in (
            ('Hostel', '2025-09-26 13:30:00'),
            ('Hostel', '2025-09-26 14:00:00'),
            ('Hostel', '2025-09-26 20:00:00'),
            ('Library', '2025-09-26 10:00:00'),
            ('Library', '2025-09-26 18:00:00'),
            ('Cafeteria', '2025-09-26 12:00:00'),
            ('Cafeteria', '2025-09-26 19:00:00'),
        )
    ]

    results = batch_predict(models, predictions)
    print("\n" + results.to_string(index=False))

    # Step 3: per-location model reports
    print("\n\n" + "="*80, "MODEL DETAILS", "="*80, sep="\n")

    for loc in ('Hostel', 'Library', 'Cafeteria'):
        get_model_info(models, loc)

    # Step 4: reload the pickle written during training
    print("\n\n" + "="*80, "LOADING SAVED MODELS", "="*80, sep="\n")

    loaded_models = load_models('location_specific_models.pkl')
    print(f"\n✓ Loaded {len(loaded_models)} models")
    print(f"✓ Available locations: {list(loaded_models.keys())}")

    # Step 5: one-off prediction from the reloaded models
    print("\n\nSingle Prediction Example:")
    count = predict_count(loaded_models, 'Hostel', '2025-09-27 15:30:00')
    print(f"  Hostel at 2025-09-27 15:30:00 → {count} people")


🔧 TRAINING LOCATION-SPECIFIC MODELS...
LOCATION-SPECIFIC RANDOM FOREST MODELS
Training separate model for EACH location using only its own data

✓ Loaded 44,947 total records
  Preparing features...
✓ Using 14 features per model

✓ Found 17 locations

TRAINING MODELS...

📍 Admin Lobby
----------------------------------------
  ✓ Training with 2,878 samples from Admin Lobby
  ✓ Test MAE: 2.9 | RMSE: 9.0 | R²: 0.994
  ✓ Accuracy: ±5: 90.1% | ±10: 96.5% | ±20: 99.3%

📍 Auditorium
----------------------------------------
  ✓ Training with 2,869 samples from Auditorium
  ✓ Test MAE: 6.2 | RMSE: 12.8 | R²: 0.999
  ✓ Accuracy: ±5: 73.5% | ±10: 84.1% | ±20: 90.8%

📍 Cafeteria
----------------------------------------
  ✓ Training with 2,859 samples from Cafeteria
  ✓ Test MAE: 9.0 | RMSE: 19.6 | R²: 1.000
  ✓ Accuracy: ±5: 66.3% | ±10: 74.1% | ±20: 84.8%

📍 Faculty Office
----------------------------------------
  ✓ Training with 2,712 samples from Faculty Office
  ✓ Test MAE: 4.5 | RMSE: 13.9

In [27]:
# Re-display df: the training function reads its own copy from the CSV, so this frame is unchanged.
df

Unnamed: 0,start_time,location,count
0,2025-08-27 00:00:00,Admin Lobby,2
1,2025-08-27 00:00:00,Auditorium,3
2,2025-08-27 00:00:00,Hostel,362
3,2025-08-27 00:00:00,LAB_102,1
4,2025-08-27 00:00:00,Lab,2
...,...,...,...
44942,2025-09-25 23:45:00,Auditorium,13
44943,2025-09-25 23:45:00,Admin Lobby,685
44944,2025-09-25 23:45:00,Seminar Room,5
44945,2025-09-25 23:45:00,LAB_101,5




📋 OPTION 3: Single location summary

Generating summary for Hostel on 2025-08-27...
PREPARING DAILY DATA FOR SUMMARY GENERATION

✓ Loaded 44,947 records
✓ Date range: 2025-08-27 to 2025-09-25
✓ Locations: 17


NameError: name 'defaultdict' is not defined