# Restaurant Inspection & Yelp Review Analysis

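This notebook walks through the end-to-end workflow of the restaurant inspection and Yelp review analysis: loading the inspection data, exploring the grade and score distributions, cleaning the violation text, and training and evaluating machine learning models that classify a restaurant's grade as A vs. not A.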

## 1. Import Libraries

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import custom modules
sys.path.append('../src')
from preprocess import clean_inspection_data, clean_text, match_inspection_to_reviews
from modeling import ModelTrainer, split_data
from evaluation import evaluate_all_models, plot_confusion_matrix, compare_models

# Notebook-wide display configuration
pd.options.display.max_columns = None  # None = show every column when a frame is displayed
sns.set_style('whitegrid')  # seaborn's 'whitegrid' style for the plots below

# Confirmation that the import/config cell ran to completion
print('Libraries imported successfully!')

## 2. Load Data

In [None]:
# Locations of the two CSV inputs, relative to the notebook's working directory
inspection_csv = '../data/Final_Inspection_Data.csv'
recent_csv = '../data/RecentInspDate.csv'

# Read both files into DataFrames.
# df_inspection feeds the EDA, preprocessing and modeling cells;
# df_recent is only reported on here in the cells shown in this notebook.
df_inspection = pd.read_csv(inspection_csv)
df_recent = pd.read_csv(recent_csv)

# Report (rows, columns) for each frame as a quick load check
print(f'Inspection data shape: {df_inspection.shape}')
print(f'Recent inspection data shape: {df_recent.shape}')

# Preview the first five inspection rows as the cell output
df_inspection.head(5)

## 3. Exploratory Data Analysis

In [None]:
# Structural overview of the inspection frame (printed by DataFrame.info itself)
print('\nData Info:')
df_inspection.info()

# Descriptive statistics; the bare name on the last line makes the summary
# table the rendered output of this cell.
print('\nNumerical Summary:')
numeric_summary = df_inspection.describe()
numeric_summary

In [None]:
# Visualize grade distribution
# Bar chart of how many rows carry each GRADE value; skipped when the column is absent.
if 'GRADE' in df_inspection.columns:
    # savefig does not create missing parent directories, so make sure the
    # output folder exists before writing (no-op when it is already there).
    os.makedirs('../figures', exist_ok=True)

    # Explicit figure/axes instead of the implicit pyplot "current figure"
    fig, ax = plt.subplots(figsize=(10, 6))
    # value_counts() orders the bars from most to least frequent grade;
    # rot=0 keeps the grade labels horizontal.
    df_inspection['GRADE'].value_counts().plot(kind='bar', ax=ax, rot=0)
    ax.set_title('Distribution of Restaurant Grades')
    ax.set_xlabel('Grade')
    ax.set_ylabel('Count')
    fig.tight_layout()
    fig.savefig('../figures/grade_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Visualize score distribution
# Histogram of the non-missing SCORE values; skipped when the column is absent.
if 'SCORE' in df_inspection.columns:
    # savefig does not create missing parent directories, so make sure the
    # output folder exists before writing (no-op when it is already there).
    os.makedirs('../figures', exist_ok=True)

    # Explicit figure/axes instead of the implicit pyplot "current figure"
    fig, ax = plt.subplots(figsize=(10, 6))
    # Missing scores are dropped before binning into 30 bins.
    df_inspection['SCORE'].dropna().hist(bins=30, edgecolor='black', ax=ax)
    ax.set_title('Distribution of Inspection Scores')
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    fig.savefig('../figures/score_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

## 4. Data Preprocessing

In [None]:
# Clean violation descriptions
# Applies clean_text (from the preprocess module) to every value of the raw
# 'VIOLATION DESCRIPTION' column and stores the result in a new
# 'violation_clean' column on df_inspection.
# NOTE(review): missing descriptions are passed to clean_text unchanged —
# confirm that it tolerates NaN input.
if 'VIOLATION DESCRIPTION' in df_inspection.columns:
    df_inspection['violation_clean'] = df_inspection['VIOLATION DESCRIPTION'].apply(clean_text)
    print('Text cleaning completed!')
else:
    # The feature-preparation cell reads 'violation_clean'; say so here instead
    # of letting the skip surface later as an unexplained KeyError.
    print("Warning: 'VIOLATION DESCRIPTION' column not found; "
          "'violation_clean' was not created and the modeling cells will fail.")

## 5. Model Training

In [None]:
# Prepare data for modeling
# Keep only the rows that have a grade. The explicit .copy() makes df_model an
# independent frame, so the column added below is not written into df_inspection.
has_grade = df_inspection['GRADE'].notna()
df_model = df_inspection.loc[has_grade].copy()

# Binary target: 1 when the grade is exactly 'A', 0 for every other non-null grade.
# NOTE(review): confirm that folding all remaining grade values into class 0 is intended.
df_model['grade_binary'] = df_model['GRADE'].eq('A').astype(int)

# Size of the modeling frame and the balance between the two classes
print(f'Modeling dataset shape: {df_model.shape}')
print(f'\nClass distribution:')
print(df_model['grade_binary'].value_counts())

In [None]:
# Initialize model trainer (fixed random_state, matching the one used for the split)
trainer = ModelTrainer(random_state=42)

# 'violation_clean' is only created by the preprocessing cell when the raw
# 'VIOLATION DESCRIPTION' column exists. Fail here with an actionable message
# rather than a bare KeyError from the column lookup below.
if 'violation_clean' not in df_model.columns:
    raise KeyError(
        "'violation_clean' is missing from df_model; run the text-cleaning cell "
        "(requires a 'VIOLATION DESCRIPTION' column) and rebuild df_model first."
    )

# Prepare features: cleaned text as input (missing values become empty strings)
# and the binary A / not-A label as target.
X_text = df_model['violation_clean'].fillna('')
y = df_model['grade_binary']

# Turn the text into the feature matrix used by the models.
# NOTE(review): presumably a text vectorizer — the implementation lives in the modeling module.
X = trainer.prepare_features(X_text)

print(f'Feature matrix shape: {X.shape}')

In [None]:
# Partition features and labels into training and test sets.
# test_size=0.2 is presumably the held-out fraction and random_state=42 the
# shuffle seed — see split_data in the modeling module to confirm.
X_train, X_test, y_train, y_test = split_data(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Number of rows that ended up in each partition
print(f'Training set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')

In [None]:
# Train all models
# Hands the training split to the trainer. The returned `models` object is
# iterated with .items() as (name, model) pairs by the confusion-matrix cell
# and passed as-is to evaluate_all_models.
# NOTE(review): which estimators are trained is decided inside ModelTrainer — not visible here.
models = trainer.train_all_models(X_train, y_train)

## 6. Model Evaluation

In [None]:
# Evaluate all models
# Passes the trained models together with the test split to the evaluation
# helper; `results` is handed unchanged to compare_models in the next cell.
# NOTE(review): the structure of `results` is defined in the evaluation module — not visible here.
results = evaluate_all_models(models, X_test, y_test)

In [None]:
# Compare models
# Builds the comparison table from the evaluation results. The summary cell
# reads its index, its first row (.iloc[0]) and an 'f1_score' column, and
# prints it with .to_string().
df_comparison = compare_models(results)

In [None]:
# Plot confusion matrices
# One confusion matrix per trained model, each saved under ../figures/.

# Make sure the output folder exists before any save_path is used; whether
# plot_confusion_matrix creates it is not visible from this notebook.
os.makedirs('../figures', exist_ok=True)

for name, model in models.items():
    # Predictions on the held-out test split
    y_pred = model.predict(X_test)
    plot_confusion_matrix(
        y_test,
        y_pred,
        # Display names for the binary target (0 = not A, 1 = A).
        # NOTE(review): confirm the label order plot_confusion_matrix expects.
        labels=['Not A', 'A'],
        # Underscores in the model key become spaces and each word is capitalized.
        title=f'Confusion Matrix - {name.replace("_", " ").title()}',
        save_path=f'../figures/confusion_matrix_{name}.png'
    )

## 7. Results Summary

In [None]:
# Banner around the summary heading
rule = '='*80
print('\n' + rule)
print('FINAL RESULTS SUMMARY')
print(rule)

# Full comparison table as plain text
print(df_comparison.to_string())

# The first row is reported as the winner.
# NOTE(review): this assumes compare_models returns rows sorted best-first — confirm.
top_row = df_comparison.iloc[0]  # Series whose .name is the first index label
print('\nBest performing model:', top_row.name)
print(f'F1 Score: {top_row["f1_score"]:.4f}')