In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.seasonal import seasonal_decompose
import os
import warnings

# Silence library warnings and give every figure the seaborn grid theme.
warnings.filterwarnings(action="ignore")
sns.set_style(style="whitegrid")
print("Libraries imported successfully.")

# --- Folder that receives every CSV and PNG written by this script ---
output_dir = "outputs"
os.makedirs(name=output_dir, exist_ok=True)
print(f"Output directory '{output_dir}' is ready.")


def create_lagged_features(data, window_size):
    """
    Transforms a time series into a supervised learning dataset.

    Each sample's features are `window_size` consecutive observations and its
    target is the observation that immediately follows them.

    Args:
        data: Sequence or array of observations ordered in time.
        window_size: Number of past observations per sample; must be >= 1.

    Returns:
        Tuple `(X, y)` of NumPy arrays. For 1-D input of length `n`, `X` has
        shape `(max(n - window_size, 0), window_size)` and `y` has one entry
        per row of `X`.

    Raises:
        ValueError: If `window_size` is smaller than 1.
    """
    if window_size < 1:
        # Zero/negative sizes would silently build meaningless windows through
        # negative indexing, so reject them up front.
        raise ValueError("window_size must be a positive integer")

    series = np.asarray(data)
    n_samples = len(series) - window_size
    if n_samples <= 0:
        # Keep the 2-D feature layout even with no samples so callers always
        # receive an array with `window_size` columns.
        return (
            np.empty((0, window_size) + series.shape[1:], dtype=series.dtype),
            np.empty((0,) + series.shape[1:], dtype=series.dtype),
        )

    X = np.stack([series[i:i + window_size] for i in range(n_samples)])
    # The target of window i is the element right after it, i.e. series[i + window_size].
    y = series[window_size:].copy()
    return X, y


# 2. DATA LOADING AND PREPARATION
# Raw monthly trip files (April-September 2014), read from the working directory.
file_names = [
    'uber-raw-data-apr14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-jun14.csv',
    'uber-raw-data-jul14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-sep14.csv'
]

# Read and concatenate all files into a single dataframe
try:
    uber_2014 = pd.concat((pd.read_csv(f) for f in file_names), ignore_index=True)
except FileNotFoundError as err:
    # Raise instead of print + exit(): inside a notebook kernel exit() does not
    # halt the running cell, so execution would carry on and fail later with a
    # NameError on `uber_2014`, hiding the real cause.
    raise FileNotFoundError(
        "Error: Make sure all 6 'uber-raw-data-...14.csv' files are in the same directory as the script."
    ) from err
print(f"Successfully loaded and combined {len(file_names)} data files.")


# Parse the timestamp strings so the column can serve as a datetime index
uber_2014['Date/Time'] = pd.to_datetime(uber_2014['Date/Time'], format='%m/%d/%Y %H:%M:%S')

# Set 'Date/Time' as the index (resample below requires a sorted datetime index)
uber_2014.set_index('Date/Time', inplace=True)
uber_2014.sort_index(inplace=True)

# Resample data into hourly trip counts: one row per hour, counting the
# non-null 'Base' entries that fall inside it
hourly_counts = uber_2014['Base'].resample('h').count().reset_index()
hourly_counts.columns = ['Date', 'Count']
hourly_counts.set_index('Date', inplace=True)

# --- Save prepared data ---
hourly_counts.to_csv(os.path.join(output_dir, "hourly_trip_counts.csv"))
print(f"Prepared hourly data saved to '{output_dir}/hourly_trip_counts.csv'")
print(hourly_counts.head())


# 3. EDA AND TRAIN/TEST SPLIT
# Additive decomposition with a 24-step period (one day of hourly counts)
decomposition = seasonal_decompose(hourly_counts['Count'], model='additive', period=24)
fig = decomposition.plot()
fig.set_size_inches(14, 8)
plt.suptitle("Time Series Decomposition of Hourly Uber Trips", y=0.92)
# --- Save plot ---
plt.savefig(os.path.join(output_dir, "time_series_decomposition.png"), bbox_inches='tight')
print(f"Decomposition plot saved to '{output_dir}/time_series_decomposition.png'")
plt.show()

cutoff_date = '2014-09-15 00:00:00'
# Label slicing with .loc includes BOTH endpoints, so `.loc[:cutoff]` plus
# `.loc[cutoff:]` would put the cutoff hour in train and test at the same time.
# Boolean masks give a clean partition: train < cutoff <= test.
cutoff_timestamp = pd.to_datetime(cutoff_date)
train_data = hourly_counts.loc[hourly_counts.index < cutoff_timestamp]
test_data = hourly_counts.loc[hourly_counts.index >= cutoff_timestamp]

print(f"Data split at {cutoff_date}. Training samples: {len(train_data)}, Testing samples: {len(test_data)}")

# Plotting the train/test split
plt.figure(figsize=(15, 5))
plt.plot(train_data.index, train_data['Count'], label='Training Set')
plt.plot(test_data.index, test_data['Count'], label='Test Set', color='orange')
plt.title('Uber Trips: Train / Test Split')
plt.ylabel('Number of Trips')
plt.xlabel('Date')
plt.legend()
# --- Save plot ---
plt.savefig(os.path.join(output_dir, "train_test_split.png"), bbox_inches='tight')
print(f"Train/Test split plot saved to '{output_dir}/train_test_split.png'")
plt.show()

# 4. FEATURE ENGINEERING AND MODEL TRAINING
# Every sample uses the previous `window_size` hourly counts as its features.
window_size = 24
X_train, y_train = create_lagged_features(train_data['Count'].values, window_size)
# Prepend the last `window_size` training observations so the first test
# timestamps also have a full window of history to predict from.
test_features_base = np.concatenate([train_data['Count'].values[-window_size:], test_data['Count'].values])
X_test, y_test = create_lagged_features(test_features_base, window_size)

# Stray "[cite_start]" paste artifacts were removed from the closing parentheses
# below; they referenced an undefined name and stopped the script from running.
models = {
    "XGBoost": xgb.XGBRegressor(
        objective='reg:squarederror', colsample_bytree=1.0, learning_rate=0.1, max_depth=6,
        n_estimators=300, subsample=0.6, random_state=12345
    ),
    "Random Forest": RandomForestRegressor(
        n_estimators=100, max_depth=30, max_features=None, min_samples_leaf=2,
        min_samples_split=5, random_state=12345
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=300, learning_rate=0.1, max_depth=5, max_features='sqrt',
        min_samples_leaf=1, min_samples_split=5, random_state=12345
    ),
}

# Per-model test predictions and MAPE, keyed by the model's display name
predictions = {}
mape_scores = {}

for name, model in models.items():
    print(f"\nTraining {name} model...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    predictions[name] = preds
    mape = mean_absolute_percentage_error(y_test, preds)
    mape_scores[name] = mape
    print(f"{name} MAPE: {mape:.2%}")

# 5. ENSEMBLE MODEL AND FINAL EVALUATION
# Fixed blending weights, in the order XGBoost, Random Forest, Gradient Boosting.
weights = np.array([0.368, 0.322, 0.310])
ensemble_preds = (
    weights[0] * predictions['XGBoost'] +
    weights[1] * predictions['Random Forest'] +
    weights[2] * predictions['Gradient Boosting']
)
predictions['Ensemble'] = ensemble_preds
mape_scores['Ensemble'] = mean_absolute_percentage_error(y_test, ensemble_preds)
print(f"\nEnsemble Model MAPE: {mape_scores['Ensemble']:.2%}")


# --- Final Results Summary ---
print("\n--- Model Performance Summary ---")
summary_df = pd.DataFrame.from_dict(mape_scores, orient='index', columns=['MAPE'])
# Rank on the numeric scores BEFORE turning them into percent strings;
# sorting the strings is lexicographic and would place "10.00%" ahead of "9.00%".
summary_df = summary_df.sort_values('MAPE')
summary_df['MAPE'] = summary_df['MAPE'].apply(lambda x: f"{x:.2%}")
# --- Save summary data ---
summary_df.to_csv(os.path.join(output_dir, "model_performance_summary.csv"))
print(f"Model performance summary saved to '{output_dir}/model_performance_summary.csv'")
print(summary_df)

# 6. VISUALIZATION OF ALL MODEL PREDICTIONS

plt.figure(figsize=(18, 8))
# Plot the actual test data
plt.plot(test_data.index, test_data['Count'], label='Test (Actual Trips)', color='black', linewidth=2)

# Plot model predictions
colors = {'XGBoost': 'red', 'Random Forest': 'green', 'Gradient Boosting': 'orange', 'Ensemble': 'purple'}
for name, preds in predictions.items():
    # The test windows are seeded with the tail of the training set, so there is
    # one prediction per test timestamp. Align each series with the END of the
    # test index; slicing off the first `window_size` stamps would leave x
    # shorter than y and make matplotlib raise on the length mismatch.
    plt.plot(test_data.index[-len(preds):], preds, label=f'{name} Predictions', linestyle='--', color=colors.get(name))

plt.title('Uber 2014 Trips: All Models Predictions vs Test Data', fontsize=16)
plt.ylabel('Number of Trips')
plt.xlabel('Date')
plt.legend()
plt.xticks(rotation=30, ha='right')
# --- Save plot ---
plt.savefig(os.path.join(output_dir, "all_model_predictions.png"), bbox_inches='tight')
print(f"Final predictions plot saved to '{output_dir}/all_model_predictions.png'")
plt.show()

SyntaxError: invalid syntax (2340370859.py, line 156)

In [None]:
# ===================================================================
# 1. SETUP: IMPORT LIBRARIES AND DEFINE HELPER FUNCTIONS
# ===================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.seasonal import seasonal_decompose
import os
import warnings

# Silence library warnings and give every figure the seaborn grid theme.
warnings.filterwarnings(action="ignore")
sns.set_style(style="whitegrid")
print("Libraries imported successfully.")

# --- Folder that receives every CSV and PNG written by this script ---
output_dir = "outputs"
os.makedirs(name=output_dir, exist_ok=True)
print("Output directory '{}' is ready.".format(output_dir))

# Helper function to create lagged features for time-series forecasting
def create_lagged_features(data, window_size):
    """
    Transforms a time series into a supervised learning dataset.

    Each sample's features are `window_size` consecutive observations and its
    target is the observation that immediately follows them.

    Args:
        data: Sequence or array of observations ordered in time.
        window_size: Number of past observations per sample; must be >= 1.

    Returns:
        Tuple `(X, y)` of NumPy arrays. For 1-D input of length `n`, `X` has
        shape `(max(n - window_size, 0), window_size)` and `y` has one entry
        per row of `X`.

    Raises:
        ValueError: If `window_size` is smaller than 1.
    """
    if window_size < 1:
        # Zero/negative sizes would silently build meaningless windows through
        # negative indexing, so reject them up front.
        raise ValueError("window_size must be a positive integer")

    series = np.asarray(data)
    n_samples = len(series) - window_size
    if n_samples <= 0:
        # Keep the 2-D feature layout even with no samples so callers always
        # receive an array with `window_size` columns.
        return (
            np.empty((0, window_size) + series.shape[1:], dtype=series.dtype),
            np.empty((0,) + series.shape[1:], dtype=series.dtype),
        )

    X = np.stack([series[i:i + window_size] for i in range(n_samples)])
    # The target of window i is the element right after it, i.e. series[i + window_size].
    y = series[window_size:].copy()
    return X, y


# ===================================================================
# 2. DATA LOADING AND PREPARATION
# ===================================================================
# Raw monthly trip files (April-September 2014), read from the working directory.
file_names = [
    'uber-raw-data-apr14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-jun14.csv',
    'uber-raw-data-jul14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-sep14.csv'
]

# Read and concatenate all files into a single dataframe
try:
    uber_2014 = pd.concat((pd.read_csv(f) for f in file_names), ignore_index=True)
except FileNotFoundError as err:
    # Raise instead of print + exit(): inside a notebook kernel exit() does not
    # halt the running cell, so execution would carry on and fail later with a
    # NameError on `uber_2014`, hiding the real cause.
    raise FileNotFoundError(
        "Error: Make sure all 6 'uber-raw-data-...14.csv' files are in the same directory as the script."
    ) from err
print("Successfully loaded and combined {} data files.".format(len(file_names)))

# Convert 'Date/Time' to datetime objects
uber_2014['Date/Time'] = pd.to_datetime(uber_2014['Date/Time'], format='%m/%d/%Y %H:%M:%S')

# Set 'Date/Time' as the index (resample below requires a sorted datetime index)
uber_2014.set_index('Date/Time', inplace=True)
uber_2014.sort_index(inplace=True)

# Resample data into hourly trip counts: one row per hour, counting the
# non-null 'Base' entries that fall inside it
hourly_counts = uber_2014['Base'].resample('h').count().reset_index()
hourly_counts.columns = ['Date', 'Count']
hourly_counts.set_index('Date', inplace=True)

# --- Save prepared data ---
hourly_counts.to_csv(os.path.join(output_dir, "hourly_trip_counts.csv"))
print("Prepared hourly data saved to '{}/hourly_trip_counts.csv'".format(output_dir))
print(hourly_counts.head())


# ===================================================================
# 3. EDA AND TRAIN/TEST SPLIT
# ===================================================================
# Perform seasonal decomposition (additive, 24-step period = one day of hourly counts)
decomposition = seasonal_decompose(hourly_counts['Count'], model='additive', period=24)
fig = decomposition.plot()
fig.set_size_inches(14, 8)
plt.suptitle("Time Series Decomposition of Hourly Uber Trips", y=0.92)
plt.savefig(os.path.join(output_dir, "time_series_decomposition.png"), bbox_inches='tight')
print("Decomposition plot saved to '{}/time_series_decomposition.png'".format(output_dir))
plt.show()

# Split data
cutoff_date = '2014-09-15 00:00:00'
# Label slicing with .loc includes BOTH endpoints, so `.loc[:cutoff]` plus
# `.loc[cutoff:]` would put the cutoff hour in train and test at the same time.
# Boolean masks give a clean partition: train < cutoff <= test.
cutoff_timestamp = pd.to_datetime(cutoff_date)
train_data = hourly_counts.loc[hourly_counts.index < cutoff_timestamp]
test_data = hourly_counts.loc[hourly_counts.index >= cutoff_timestamp]

print("Data split at {}. Training samples: {}, Testing samples: {}".format(cutoff_date, len(train_data), len(test_data)))

# Plotting the train/test split
plt.figure(figsize=(15, 5))
plt.plot(train_data.index, train_data['Count'], label='Training Set')
plt.plot(test_data.index, test_data['Count'], label='Test Set', color='orange')
plt.title('Uber Trips: Train / Test Split')
plt.ylabel('Number of Trips')
plt.xlabel('Date')
plt.legend()
plt.savefig(os.path.join(output_dir, "train_test_split.png"), bbox_inches='tight')
print("Train/Test split plot saved to '{}/train_test_split.png'".format(output_dir))
plt.show()


# ===================================================================
# 4. FEATURE ENGINEERING AND MODEL TRAINING
# ===================================================================
# Each sample is described by the `window_size` hourly counts preceding it.
window_size = 24
X_train, y_train = create_lagged_features(train_data['Count'].values, window_size)

# The test series is prefixed with the last `window_size` training counts so
# that the earliest test timestamps also get a complete history window.
test_features_base = np.concatenate(
    [train_data['Count'].values[-window_size:], test_data['Count'].values]
)
X_test, y_test = create_lagged_features(test_features_base, window_size)

# The three base regressors, built individually and then registered by name.
xgboost_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=300,
    subsample=0.6,
    random_state=12345,
)
random_forest_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=30,
    max_features=None,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=12345,
)
gradient_boosting_model = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=5,
    random_state=12345,
)
models = {
    "XGBoost": xgboost_model,
    "Random Forest": random_forest_model,
    "Gradient Boosting": gradient_boosting_model,
}

# Test-set predictions and MAPE for each model, keyed by display name.
predictions, mape_scores = {}, {}

for name, model in models.items():
    print("\nTraining {} model...".format(name))
    preds = model.fit(X_train, y_train).predict(X_test)
    mape = mean_absolute_percentage_error(y_test, preds)
    predictions[name], mape_scores[name] = preds, mape
    print("{} MAPE: {:.2%}".format(name, mape))


# ===================================================================
# 5. ENSEMBLE MODEL AND FINAL EVALUATION
# ===================================================================
# Fixed blending weights, in the order XGBoost, Random Forest, Gradient Boosting.
weights = np.array([0.368, 0.322, 0.310])
ensemble_preds = (
    weights[0] * predictions['XGBoost'] +
    weights[1] * predictions['Random Forest'] +
    weights[2] * predictions['Gradient Boosting']
)
predictions['Ensemble'] = ensemble_preds
mape_scores['Ensemble'] = mean_absolute_percentage_error(y_test, ensemble_preds)
print("\nEnsemble Model MAPE: {:.2%}".format(mape_scores['Ensemble']))


# --- Final Results Summary ---
print("\n--- Model Performance Summary ---")
summary_df = pd.DataFrame.from_dict(mape_scores, orient='index', columns=['MAPE'])
# Rank on the numeric scores BEFORE turning them into percent strings;
# sorting the strings is lexicographic and would place "10.00%" ahead of "9.00%".
summary_df = summary_df.sort_values('MAPE')
summary_df['MAPE'] = summary_df['MAPE'].apply(lambda x: "{:.2%}".format(x))
summary_df.to_csv(os.path.join(output_dir, "model_performance_summary.csv"))
print("Model performance summary saved to '{}/model_performance_summary.csv'".format(output_dir))
print(summary_df)


# ===================================================================
# 6. VISUALIZATION OF ALL MODEL PREDICTIONS
# ===================================================================
plt.figure(figsize=(18, 8))
# Actual hourly trips over the test period
plt.plot(test_data.index, test_data['Count'], label='Test (Actual Trips)', color='black', linewidth=2)

colors = {'XGBoost': 'red', 'Random Forest': 'green', 'Gradient Boosting': 'orange', 'Ensemble': 'purple'}
for name, preds in predictions.items():
    # The test windows are seeded with the tail of the training set, so there is
    # one prediction per test timestamp. Align each series with the END of the
    # test index; slicing off the first `window_size` stamps would leave x
    # shorter than y and make matplotlib raise on the length mismatch.
    plt.plot(test_data.index[-len(preds):], preds, label='{} Predictions'.format(name), linestyle='--', color=colors.get(name))

plt.title('Uber 2014 Trips: All Models Predictions vs Test Data', fontsize=16)
plt.ylabel('Number of Trips')
plt.xlabel('Date')
plt.legend()
plt.xticks(rotation=30, ha='right')
plt.savefig(os.path.join(output_dir, "all_model_predictions.png"), bbox_inches='tight')
print("Final predictions plot saved to '{}/all_model_predictions.png'".format(output_dir))
plt.show()

Libraries imported successfully.
Output directory 'outputs' is ready.
Error: Make sure all 6 'uber-raw-data-...14.csv' files are in the same directory as the script.


NameError: name 'uber_2014' is not defined

: 