Before you turn in the homework, make sure everything runs as expected. To do so, select **Kernel**$\rightarrow$**Restart & Run All** in the toolbar above. Remember to submit on both **DataHub** and **Gradescope**.

Please fill in your name and include a list of your collaborators below.

In [None]:
# Author name and collaborator list requested by the assignment instructions.
NAME = "Tianxiao Hu"
COLLABORATORS = ""

---

# Project 2: NYC Taxi Rides
# Extras

Put all of your extra work in here. Feel free to save figures to use when completing Part 4.

In [None]:
import os
import pandas as pd
import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sqlalchemy import create_engine
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

# Apply seaborn's "whitegrid" style with the "muted" palette to every plot.
sns.set(style="whitegrid", palette="muted")

# Default figure size and font size for all matplotlib figures in this notebook.
plt.rcParams['figure.figsize'] = (12, 9)
plt.rcParams['font.size'] = 12

# IPython magic: render figures inline in the notebook output.
%matplotlib inline

In [None]:
# Run this cell to load the data.
# Reads the "train" and "val" tables from cleaned_data.hdf in the current directory.
data_file = Path("./", "cleaned_data.hdf")
train_df = pd.read_hdf(data_file, "train")
val_df = pd.read_hdf(data_file, "val")

In [None]:
# Box plot of the training-set durations, drawn before any rows are filtered out.
sns.boxplot(train_df["duration"].values)
plt.title("duration before dropping outliers")

In [None]:
# Keep only training rows with 50 < duration < 4000 (both bounds exclusive).
# NOTE(review): duration is presumably measured in seconds — confirm against the dataset.
train_df = train_df[np.logical_and(train_df['duration'] < 4000, train_df['duration'] > 50)]

In [None]:
# Box plot of the training-set durations that remain once the duration
# filter has been applied, for comparison with the unfiltered plot.
sns.boxplot(train_df["duration"].values)
plt.title("duration after dropping outliers")

In [None]:
# Copied from part 2
def haversine(lat1, lng1, lat2, lng2):
    """
    Compute the haversine (great-circle) distance between two points.

    All four arguments are passed through ``np.radians``, so they are
    expected in degrees; scalars and array-likes are both accepted.
    The result is scaled by a sphere radius of 6371 (the mean Earth
    radius, presumably in kilometres).
    """
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    earth_radius = 6371
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * earth_radius * np.arcsin(np.sqrt(hav))

# Copied from part 2
def manhattan_distance(lat1, lng1, lat2, lng2):
    """
    Compute the Manhattan (L1-style) distance between two points.

    The distance is the sum of two haversine legs that both start at
    (lat1, lng1): one that changes only the longitude and one that
    changes only the latitude.
    """
    longitude_leg = haversine(lat1, lng1, lat1, lng2)
    latitude_leg = haversine(lat1, lng1, lat2, lng1)
    return longitude_leg + latitude_leg

# Copied from part 2
def bearing(lat1, lng1, lat2, lng2):
    """
    Compute the initial bearing from (lat1, lng1) to (lat2, lng2).

    Inputs are converted with ``np.radians``, so they are expected in
    degrees. The result is in degrees as produced by ``np.arctan2``;
    a bearing of 0 points NORTH and 90 points EAST.
    """
    delta_lng = np.radians(lng2 - lng1)
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    # Eastward and northward components of the direction vector.
    east = np.sin(delta_lng) * np.cos(phi2)
    north = np.cos(phi1) * np.sin(phi2) - np.sin(phi1) * np.cos(phi2) * np.cos(delta_lng)
    return np.degrees(np.arctan2(east, north))

# Copied from part 2
def add_time_columns(df):
    """
    Add temporal features derived from ``tpep_pickup_datetime`` to df.

    The columns ``month``, ``week_of_year``, ``day_of_month``,
    ``day_of_week``, ``hour`` and ``week_hour`` (day of week * 24 + hour)
    are written into ``df`` itself, and the same object is returned so
    the function can be used with ``DataFrame.pipe``.
    """
    # NOTE(review): `is_copy` looks like the legacy pandas flag used to
    # silence SettingWithCopyWarning; it does not by itself propagate
    # writes anywhere — confirm against the pandas version in use.
    df.is_copy = False
    pickup = df['tpep_pickup_datetime'].dt

    # `Series.dt.weekofyear` is not available on recent pandas releases;
    # prefer `isocalendar().week` when it exists and fall back otherwise.
    if hasattr(pickup, 'isocalendar'):
        iso_week = pickup.isocalendar().week
        # Missing timestamps cannot be stored as int64, so use float there.
        week_of_year = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        week_of_year = pickup.weekofyear

    df.loc[:, 'month'] = pickup.month
    df.loc[:, 'week_of_year'] = week_of_year
    df.loc[:, 'day_of_month'] = pickup.day
    df.loc[:, 'day_of_week'] = pickup.dayofweek
    df.loc[:, 'hour'] = pickup.hour
    df.loc[:, 'week_hour'] = pickup.dayofweek * 24 + df['hour']
    return df

# Copied from part 2
def add_distance_columns(df):
    """
    Add distance features to df.

    Writes the columns ``manhattan``, ``bearing`` and ``haversine`` into
    ``df``, each computed from the pickup and dropoff coordinates, and
    returns the same object.
    """
    # NOTE(review): `is_copy` looks like the legacy pandas flag used to
    # silence SettingWithCopyWarning — confirm it is still needed.
    df.is_copy = False

    # All three features take the same pickup -> dropoff coordinates.
    endpoints = dict(lat1=df['pickup_latitude'],
                     lng1=df['pickup_longitude'],
                     lat2=df['dropoff_latitude'],
                     lng2=df['dropoff_longitude'])

    df.loc[:, 'manhattan'] = manhattan_distance(**endpoints)
    df.loc[:, 'bearing'] = bearing(**endpoints)
    df.loc[:, 'haversine'] = haversine(**endpoints)
    return df

def select_columns(data, *columns):
    """
    Return the given columns of data, in the order they were requested.
    """
    every_row = slice(None)
    return data.loc[every_row, columns]

In [None]:
def remove_blizzard(df):
    """
    Return df without the rows whose ``day_of_month`` equals 23.

    NOTE(review): the name suggests day 23 is a blizzard day; the filter
    looks only at the day of month, not at the month or year.
    """
    df.is_copy = False
    not_day_23 = df["day_of_month"] != 23
    return df[not_day_23]

def remove_negative(data, *columns):
    """
    Replace negative entries in each named column with that column's median.

    The median is taken over the column as it stands (negative values
    included). Columns are overwritten in ``data`` and the same object
    is returned.
    """
    data.is_copy = False
    for name in columns:
        fill_value = np.median(data[name].values)
        is_negative = data[name] < 0
        data[name] = np.where(is_negative, fill_value, data[name])
    return data

def replace_zero(data, *columns):
    """
    Replace entries equal to zero in each named column with that column's median.

    The median is taken over the column as it stands (zeros included).
    Columns are overwritten in ``data`` and the same object is returned.
    """
    data.is_copy = False
    for name in columns:
        fill_value = np.median(data[name].values)
        is_zero = data[name] == 0
        data[name] = np.where(is_zero, fill_value, data[name])
    return data

def replace_outlier(data, *columns):
    """
    Replace outliers in each named column with that column's median.

    A value counts as an outlier when it lies below Q1 - 1.5 * IQR or
    above Q3 + 1.5 * IQR. Columns are overwritten in ``data`` and the
    same object is returned.
    """
    data.is_copy = False

    def _iqr_fences(values):
        # Lower and upper cut-offs at 1.5 interquartile ranges past the quartiles.
        q1, q3 = np.percentile(values, [25, 75])
        spread = q3 - q1
        return q1 - 1.5 * spread, q3 + 1.5 * spread

    for name in columns:
        raw = data[name].values
        low, high = _iqr_fences(raw)
        centre = np.median(raw)
        outside = np.logical_or(raw < low, raw > high)
        data[name] = np.where(outside, centre, data[name])

    return data

In [None]:
def process_data(data, test=False):
    """
    Clean data and split it into a feature matrix and a target vector.

    Pipeline, in order: add time columns, drop rows with
    ``day_of_month == 23``, replace zero coordinates by the column
    median, add distance columns, replace negative monetary amounts by
    the column median, and replace outliers by the column median.

    Zero coordinates are repaired BEFORE the distance features are
    computed so that ``manhattan``, ``bearing`` and ``haversine`` are
    derived from the repaired coordinates rather than from the zeros.

    Parameters
    ----------
    data : DataFrame with the raw trip columns used below.
    test : when True, ``data`` is treated as unlabeled: the ``duration``
        column is neither cleaned nor required, and ``y`` is None.

    Returns
    -------
    (X, y) : the feature DataFrame (without ``duration``) and the
        ``duration`` Series, or None for ``y`` when ``test`` is True.

    NOTE: rows with ``day_of_month == 23`` are dropped for every input,
    including test data, so X can have fewer rows than ``data``.
    """
    feature_columns = [
        'passenger_count',
        'trip_distance',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'day_of_month',
        'fare_amount',
        'extra',
        'mta_tax',
        'tip_amount',
        'tolls_amount',
        'improvement_surcharge',
        'total_amount',
        'day_of_week',
        'hour',
        'week_hour',
        'manhattan',
        'bearing',
        'haversine',
    ]

    # The target only exists (and only needs cleaning) for labeled data.
    outlier_columns = ['fare_amount', 'total_amount']
    if not test:
        outlier_columns.append('duration')

    data_clean = (
        data
        .pipe(add_time_columns)
        .pipe(remove_blizzard)
        .pipe(replace_zero,
              'pickup_longitude',
              'pickup_latitude',
              'dropoff_longitude',
              'dropoff_latitude')
        .pipe(add_distance_columns)
        .pipe(remove_negative,
              'extra',
              'mta_tax',
              'improvement_surcharge',
              'fare_amount',
              'total_amount')
        .pipe(replace_outlier, *outlier_columns)
    )

    X = data_clean.pipe(select_columns, *feature_columns)
    y = None if test else data_clean['duration']

    return X, y

In [None]:
# Build the feature matrices and duration targets for the training and validation splits.
X_train, y_train = process_data(train_df)
X_val, y_val = process_data(val_df)

In [None]:
# Sanity check: print the shapes of the feature matrices and target vectors.
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error

# Ridge regression: tune the regularisation strength with 5-fold cross-validation.
model = Ridge()

# Candidate alpha values, roughly log-spaced from 1e-4 to 300.
alpha = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]
# NOTE(review): no `scoring` is given, so the grid search ranks candidates with
# the estimator's default scorer rather than the MAE reported below — confirm
# that this is intended.
optimized_model = GridSearchCV(model, param_grid={'alpha': alpha}, cv=5)
optimized_model.fit(X_train, y_train)

# Report the winning alpha and the cross-validated score of every candidate.
print("Best Paramaters:")
print(optimized_model.best_params_)
print("Grid Scores:")
means = optimized_model.cv_results_['mean_test_score']
stds = optimized_model.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, optimized_model.cv_results_['params']):
    # Printed as mean (+/- two standard deviations) across the folds.
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Mean absolute error of the fitted search object on the training and validation sets.
y_train_pred = optimized_model.predict(X_train)
print("MAE for Train Data:")
print(mean_absolute_error(y_train, y_train_pred))
y_val_pred = optimized_model.predict(X_val)
print("MAE for Val Data:")
print(mean_absolute_error(y_val, y_val_pred))

In [None]:
# Lasso regression: fit one model per alpha and record the train and
# validation mean absolute error of each.
alpha = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]
train_mae = []
val_mae = []
for para in alpha:
    model = Lasso(alpha=para)
    model.fit(X_train, y_train)
    train_mae.append(mean_absolute_error(y_train, model.predict(X_train)))
    val_mae.append(mean_absolute_error(y_val, model.predict(X_val)))

# Plot against the index of each alpha so the roughly log-spaced grid is
# evenly spread on the x-axis.
plt.plot(range(len(train_mae)), train_mae)
plt.plot(range(len(val_mae)), val_mae)
# Tick positions must match the plotted x positions (0 .. len - 1) so each
# alpha label sits under its own data point.
plt.xticks(range(len(alpha)), alpha)
plt.title("MAE vs alpha")
# Labels are passed as one list, in the order the lines were plotted.
plt.legend(["train", "val"])

## Submission

You're almost done!

Before submitting this assignment, ensure that you have:

1. Restarted the Kernel (in the menubar, select Kernel$\rightarrow$Restart & Run All)
2. Validated the notebook by clicking the "Validate" button.

Then,

1. **Submit** the assignment via the Assignments tab in **DataHub**
1. **Upload and tag** the manually reviewed portions of the assignment on **Gradescope**