# ML Notebook
## Bailey Snee and Trinity Gahagen

In [54]:
# Basic imports
import pandas as pd
import numpy as np

# scikit-learn imports
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_log_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor, Ridge, LinearRegression
from sklearn.pipeline import Pipeline

In [21]:
# Read the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [22]:
# Preview the first five rows of the training data.
train.head(n=5)

Convert the `datetime` column to a datetime dtype and split it into year, month, day, and hour components.

In [23]:
# Parse the `datetime` column into pandas datetime values so the `.dt`
# accessor can be used on it afterwards.
train['datetime'] = train['datetime'].pipe(pd.to_datetime)

# Display the resulting dtype as a quick sanity check.
train.dtypes['datetime']

In [24]:
# Derive calendar features from the parsed timestamp. Columns are appended
# in the order year, month, day, hour.
train = train.assign(
    year=lambda frame: frame['datetime'].dt.year,
    month=lambda frame: frame['datetime'].dt.month,
    day=lambda frame: frame['datetime'].dt.day,
    hour=lambda frame: frame['datetime'].dt.hour,
)

Drop the `datetime`, `atemp`, `casual`, and `registered` columns.

In [25]:
# Remove the raw timestamp (already split into components) together with
# the atemp, casual and registered columns, then summarize what is left.
train = train.drop(['datetime', 'atemp', 'casual', 'registered'], axis='columns')
train.info()

# Model Building

In [58]:
# Separate the prediction target (`count`) from the feature matrix.
y = train['count']
X = train.drop('count', axis='columns')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Base random forest used as the estimator for the grid search.
# The seed matches the one used for train_test_split so that the bootstrap
# samples and feature sub-sampling -- and therefore the selected
# hyperparameters and scores -- are reproducible after a kernel restart.
rf = RandomForestRegressor(random_state=42)

In [63]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
# Hyperparameter grid for the random forest: 3 values for each of the five
# parameters, i.e. 3**5 = 243 candidate combinations.
rf_param = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
)

In [41]:
# Exhaustive 3-fold grid search over `rf_param`, ranked by negated RMSLE
# (higher is better). All CPU cores are used and progress is logged verbosely.
rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=rf_param,
    scoring='neg_root_mean_squared_log_error',
    cv=3,
    n_jobs=-1,
    verbose=10,
)

In [42]:
# Run the random-forest grid search on the standardized training features.
rf_cv.fit(X=X_train_scaled, y=y_train)

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
rf_cv.best_params_

In [61]:
# Cross-validated score of the best candidate. The scorer is the *negated*
# RMSLE, so the value is <= 0 and closer to 0 is better.
rf_cv.best_score_

In [60]:
# Take the winning random forest from the grid search and predict on the
# held-out split.
best_rf_model = rf_cv.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)

# Coefficient of determination on the hold-out data.
r2_score(y_true=y_test, y_pred=y_pred)

In [43]:
# Base gradient-boosting regressor used as the estimator for the grid search.
# The grid includes subsample=0.8, which draws a random subset of rows for
# every boosting stage, so a fixed seed (same value as the train/test split)
# is needed for the search results to be reproducible.
gbr = GradientBoostingRegressor(random_state=42)

In [44]:
# Hyperparameter grid for gradient boosting:
# 3 * 3 * 3 * 2 * 3 * 3 = 486 candidate combinations.
gbr_param = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [45]:
def gbr_rmsle_scorer(estimator, X, y):
    """Score an estimator by negated RMSLE, clipping predictions at zero.

    The built-in 'neg_root_mean_squared_log_error' scorer raises a ValueError
    when a prediction is negative (at or below -1 in newer scikit-learn
    releases). Gradient boosting is not constrained to non-negative outputs,
    and GridSearchCV turns such a scoring failure into a NaN score with only
    a warning, so affected candidates are silently ranked last. Clipping
    keeps every candidate comparable; scores of candidates that never
    predict below zero are unchanged.

    Parameters
    ----------
    estimator : fitted regressor exposing ``predict``.
    X : validation features.
    y : true (non-negative) targets.

    Returns
    -------
    float
        ``-RMSLE``; higher (closer to 0) is better.
    """
    clipped_pred = np.clip(estimator.predict(X), 0, None)
    return -root_mean_squared_log_error(y, clipped_pred)


# 3-fold grid search over `gbr_param`, ranked by negated RMSLE.
gbr_cv = GridSearchCV(gbr, gbr_param, cv=3, scoring=gbr_rmsle_scorer, n_jobs=-1, verbose=1)

In [46]:
# Run the gradient-boosting grid search on the standardized training features.
gbr_cv.fit(X=X_train_scaled, y=y_train)

In [50]:
# Hyperparameter combination that achieved the best cross-validated score.
gbr_cv.best_params_

In [51]:
# Cross-validated score of the best candidate. The score is a *negated*
# RMSLE, so the value is <= 0 and closer to 0 is better.
gbr_cv.best_score_

In [62]:
# Support vector regressor created with default settings; C, epsilon, kernel
# and gamma are supplied by the grid search that follows.
svr = SVR()

In [64]:
# Hyperparameter grids for the SVR search, one per kernel.
# `gamma` is only used by the rbf kernel here; the linear kernel ignores it.
# Keeping `gamma` in a single shared grid would fit every linear candidate
# three times with identical results, so the grid is split by kernel
# (16 linear + 48 rbf = 64 candidates instead of 96).
# GridSearchCV accepts a list of grids and searches their union.
svr_param = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.1, 0.2, 0.5, 1],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.1, 0.2, 0.5, 1],
        'gamma': [0.01, 0.1, 1],
    },
]

In [65]:
def svr_rmsle_scorer(estimator, X, y):
    """Score an estimator by negated RMSLE, clipping predictions at zero.

    SVR outputs are unbounded, so some candidates predict negative values.
    The built-in 'neg_root_mean_squared_log_error' scorer raises a ValueError
    on such predictions (at or below -1 in newer scikit-learn releases), and
    GridSearchCV converts that failure into a NaN score with only a warning.
    Those candidates are then ranked last regardless of their actual quality.
    Clipping at zero keeps every candidate comparable; scores of candidates
    that never predict below zero are unchanged.

    Parameters
    ----------
    estimator : fitted regressor exposing ``predict``.
    X : validation features.
    y : true (non-negative) targets.

    Returns
    -------
    float
        ``-RMSLE``; higher (closer to 0) is better.
    """
    clipped_pred = np.clip(estimator.predict(X), 0, None)
    return -root_mean_squared_log_error(y, clipped_pred)


# 3-fold grid search over `svr_param`, ranked by negated RMSLE.
svr_cv = GridSearchCV(svr, svr_param, cv=3, scoring=svr_rmsle_scorer, n_jobs=-1, verbose=1)

In [66]:
# Run the SVR grid search on the standardized training features.
svr_cv.fit(X=X_train_scaled, y=y_train)

In [67]:
# Hyperparameter combination that achieved the best cross-validated score.
svr_cv.best_params_

In [69]:
# Cross-validated score of the best candidate. The score is a *negated*
# RMSLE, so the value is <= 0 and closer to 0 is better.
svr_cv.best_score_

In [71]:
# Take the winning SVR from the grid search and predict on the held-out split.
best_svr_model = svr_cv.best_estimator_
y_pred = best_svr_model.predict(X_test_scaled)

# Coefficient of determination on the hold-out data.
r2_score(y_true=y_test, y_pred=y_pred)

0.31315346259306587