In [None]:
# author: Arlin Cherian, Michelle Wang
# date: 2021-11-24

"Fits a Linear Regression Ridge Model and Random Forest Regressor model on the pre-processed training data on coffee quality rating and saves the model as a rds file.
Usage: src/fit_coffee_quality_rating_models.py --train=<train> --test=<test> --out_dir=<out_dir>
  
Options:
--train=<train>     Path (including filename) to training data in csv format
--test=<test>       Path (including filename) to testing data in csv format
--out_dir=<out_dir> Path to directory where the serialized model should be written
" -> doc

import os
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
from docopt import docopt
from sklearn import datasets
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from scipy.stats import loguniform
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)


# Seed NumPy's global random state so estimators created without an explicit
# random_state (e.g. RandomForestRegressor()) give repeatable results.
np.random.seed(2019)

# Parsed command-line arguments, keyed by option name (e.g. "--train").
opt = docopt(__doc__)

# Placeholder values left over from interactive development. main() receives
# its paths as arguments, so these module-level names are not read by it.
train = "data/"
test = "data/"
out_dir = "results"

def _make_one_hot_encoder():
    """Return a dense-output OneHotEncoder that ignores unseen categories."""
    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


def _mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """Cross-validate `model` and format every metric as "mean (+/- std)".

    Extra keyword arguments are forwarded to `cross_validate`. The result is
    a Series of strings indexed by the metric names `cross_validate` returns.
    """
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    means = scores.mean()
    stds = scores.std()
    return pd.Series(
        [f"{m:0.3f} (+/- {s:0.3f})" for m, s in zip(means, stds)],
        index=means.index,
    )


def main(train, test, out_dir):
    """Fit, compare and tune coffee-quality regressors and save the results.

    Parameters
    ----------
    train : str
        Path (including filename) to the training data in csv format.
    test : str
        Path (including filename) to the testing data in csv format.
    out_dir : str
        Directory that receives the output files; created if it is missing.

    Returns
    -------
    RandomizedSearchCV
        The fitted random search over the Ridge ``alpha`` hyperparameter.

    Notes
    -----
    Three files are written to ``out_dir``:

    * ``model_comparison.csv`` - 5-fold cross-validation summary for the
      untuned Ridge and Random Forest pipelines.
    * ``ridge_tuning_summary.csv`` - best ``alpha``, its mean CV score, and
      the score of the refitted pipeline on the test data.
    * ``ridge_model.pickle`` - the refitted best Ridge pipeline.
    """
    # Imported here because the module's import block does not provide it.
    import pickle

    target = "total_cup_points"

    train_df = pd.read_csv(train)
    test_df = pd.read_csv(test)

    # Split each frame into features and target.
    X_train = train_df.drop(columns=[target])
    y_train = train_df[target]
    X_test = test_df.drop(columns=[target])
    y_test = test_df[target]

    # Column groups for the preprocessor.
    numeric_features = [
        "moisture",
        "quakers",
        "altitude_mean_meters",
    ]
    categorical_features = [
        "country_of_origin",
        "harvest_year",
        "variety",
        "processing_method",
        "category_one_defects",
        "color",
        "category_two_defects",
        "region",
    ]

    preprocessor = make_column_transformer(
        (StandardScaler(), numeric_features),
        (_make_one_hot_encoder(), categorical_features),
    )

    # Baseline comparison of the two regressors with default hyperparameters.
    models = {
        "Ridge": Ridge(),
        "RForest_Regressor": RandomForestRegressor(),
    }
    results_dict = {
        name: _mean_std_cross_val_scores(
            make_pipeline(preprocessor, model),
            X_train,
            y_train,
            cv=5,
            return_train_score=True,
        )
        for name, model in models.items()
    }
    results_df = pd.DataFrame(results_dict)

    # Hyperparameter optimization for Ridge. make_pipeline names the step
    # after the lower-cased class name, hence the "ridge__" prefix.
    param_dist = {"ridge__alpha": loguniform(1e-3, 1e3)}
    pipe = make_pipeline(preprocessor, Ridge())
    random_search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist,
        n_jobs=-1,
        n_iter=10,
        cv=5,
        random_state=123,
        return_train_score=True,
    )
    random_search.fit(X_train, y_train)

    # score() uses the best pipeline refitted on the full training data.
    tuning_summary = pd.DataFrame(
        {
            "best_alpha": [random_search.best_params_["ridge__alpha"]],
            "best_cv_score": [random_search.best_score_],
            "test_score": [random_search.score(X_test, y_test)],
        }
    )

    # Persist everything that was computed.
    os.makedirs(out_dir, exist_ok=True)
    results_df.to_csv(os.path.join(out_dir, "model_comparison.csv"))
    tuning_summary.to_csv(
        os.path.join(out_dir, "ridge_tuning_summary.csv"), index=False
    )
    with open(os.path.join(out_dir, "ridge_model.pickle"), "wb") as model_file:
        pickle.dump(random_search.best_estimator_, model_file)

    return random_search
    
if __name__ == "__main__":
    # Parse the command line here so the entry point does not rely on how a
    # module-level variable happens to be spelled.
    cli_args = docopt(__doc__)
    main(cli_args["--train"], cli_args["--test"], cli_args["--out_dir"])