# Train from data in Cloud Storage: data.py


In [5]:
from google.cloud import storage


def get_data_using_pandas(line_count):

    # get data from aws s3
    # url = "s3://wagon-public-datasets/taxi-fare-train.csv"
    # df = pd.read_csv(url, nrows=100)

    # load n lines from my csv
    df = pd.read_csv("gs://lewagon_batch_869_thierry/data/train_1k.csv", nrows=line_count)
    return df


def get_data_using_blob(line_count):

    # get data from aws s3
    # url = "s3://wagon-public-datasets/taxi-fare-train.csv"

    # get data from my google storage bucket
    BUCKET_NAME = "lewagon_batch_869_thierry"
    BUCKET_TRAIN_DATA_PATH = "data/train_1k.csv"

    data_file = "train_1k.csv"

    client = storage.Client()  # verifies $GOOGLE_APPLICATION_CREDENTIALS

    bucket = client.bucket(BUCKET_NAME)

    blob = bucket.blob(BUCKET_TRAIN_DATA_PATH)

    blob.download_to_filename(data_file)

    # load downloaded data to dataframe
    df = pd.read_csv(data_file, nrows=line_count)

    return df

In [7]:
!ls

'gcp boilerplate.ipynb'   train_1k.csv


In [4]:
import pandas as pd
get_data_using_pandas(10)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
5,2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1
6,2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
7,2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1
8,2012-12-03 13:10:00.000000125,9.0,2012-12-03 13:10:00 UTC,-74.006462,40.726713,-73.993078,40.731628,1
9,2009-09-02 01:11:00.00000083,8.9,2009-09-02 01:11:00 UTC,-73.980658,40.733873,-73.99154,40.758138,2


In [6]:
get_data_using_blob(10)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
5,2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1
6,2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
7,2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1
8,2012-12-03 13:10:00.000000125,9.0,2012-12-03 13:10:00 UTC,-74.006462,40.726713,-73.993078,40.731628,1
9,2009-09-02 01:11:00.00000083,8.9,2009-09-02 01:11:00 UTC,-73.980658,40.733873,-73.99154,40.758138,2


# Save trained model to Cloud Storage: data.py

In [None]:
def save_model_to_gcp():

    BUCKET_NAME = "le-wagon-data"
    storage_location = "models/random_forest_model.joblib"
    local_model_filename = "model.joblib"

    client = storage.Client()

    bucket = client.bucket(BUCKET_NAME)

    blob = bucket.blob(storage_location)

    blob.upload_from_filename(local_model_filename)

# Train in the AI Platform

## Makefile

In [None]:
# bucket
BUCKET_NAME=le-wagon-data

# training folder
BUCKET_TRAINING_FOLDER=trainings

# training params
REGION=europe-west1

# app environment
PYTHON_VERSION=3.7
FRAMEWORK=scikit-learn
RUNTIME_VERSION=2.2

# package params
PACKAGE_NAME=taxifare
FILENAME=trainer

##### Job - - - - - - - - - - - - - - - - - - - - - - - - -

JOB_NAME=taxi_fare_training_$(shell date +'%Y%m%d_%H%M%S')

gcp_submit_training:
	gcloud ai-platform jobs submit training ${JOB_NAME} \
		--job-dir gs://${BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \
		--package-path ${PACKAGE_NAME} \
		--module-name ${PACKAGE_NAME}.${FILENAME} \
		--python-version=${PYTHON_VERSION} \
		--runtime-version=${RUNTIME_VERSION} \
		--region ${REGION} \
		--stream-logs

## MANIFEST.in

In [None]:
include requirements.txt
graft taxifare
global-exclude *.py[cod] __pycache__ *.so *.ipynb

In [17]:
class Trainer():
    
    def __init__(self,example, *args, **kwargs):
        
        self.args = args
        self.kwargs = kwargs
        self.example = example

In [18]:
trainer = Trainer('hello', 'how are you', 34, name='Adam', lastname = 'Guati')

In [19]:
trainer.example

'hello'

In [20]:
trainer.args

('how are you', 34)

In [31]:
print(trainer.kwargs.get('uba'))

None


In [28]:
trainer.kwargs.keys()

dict_keys(['name', 'lastname'])

In [33]:
params = dict(nrows=10000,
              upload=True,
              local=False,  # set to False to get data from GCP (Storage or BigQuery)
              gridsearch=False,
              optimize=True,
              estimator="xgboost",
              mlflow=True,  # set to True to log params to mlflow
              experiment_name='Lewagon experiment',
              pipeline_memory=None, # None if no caching and True if caching expected
              params: { }
              n_jobs=-1) # Try with njobs=1 and njobs = -1

In [None]:
TPOT

In [35]:
from xgboost import XGBRegressor

In [36]:
XGBRegressor()

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)

In [38]:
from sklearn.linear_model import LinearRegression

In [39]:
LinearRegression()

LinearRegression()

In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
RandomForestRegressor()

RandomForestRegressor()

In [34]:
params

{'nrows': 10000,
 'upload': True,
 'local': False,
 'gridsearch': False,
 'optimize': True,
 'estimator': 'xgboost',
 'mlflow': True,
 'experiment_name': 'Lewagon experiment',
 'pipeline_memory': None,
 'distance_type': 'manhattan',
 'feateng': ['distance_to_center',
  'direction',
  'distance',
  'time_features',
  'geohash'],
 'n_jobs': -1}

In [None]:
#Preprocessing
#TPOT

#Baselinemodel
#GridSearch Model
