In [0]:
pip install mlflow

In [0]:
# import relevant functions
import pandas as pd
from sklearn.model_selection import train_test_split
import boto3

In [0]:
# Low-level S3 client; credentials/region are resolved by boto3's default chain.
s3 = boto3.client(service_name="s3")

In [0]:
# Load the three raw CSV tables from S3 into pandas DataFrames.
bucket = "columbia-gr5069-main"
drivers_data, races_data, results_data = (
    "raw/drivers.csv",
    "raw/races.csv",
    "raw/results.csv",
)

# Fetch all three objects first (drivers, races, results — in that order) ...
drivers, races, results = [
    s3.get_object(Bucket=bucket, Key=object_key)
    for object_key in (drivers_data, races_data, results_data)
]

# ... then parse each response's streaming "Body" as CSV.
df_drivers, df_races, df_results = [
    pd.read_csv(response["Body"])
    for response in (drivers, races, results)
]

In [0]:
# Render the drivers table loaded from raw/drivers.csv for a visual sanity check.
display(df_drivers)

In [0]:
# Render the races table loaded from raw/races.csv for a visual sanity check.
display(df_races)

In [0]:
# Render the results table loaded from raw/results.csv for a visual sanity check.
display(df_results)

In [0]:
# Enrich every result row with its driver's attributes and its race's attributes.
# Left joins keep all result rows even when a driver or race lookup finds no match.
drivers_results = (
    df_results
    .merge(df_drivers, on="driverId", how="left")
    .merge(df_races, on="raceId", how="left")
)


In [0]:
# Calculate each driver's age, in completed years, on the day of the race.
drivers_results['dob'] = pd.to_datetime(drivers_results['dob'])
drivers_results['date'] = pd.to_datetime(drivers_results['date'])

race_date = drivers_results['date']
birth_date = drivers_results['dob']

# True where the race takes place before the driver's birthday in the race's
# calendar year, i.e. the driver has not yet completed that year of age.
# The previous version compared `date.year - dob.month` against 0, which is
# never negative, so the correction was silently never applied.
birthday_pending = (
    (race_date.dt.month < birth_date.dt.month)
    | (
        (race_date.dt.month == birth_date.dt.month)
        & (race_date.dt.day < birth_date.dt.day)
    )
)

# Year difference, minus one when the birthday has not happened yet that year.
# Rows with a missing date/dob yield NaN (the comparisons above are False there).
drivers_results['age'] = (
    (race_date.dt.year - birth_date.dt.year) - birthday_pending.astype(int)
)

In [0]:
# Swap the categorical `nationality` column for one indicator column per value
# (named `nationality_<value>`). Note: this rebinds `drivers_results`, so the
# cell cannot be re-run on its own output — the `nationality` column is gone.
drivers_results = pd.get_dummies(
    data=drivers_results,
    columns=["nationality"],
)
display(drivers_results)

In [0]:
# Build and display the final modelling dataframe: identifiers, the target
# (`positionOrder`), the driver's age, and every nationality indicator column.
#
# The indicator columns are discovered by their `nationality_` prefix rather
# than listed by hand: a hand-written list raises KeyError as soon as one
# nationality is absent from the data and silently drops any new one.
# Column order follows `drivers_results`, i.e. the order get_dummies produced.
nationality_cols = [
    col for col in drivers_results.columns
    if str(col).startswith("nationality_")
]
df = drivers_results[["raceId", "driverId", "positionOrder", "age"] + nationality_cols]
display(df)

In [0]:
# Split data into training and test subsets.
# Target: finishing order of the driver in the race.
y = df["positionOrder"]

# Features: age plus every one-hot nationality column present in `df`.
# Deriving the list from `df` keeps it in sync with the dataframe built above
# instead of maintaining a second hand-written copy of the column names;
# `raceId`, `driverId` and the target are deliberately left out.
feature_cols = ["age"] + [
    col for col in df.columns if str(col).startswith("nationality_")
]
X = df.loc[:, feature_cols]

# Fixed random_state so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=15)

In [0]:
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [0]:
# Train a baseline random forest inside an MLflow run, logging the fitted
# model and its test-set mean squared error.
with mlflow.start_run(run_name="Basic RF Experiment") as run:
  # Create model, train it, and create predictions.
  # The forest is seeded (same seed as the train/test split) so that the
  # logged metric is reproducible across re-runs of the notebook.
  rf = RandomForestRegressor(random_state=15)
  rf.fit(X_train, y_train)
  predictions = rf.predict(X_test)

  # Log model
  mlflow.sklearn.log_model(rf, "random-forest-model")

  # Create metrics
  mse = mean_squared_error(y_test, predictions)
  print("  mse: {}".format(mse))

  # Log metrics
  mlflow.log_metric("mse", mse)

  # `run_id` replaces the deprecated `run_uuid` attribute on RunInfo.
  runID = run.info.run_id
  experimentID = run.info.experiment_id

  print("Inside MLflow Run with run_id {} and experiment_id {}".format(runID, experimentID))