# Kaggle, the simple way

In [None]:
import pandas as pd

train = pd.read_csv('./data/train.csv')

train.head()

In [None]:
test = pd.read_csv('./data/test.csv')
test.head()

In [None]:
submission = pd.read_csv('./data/gender_submission.csv')
submission.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Target: the column the model has to predict.
y = train['Survived']

# Candidate features, listed explicitly so the selection is easy to edit.
feature_names = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]
X = train[feature_names]

model = DecisionTreeClassifier()

In [None]:
model.fit(X, y)

In [None]:
train.info()

In [None]:
# Narrow the feature set to a subset of the columns tried before.
columns = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
# Take an explicit copy: X is modified in later cells, and an independent
# frame makes it unambiguous that `train` itself is never touched.
X = train[columns].copy()

In [None]:
model.fit(X, y)

In [None]:
X.info()

In [None]:
# Replace missing ages with the mean age.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: that form works on an intermediate Series and is not
# guaranteed to update X (it does not under pandas Copy-on-Write).
X = X.assign(Age=X['Age'].fillna(X['Age'].mean()))
X.info()

In [None]:
model.fit(X, y)

In [None]:
model.predict(test[columns])

In [None]:
# Fill missing ages in the test set with the test-set mean age.
# Assign the filled column back rather than using a chained inplace fillna,
# which operates on an intermediate Series and may leave `test` unchanged.
test['Age'] = test['Age'].fillna(test['Age'].mean())

In [None]:
model.predict(test[columns])

In [None]:
test[columns].info()

In [None]:
# Fill missing fares in the test set with the test-set mean fare.
# Assign the filled column back rather than using a chained inplace fillna,
# which operates on an intermediate Series and may leave `test` unchanged.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
model.predict(test[columns])

In [None]:
# Predict on the selected test columns, then pair every prediction with the
# passenger id it belongs to.
test_features = test[columns]
y_predicted = model.predict(test_features)

output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_predicted,
})
output

In [None]:
output.to_csv('./upload.csv', index=False)

![xkcd 1831: Here to Help](./img/here_to_help_2x.png)

Source: https://xkcd.com/1831/

# Enter Automated ML

### Requirements:
- An Azure account
- A _Resource Group_ with a _Machine Learning Service Workspace_


In [None]:
import azureml.core

print("Azure ML SDK Version: ", azureml.core.VERSION)

In [None]:
# Authenticate if we haven't already

from azureml.core.authentication import InteractiveLoginAuthentication

credentials = InteractiveLoginAuthentication()

In [None]:
# Load the workspace info

from azureml.core import Workspace

ws = Workspace.from_config(auth=credentials)

In [None]:
# Create a new experiment if we haven't already

from azureml.core import Experiment
exp = Experiment(workspace=ws, name='Titanic')

In [None]:
import logging
from azureml.train.automl import AutoMLConfig

# See possible options here - https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric
# First AutoML configuration: a small run (3 iterations, 1-minute cap per
# iteration) on the training frame loaded from ./data/train.csv.
config = AutoMLConfig(
    task='classification',
    primary_metric='AUC_weighted',  # one of the metrics listed in the docs linked above
    iterations=3,
    training_data=train,
    label_column_name='Survived',  # same target column used for `y` earlier
    n_cross_validations=5,
    iteration_timeout_minutes=1,
    # NOTE(review): later SDK releases appear to replace `preprocess` with
    # `featurization` — confirm against the installed azureml version.
    preprocess=True,
    verbosity=logging.INFO,
    model_explainability=False
)

# ~1.5 minutes to run
local_run = exp.submit(config, show_output=True)

In [None]:
from azureml.widgets import RunDetails
RunDetails(local_run).show()

In [None]:
best_run, fitted_model = local_run.get_output()
print('Best run:', best_run)
print('-----')
print('Best model:', fitted_model)

In [None]:
def generate_submission_file(model, X_test, path='./upload.csv'):
    """Write a Kaggle submission CSV containing the model's predictions.

    Args:
        model: Any fitted object exposing ``predict(X)``.
        X_test: DataFrame of test rows. It must contain a ``PassengerId``
            column and is passed to ``model.predict`` unchanged.
        path: Destination of the CSV file. Defaults to ``'./upload.csv'``,
            the location this function previously always wrote to.

    The file has two columns, ``PassengerId`` and ``Survived``, and no index
    column. Nothing is returned.
    """
    output = pd.DataFrame({
        'PassengerId': X_test['PassengerId'],
        'Survived': model.predict(X_test),
    })
    output.to_csv(path, index=False)

In [None]:
generate_submission_file(fitted_model, pd.read_csv('./data/test.csv'))    

## Analyze the trained model

In [None]:
fitted_model.named_steps

In [None]:
# Pull the individual stages of the fitted pipeline out by name.
# NOTE(review): the step names are hard-coded and presumably depend on which
# pipeline AutoML picked as best — if a KeyError is raised, compare them with
# the keys of `fitted_model.named_steps`.
transformer = fitted_model.named_steps['datatransformer']
scaler = fitted_model.named_steps['StandardScalerWrapper']
classifier = fitted_model.named_steps['LightGBMClassifier']

In [None]:
transformer
# Other built-ins worth trying on this object in place of the line above:
#   type(transformer)
#   help(transformer)
#   dir(transformer)

In [None]:
scaler.model

In [None]:
classifier.model

In [None]:
transformer.get_engineered_feature_names()

In [None]:
len(transformer.get_engineered_feature_names())

In [None]:
transformer.get_featurization_summary()

In [None]:
test = pd.read_csv('./data/test.csv')
test.tail(10)

In [None]:
test_preprocessed = transformer.transform(test)

test_preprocessed

In [None]:
pd.DataFrame(test_preprocessed.toarray(), columns=transformer.get_engineered_feature_names())

In [None]:
test_scaled = scaler.transform(test_preprocessed)

test_scaled

In [None]:
pd.DataFrame(test_scaled.toarray(), columns=transformer.get_engineered_feature_names())

In [None]:
results = classifier.predict(test_scaled)

results

## More iterations

In [None]:
# ~7 minutes to run

config = AutoMLConfig(
    task='classification',
    primary_metric='AUC_weighted',
    iterations=15,
    training_data=train,
    label_column_name='Survived',
    n_cross_validations=5,
    preprocess=True,
    verbosity=logging.INFO,
    model_explainability=False
)

local_run = exp.submit(config, show_output=True)

In [None]:
best_run, fitted_model = local_run.get_output()

fitted_model    

In [None]:
fitted_model.named_steps

In [None]:
classifier = fitted_model.named_steps['prefittedsoftvotingclassifier']
classifier

In [None]:
classifier.__module__

In [None]:
classifier.estimators

In [None]:
classifier.weights

In [None]:
X_test = pd.read_csv('./data/test.csv')
generate_submission_file(fitted_model, X_test)

## Using explainability

In [None]:
config = AutoMLConfig(
    task='classification',
    primary_metric='AUC_weighted',
    iterations=5,
    training_data=train,
    label_column_name='Survived',
    n_cross_validations=5,
    preprocess=True,
    model_explainability=True
)

run_exp = exp.submit(config=config, show_output=True)

In [None]:
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient

best_run, fitted_model = run_exp.get_output()

client = ExplanationClient.from_run(best_run)

In [None]:
dir(client)

In [None]:
explanations = client.download_model_explanation(raw=False)

In [None]:
dir(explanations)

In [None]:
explanations.get_feature_importance_dict()

In [None]:
explanations.visualize()

In [None]:
# Preview the title captured from each name: the run of letters that follows
# a space and is immediately followed by a period.
# Raw string, so `\.` reaches the regex engine as an escaped dot instead of
# being an invalid Python string escape (a SyntaxWarning on newer Pythons).
train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=True)

## Lessons learned

In [None]:
def process(df: pd.DataFrame) -> pd.DataFrame:
    """Return a feature-engineered copy of a passenger frame.

    The input frame is left untouched. The returned copy:

    * gains boolean ``HasNoSibSp`` / ``HasNoParch`` columns, true where the
      respective count equals zero;
    * has ``SibSp`` and ``Parch`` cast to ``float64`` and ``Pclass`` cast to
      ``category``;
    * gains a ``Title`` column extracted from ``Name``, with ``Mlle`` and
      ``Ms`` folded into ``Miss`` and ``Mme`` folded into ``Mrs``;
    * no longer contains the ``PassengerId`` column.

    Args:
        df: Frame containing at least the columns ``PassengerId``,
            ``Pclass``, ``Name``, ``SibSp`` and ``Parch``.

    Returns:
        A new DataFrame with the changes listed above.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    frame = df.copy()

    # Flags are computed before the counts are cast to float.
    frame['HasNoSibSp'] = frame['SibSp'] == 0
    frame['HasNoParch'] = frame['Parch'] == 0

    frame['SibSp'] = frame['SibSp'].astype('float64')
    frame['Parch'] = frame['Parch'].astype('float64')
    frame['Pclass'] = frame['Pclass'].astype('category')

    # Raw string keeps `\.` a regex escape rather than an invalid string
    # escape; expand=False yields a Series for the single capture group.
    titles = frame['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Fold spelling variants into their common form in a single pass.
    frame['Title'] = titles.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    return frame.drop(columns=['PassengerId'])

In [None]:
train_df = process(train)

from azureml.train.automl import AutoMLConfig

config = AutoMLConfig(
    task='classification',
    primary_metric='accuracy',
    iterations=15,
    training_data=train_df,
    label_column_name='Survived',
    n_cross_validations=5,
    preprocess=True,
    model_explainability=True,
)

run_simplified = exp.submit(config=config, show_output=True)

In [None]:
best_run, fitted_model = run_simplified.get_output()

# Run the test set through the same process() step the training data received.
test = pd.read_csv('./data/test.csv')
test_df = process(test)

# process() drops PassengerId, so the ids come from the raw `test` frame
# while the predictions are made on the processed `test_df`.
output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': fitted_model.predict(test_df)
})

output.to_csv('./automl_simplified.csv', index=False)

In [None]:
client = ExplanationClient.from_run(best_run)
explanations = client.download_model_explanation(raw=False)
explanations.get_feature_importance_dict()    

# The End