# Kaggle, the easy way

In [None]:
import pandas as pd

# Load the training data from the local data folder.
train = pd.read_csv('./data/train.csv')

# Show the first rows as the cell output.
train.head()

In [None]:
# Load the test data; later cells predict on this frame.
test = pd.read_csv('./data/test.csv')
test.head()

In [None]:
# Load the sample submission file and preview its first rows.
submission = pd.read_csv('./data/gender_submission.csv')
submission.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Target column to predict.
y = train['Survived']
# First attempt: use every remaining column as a feature, including ones such
# as 'Name', 'Sex', 'Ticket', 'Cabin' and 'Embarked'.
# NOTE(review): those columns presumably hold strings, which the tree cannot
# fit on directly; a later cell narrows the selection to `columns` — confirm.
X = train[['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]

# Decision tree with default hyper-parameters.
model = DecisionTreeClassifier()

In [None]:
# Fit on the full feature selection.
# NOTE(review): presumably fails here because X still includes the text
# columns selected above — confirm.
model.fit(X, y)

In [None]:
# Restrict the features to this hand-picked subset; `columns` is reused later
# to select the same features from the test frame.
columns = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X = train[columns]

In [None]:
# Retry the fit on the reduced feature set.
# NOTE(review): 'Age' is only imputed a few cells below, so this presumably
# fails on missing values (may depend on the scikit-learn version) — confirm.
model.fit(X, y)

In [None]:
X.info()

In [None]:
# Fill missing ages with the mean age of the training rows.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: `X` is itself a selection of `train`, so the in-place form
# is chained assignment, which triggers pandas warnings and leaves `X`
# unchanged when copy-on-write is enabled.
X = X.assign(Age=X['Age'].fillna(X['Age'].mean()))
X.info()

In [None]:
# Fit again now that the 'Age' gaps in X were filled in the previous cell.
model.fit(X, y)

In [None]:
# Predict on the same feature columns of the test frame.
# NOTE(review): test['Age'] is only imputed in the next cell, so this
# presumably fails on missing values — confirm.
model.predict(test[columns])

In [None]:
# Fill missing ages in the test frame with the test-set mean age.
# Assign the result back rather than using fillna(inplace=True) on the column
# selection: that chained in-place form is deprecated by pandas and does not
# update `test` when copy-on-write is enabled.
test['Age'] = test['Age'].fillna(test['Age'].mean())

In [None]:
# Predict again after imputing 'Age'.
# NOTE(review): 'Fare' is only imputed two cells below, so this presumably
# still fails on a missing value — confirm.
model.predict(test[columns])

In [None]:
test[columns].info()

In [None]:
# Fill missing fares in the test frame with the test-set mean fare.
# Assign the result back rather than using fillna(inplace=True) on the column
# selection: that chained in-place form is deprecated by pandas and does not
# update `test` when copy-on-write is enabled.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Predict once more, now that both 'Age' and 'Fare' were imputed in `test`.
model.predict(test[columns])

In [None]:
# Score the test features with the fitted tree.
y_predicted = model.predict(test[columns])

# Two-column submission frame: the passenger ids from `test`, with the
# predictions attached as the 'Survived' column.
output = test[['PassengerId']].assign(Survived=y_predicted)
output

In [None]:
# Write the submission to ./upload.csv without the DataFrame index column.
output.to_csv('./upload.csv', index=False)

# Enter Azure

In [None]:
import azureml.core

# Print the installed Azure ML SDK version.
print("Azure ML SDK Version: ", azureml.core.VERSION)

In [None]:
# Authenticate if we haven't already. The resulting credentials object is
# passed to Workspace.from_config in the next cell.

from azureml.core.authentication import InteractiveLoginAuthentication

credentials = InteractiveLoginAuthentication()

In [None]:
# Load the workspace info using the credentials obtained above.
# NOTE(review): from_config presumably reads a local workspace config file —
# confirm one is present next to the notebook.

from azureml.core import Workspace

ws = Workspace.from_config(auth=credentials)

In [None]:
# Create a new experiment if we haven't already. Every AutoML run below is
# submitted through this 'Kaggle' experiment via exp.submit.

from azureml.core import Experiment
exp = Experiment(workspace=ws, name='Kaggle')

In [None]:
# Reload the raw training data for the AutoML runs.
df = pd.read_csv('./data/train.csv')

# pop() removes 'Survived' from df in place and returns it as the target, so
# X (the same object as df) is left with all remaining columns.
y = df.pop('Survived')
X = df

In [None]:
X.info()

In [None]:
import logging
from azureml.train.automl import AutoMLConfig

# See possible options here - https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric


# First AutoML configuration: a classification task on the full X / y with
# 'AUC_weighted' as the primary metric, limited to 3 iterations, using 5
# cross-validation folds, with preprocessing on and explainability off.
config = AutoMLConfig(
    task='classification',
    primary_metric='AUC_weighted',
    iterations=3,
    X=X,
    y=y,
    n_cross_validations=5,
    preprocess=True,
    verbosity=logging.INFO,
    model_explainability=False
)

# Submit the configuration through the 'Kaggle' experiment and keep the run
# handle; show_output=True asks for progress output in the cell.
local_run = exp.submit(config, show_output=True)

In [None]:
from azureml.widgets import RunDetails
# Show the run-details widget for the run submitted above.
RunDetails(local_run).show()

In [None]:
# Retrieve the best run and its fitted model from the AutoML run, then print
# both, separated by a divider line.
best_run, fitted_model = local_run.get_output()
print('Best run:', best_run)
print('-----')
print('Best model:', fitted_model)

In [None]:
def generate_submission_file(model, X_test, passenger_ids=None, path='./upload.csv'):
    """Predict survival for ``X_test`` and write a Kaggle submission CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature frame passed unchanged to ``model.predict``.
        passenger_ids: Ids to pair with the predictions. When omitted they are
            taken from ``X_test['PassengerId']`` if that column exists,
            otherwise from the notebook-level ``test`` frame (the original
            behaviour, still needed when the id column was dropped from
            ``X_test`` before predicting).
        path: Destination of the CSV; defaults to the original
            ``./upload.csv``.

    The file has two columns, ``PassengerId`` and ``Survived``, and no index
    column.
    """
    y_predicted = model.predict(X_test)

    if passenger_ids is None:
        if 'PassengerId' in getattr(X_test, 'columns', ()):
            # Prefer the ids that travel with the rows being scored, so the
            # function does not depend on notebook-global state.
            passenger_ids = X_test['PassengerId']
        else:
            # Backward-compatible fallback: ids from the global `test` frame.
            passenger_ids = test['PassengerId']

    output = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': y_predicted})
    output.to_csv(path, index=False)

In [None]:
# Write a submission from the AutoML model, scoring a freshly loaded test.csv.
generate_submission_file(fitted_model, pd.read_csv('./data/test.csv'))    

## Analyze the trained model

In [None]:
# Pull the individual steps out of the fitted model by name.
# NOTE(review): the 'MaxAbsScaler' and 'LightGBMClassifier' keys presumably
# depend on which pipeline AutoML selected — confirm they exist after a re-run.
transformer = fitted_model.named_steps['datatransformer']
scaler = fitted_model.named_steps['MaxAbsScaler']
classifier = fitted_model.named_steps['LightGBMClassifier']    

In [None]:
transformer

In [None]:
scaler

In [None]:
classifier

In [None]:
transformer.get_engineered_feature_names()

In [None]:
len(transformer.get_engineered_feature_names())

In [None]:
transformer.get_featurization_summary()

In [None]:
# Reload the raw test.csv, replacing the `test` frame whose 'Age' and 'Fare'
# gaps were filled earlier in the notebook.
test = pd.read_csv('./data/test.csv')
test.info()

In [None]:
# Apply the data transformer step on its own to the raw test rows.
test_preprocessed = transformer.transform(test)

# Display the transformed result as the cell output.
test_preprocessed

In [None]:
# View the transformed features as a DataFrame labelled with the engineered
# feature names.
# NOTE(review): .toarray() suggests the transformer output is a sparse matrix — confirm.
pd.DataFrame(test_preprocessed.toarray(), columns=transformer.get_engineered_feature_names())

In [None]:
test.tail(10)

In [None]:
# Apply the scaler step to the transformer output.
test_scaled = scaler.transform(test_preprocessed)

# Display the scaled result as the cell output.
test_scaled

In [None]:
# View the scaled features as a DataFrame with the same engineered feature names.
pd.DataFrame(test_scaled.toarray(), columns=transformer.get_engineered_feature_names())

In [None]:
# Run the classifier step directly on the transformed and scaled features,
# i.e. the last of the three steps applied by hand in this section.
results = classifier.predict(test_scaled)

results

## More iterations, no more PassengerId

In [None]:
# Remove the row identifier from the features for this run.
# errors='ignore' keeps the cell safe to re-run: without it a second execution
# raises KeyError because 'PassengerId' is already gone from X.
X = X.drop(columns=['PassengerId'], errors='ignore')

# Same setup as the first AutoML run, but with 10 iterations instead of 3.
config = AutoMLConfig(
    task='classification',
    primary_metric='AUC_weighted',
    iterations=10,
    X=X,
    y=y,
    n_cross_validations=5,
    preprocess=True,
    verbosity=logging.INFO,
    model_explainability=False
)

# Submit through the 'Kaggle' experiment and keep the new run handle.
local_run = exp.submit(config, show_output=True)

In [None]:
# Fetch the best run and fitted model of the 10-iteration run, then display
# the fitted model as the cell output.
best_run, fitted_model = local_run.get_output()

fitted_model    

In [None]:
# Pull the final estimator out of the fitted model by step name.
# NOTE(review): the 'prefittedsoftvotingclassifier' key presumably exists only
# when AutoML picked an ensemble as best model — confirm after a re-run.
classifier = fitted_model.named_steps['prefittedsoftvotingclassifier']
classifier

In [None]:
classifier.__module__

In [None]:
classifier.estimators

In [None]:
# Score test.csv without 'PassengerId', matching the training features of this
# section (the column was dropped from X before the run), and write the
# submission file.
X_test = pd.read_csv('./data/test.csv').drop(columns=['PassengerId'])
generate_submission_file(fitted_model, X_test)

## Better metrics

In [None]:
# Same setup as the previous section (10 iterations, 5 cross-validation
# folds), but with 'accuracy' as the primary metric instead of 'AUC_weighted'.
config = AutoMLConfig(
    task='classification',
    primary_metric='accuracy',
    iterations=10,
    X=X,
    y=y,
    n_cross_validations=5,
    preprocess=True,
    verbosity=logging.INFO,
    model_explainability=False
)

# Submit through the 'Kaggle' experiment and keep the new run handle.
local_run = exp.submit(config, show_output=True)

In [None]:
# Fetch the best run and fitted model of the accuracy-optimised run.
best_run, fitted_model = local_run.get_output()

# Score test.csv without 'PassengerId' and write the submission file.
X_test = pd.read_csv('./data/test.csv').drop(columns=['PassengerId'])
generate_submission_file(fitted_model, X_test)

## Explainability

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the training data as a validation set for the
# explainability run. A fixed random_state makes the split, and therefore the
# feature-importance output below, reproducible across notebook runs.
# Note that this rebinds X_test, which previously held the Kaggle test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Explainability run: trains on the X_train / y_train split and validates on
# the explicit X_valid / y_valid hold-out instead of cross-validation, with
# 5 iterations, 'accuracy' as primary metric and model_explainability on.
config = AutoMLConfig(
    task='classification',
    primary_metric='accuracy',
    iterations=5,
    X = X_train,
    y = y_train,
    X_valid = X_test,
    y_valid = y_test,
    preprocess=True,
    verbosity=logging.INFO,
    model_explainability=True
)


# Submit through the 'Kaggle' experiment and keep the new run handle.
local_run = exp.submit(config, show_output=True)

In [None]:
from azureml.train.automl.automlexplainer import explain_model

# Fetch the model produced by the explainability run submitted above. Without
# this, `fitted_model` would still be the model of the earlier "Better
# metrics" run, which was trained on all of X, including the rows now held
# out as X_test.
best_run, fitted_model = local_run.get_output()

# Explain the model on the train / validation split used for this run.
shap_values, expected_values, overall_summary, overall_imp, per_class_summary, per_class_imp = \
    explain_model(fitted_model, X_train, X_test, features=X_train.columns)

# Print the overall and per-class results returned by explain_model.
print('===Overall Summary===')
print(overall_summary)
print('===Overall Importance===')
print(overall_imp)
print('===Per-Class Summary===')
print(per_class_summary)
print('===Per-Class Importance===')
print(per_class_imp)

In [None]:
import numpy as np

# Combine the overall importance results into a two-column table, labelled as
# in the original cell: overall_imp -> 'Column', overall_summary -> 'Weight'.
# The frame is built from a dict instead of np.stack(...).T because stacking
# the two sequences into one array coerces both to a common dtype, turning
# the weights into strings; a dict keeps each column's own dtype.
overall_importance = pd.DataFrame({'Column': overall_imp, 'Weight': overall_summary})

In [None]:
overall_importance.head(10)

# The End