In [95]:
import json
import pandas as pd
import copy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [49]:
doc = json.load(open('spot-advisor-data.json'))
prices_pd = pd.read_csv('data.csv')[['instanceType', 'major', 'minor', 'Type']].drop_duplicates()

In [96]:
class OneHotEncoderTransformer(BaseEstimator, TransformerMixin):
    """
    Apply one hot encoding to each category feature.
    This transformer returns `DataFrame` instead of default `numpy.ndarray`
    """

    def __init__(self):
        self.ohe = OneHotEncoder(sparse=False, drop='first')

    def fit(self, X, y=None):
        X_ = X.select_dtypes(include='category')
        self.ohe.fit(X_, y)
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_cat = X.select_dtypes(include='category')
        X_ohe = pd.DataFrame(self.ohe.transform(X_cat), columns=self.ohe.get_feature_names(X_cat.columns), index=X_.index)
        X_ = pd.concat([X_.drop(columns=X_cat.columns), X_ohe], axis=1)
        return X_

In [110]:
def get_extra_data_for_instance_type(instanceType):
    row = prices_pd[prices_pd.instanceType == instanceType]
    if len(row) == 0:
        return None
    return dict(prices_pd[prices_pd.instanceType == instanceType].iloc[0])


def get_X_y():
    l = []
    for region in doc['spot_advisor']:
        subdoc = doc['spot_advisor'][region]
        for os in subdoc:
            subsubdoc = subdoc[os]
            for instanceType in subsubdoc:
                subsubsubdoc = subsubdoc[instanceType]
                row = copy.deepcopy(subsubsubdoc)
                row['region'] = region
                row['os'] = os
                row['instanceType'] = instanceType
                for attr in doc['instance_types'][instanceType]:
                    row[attr] = doc['instance_types'][instanceType][attr]
                row['major'], row['minor'] = row['instanceType'].split('.')
                l.append(row)
    data = pd.DataFrame(l)

    data = data.astype({
        's': int,
        'region': 'category',
        'os': 'category',
        'instanceType': 'category',
        'ram_gb': int,
        'emr': bool,
        'cores': int,
        'major': 'category',
        'minor': 'category',
    })

    pipe = Pipeline([
        ('ohe', OneHotEncoderTransformer()),
    ])

    X, y = data.drop(columns=['r']), data['r']
    return train_test_split(X, y, test_size=0.33, random_state=42)

In [113]:
def create_pipeline():
    return Pipeline([
        ('ohe', OneHotEncoderTransformer()),
        ('logistic', LogisticRegression(multi_class='multinomial')),
    ])

In [114]:
X_train, X_test, y_train, y_test = get_X_y()

In [115]:
model = create_pipeline()

In [116]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('ohe', OneHotEncoderTransformer()),
                ('logistic', LogisticRegression(multi_class='multinomial'))])

In [117]:
model.score(X_test, y_test)



0.39955481357818584

In [118]:
model.predict(X_test)



array([1, 1, 4, ..., 4, 0, 1])

In [120]:
df = pd.DataFrame(_118)
df['ground_truth'] = y_test
df

Unnamed: 0,0,ground_truth
0,1,2.0
1,1,
2,4,
3,1,4.0
4,2,
...,...,...
3589,4,
3590,4,
3591,4,
3592,0,
