In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
from sklearn.pipeline import Pipeline

# Step One: Training Data Preparation 

In [2]:
def prepare_training_data():
    col_names = ['programming_language', 'cpu_intensity', 'memory_intensity', 'memory_size', 'provider', 'throughput']
    data = pd.read_csv('data/Machine-Learning-Data.csv', header=None, names=col_names)
    
    # Format the data to map categorical to numerical values
    map_language_to_number = {'Java ': 0, 'Python': 1, 'Ruby': 2, 'NodeJs': 3, 'Go': 4}
    map_mem_intensity = {'no': 0, 'yes': 1}
    map_cpu_intensity_to_number = {'low': 1, 'medium': 2, 'high': 3, 'no': 0}

    # Format the main data
    data['programming_language'] = data['programming_language'].map(map_language_to_number)
    data['cpu_intensity'] = data['cpu_intensity'].map(map_cpu_intensity_to_number)
    data['memory_intensity'] = data['memory_intensity'].map(map_mem_intensity)
    
    # Shuffle the data
    data = shuffle(data)
    data = data.sample(frac=1)
    
    # split into feature and label
    features = ['programming_language', 'cpu_intensity', 'memory_intensity', 'memory_size', 'provider']
    label = ['throughput']

    X_train = data[features]
    y_train = data[label]  
    
    return X_train, y_train

# Step Two : Test Data Preparation

In [3]:
def prepare_test_data():
    # Read the testing data
    col_names = ['programming_language', 'cpu_intensity', 'memory_intensity', 'memory_size', 'provider', 'throughput']
    data = pd.read_csv('data/testing-data.txt', header=None, names=col_names)

    # Format the data to map categorical to numerical values
    map_language_to_number = {'Java ': 0, 'Python': 1, 'Ruby': 2, 'NodeJs': 3, 'Go': 4}
    map_mem_intensity = {'no': 0, 'yes': 1}
    map_cpu_intensity_to_number = {'low': 1, 'medium': 2, 'high': 3, 'no': 0}

    # Format the main data
    data['programming_language'] = data['programming_language'].map(map_language_to_number)
    data['cpu_intensity'] = data['cpu_intensity'].map(map_cpu_intensity_to_number)
    data['memory_intensity'] = data['memory_intensity'].map(map_mem_intensity)

    # split into feature and label
    features = ['programming_language', 'cpu_intensity', 'memory_intensity', 'memory_size', 'provider']
    label = ['throughput']
    
    x_test = data[features]
    y_test = data[label]

    return x_test, y_test

# Step Three : Train Model 

In [4]:
# Create a pipeline for Decision Tree Regression
dtr_pipeline = Pipeline([
    ('scalar', StandardScaler()),
    ('dtr_model', DecisionTreeRegressor(max_depth=5))
])

In [5]:
# Get The Training Data
X_train,y_train = prepare_training_data()

In [6]:
# Fit the Data 
dtr_pipeline.fit(X_train,y_train)

Pipeline(steps=[('scalar', StandardScaler()),
                ('dtr_model', DecisionTreeRegressor(max_depth=5))])

### Print Tree Structure 

In [7]:
from sklearn.tree import export_graphviz

features = ['programming_language', 'cpu_intensity', 'memory_intensity', 'memory_size', 'provider']
label = ['throughput']

export_graphviz(dtr_pipeline[1],out_file='tree.dot')

# Step Four : Test Model 

### Test Model using Training Data 

In [8]:
accuracy = dtr_pipeline.score(X_train,y_train)
print("Accuracy : {}".format(accuracy))

Accuracy : 0.9542864398605515


### Test Model using Testing Data

In [9]:
# Get the testing data 
x_test,y_test = prepare_test_data()
test_accuracy = dtr_pipeline.score(x_test,y_test)
print("Accuracy : {}".format(test_accuracy))

Accuracy : 0.9492381009305559


In [10]:
# print out predicted and actual
y_prediction = dtr_pipeline.predict(x_test)

prediction = y_prediction.tolist()
actual = y_test.values.tolist()

for i in range(0,3):
    print("Predicted value: {} | Actual Value: {}".format(prediction[i],actual[i]))

Predicted value: 0.0 | Actual Value: [0.0]
Predicted value: 3.7592340240000004 | Actual Value: [1.4897308113469685]
Predicted value: 305.36249999999995 | Actual Value: [373.98699999999997]


### Cross Fold Validation 

In [15]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

cv = RepeatedKFold(n_splits=10,n_repeats=3)

scores = cross_val_score(estimator=dtr_pipeline, X=X_train,y=y_train, scoring='r2', cv=cv, n_jobs=-1)
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

Accuracy: 0.749 (0.204)
