In [None]:
import pandas as pd
import os

# Switch the working directory to the filesystem root so that the relative
# file names used by the following cells resolve against it.
ROOT_DIR = '/'
os.chdir(ROOT_DIR)

In [None]:
# Read the regression dataset and discard its first column in one step
# (presumably the row index that was saved with the CSV -- confirm).
features = pd.read_csv('df_for_regression.csv').iloc[:, 1:]

# Preview the first five rows
features.head(5)

Unnamed: 0,block_number,value,gas,is_to_address_uni_pool,is_sipher_inu_transaction,gas_used,is_to_uniswap_router
0,13188414,0.0,46703,False,False,46703,False
1,13188414,0.0,450000,False,False,246088,False
2,13188414,2.3e+17,215887,False,False,163754,False
3,13188414,0.0,450000,False,False,246090,False
4,13188414,0.0,74926,False,False,74926,False


In [None]:
# Report the (rows, columns) size of the frame
frame_shape = features.shape
print('The shape of our features is:', frame_shape)

# Summary statistics over the last six columns; describe() keeps the numeric ones
last_six_columns = features.iloc[:, -6:]
last_six_columns.describe()

The shape of our features is: (1258391, 7)


Unnamed: 0,value,gas,gas_used
count,1258391.0,1258391.0,1258391.0
mean,2.016572e+18,183713.1,81846.44
std,3.167764e+20,411099.6,167269.1
min,0.0,21000.0,21000.0
25%,0.0,42000.0,21000.0
50%,0.0,100000.0,46109.0
75%,1e+17,232546.0,86518.0
max,2.23553e+23,30000000.0,23080420.0


In [None]:
# NumPy supplies the array type handed to the model
import numpy as np

# Target vector: the 'gas_used' column is what the model will predict
labels = np.array(features['gas_used'])

# Everything except the target column stays as model input
features = features.drop(columns='gas_used')

# Remember the column order so array positions can be mapped back to names
feature_list = features.columns.tolist()

# Switch from a DataFrame to a plain array
# NOTE(review): the frame mixes boolean and numeric columns, so this array is
# likely object dtype -- confirm if memory or speed matters.
features = np.array(features)

In [None]:
# scikit-learn helper that partitions arrays into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=123,
)

In [None]:
# Show the dimensions of each piece produced by the split
for shape_caption, split_array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(shape_caption, split_array.shape)

Training Features Shape: (943793, 6)
Training Labels Shape: (943793,)
Testing Features Shape: (314598, 6)
Testing Labels Shape: (314598,)


In [None]:
# Baseline: use the 'gas' feature column itself as the predicted label
gas_column = feature_list.index('gas')
baseline_predictions = test_features[:, gas_column]

# Absolute deviation of the baseline from the true labels, averaged over the test set
baseline_errors = np.abs(baseline_predictions - test_labels)
print('Average baseline error: ', round(baseline_errors.mean(), 2))

Average baseline error:  101241.85


In [None]:
# Per-row baseline error expressed as a percentage of the true label
mape_baseline = (baseline_errors / test_labels) * 100

# "Accuracy" here is 100 minus the mean percentage error, so it turns
# negative whenever the mean percentage error exceeds 100.
baseline_accuracy = 100 - mape_baseline.mean()
print('Accuracy:', round(baseline_accuracy, 2), '%.')

Accuracy: -112.17 %.


In [None]:
# Regressor class used for the main model
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees with a fixed seed
rf = RandomForestRegressor(random_state=123, n_estimators=100)

# Fit on the training split (trailing semicolon hides the estimator repr)
rf.fit(train_features, train_labels);

In [None]:
# Predict labels for the held-out rows with the fitted forest
predictions = rf.predict(test_features)

# Absolute difference between each prediction and its true label
errors = np.abs(predictions - test_labels)

# Report the mean absolute error (MAE), rounded to two decimals
mean_abs_error = errors.mean()
print('Mean Absolute Error:', round(mean_abs_error, 2))

Mean Absolute Error: 13980.49


In [None]:
# Per-row error as a percentage of the true label
mape = (errors / test_labels) * 100

# "Accuracy" is reported as 100 minus the mean absolute percentage error (MAPE)
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 81.34 %.


In [None]:
import pickle

# Serialize the fitted forest to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
filename = 'random_forest_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

In [15]:
# Restore the pickled model from disk; the context manager closes the file
# handle once loading finishes.
# NOTE: unpickling executes arbitrary code, so only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = rf.estimators_[5]

# Export the tree to a dot file, labelling split nodes with the feature names
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# Use dot file to create a graph (the loader returns a one-element sequence)
(graph, ) = pydot.graph_from_dot_file('tree.dot')

# Write graph to a png file
graph.write_png('tree.png')

In [None]:
# Small forest for illustration: 10 trees, each limited to a depth of 3.
# A fixed random_state (same seed as the main model) makes the exported
# tree reproducible across runs.
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3, random_state = 123)
rf_small.fit(train_features, train_labels)

# Extract one of the shallow trees
tree_small = rf_small.estimators_[5]

# Save the tree as a png image via an intermediate dot file
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');

In [17]:
# Get numerical feature importances from the fitted forest
importances = list(rf.feature_importances_)

# List of (feature name, importance rounded to 2 decimals) tuples
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first; the sort is stable,
# so features with equal rounded importance keep their column order
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances with a plain loop rather than a
# list comprehension evaluated only for its side effects
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: gas                  Importance: 0.84
Variable: block_number         Importance: 0.09
Variable: value                Importance: 0.05
Variable: is_sipher_inu_transaction Importance: 0.01
Variable: is_to_uniswap_router Importance: 0.01
Variable: is_to_address_uni_pool Importance: 0.0


In [19]:
# New random forest trained on only the three most important variables
rf_most_important = RandomForestRegressor(n_estimators= 100, random_state=42)

# Column positions of the three most important features
important_indices = [feature_list.index('gas'), feature_list.index('block_number'), feature_list.index('value')]

train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]

# Train the random forest on the reduced feature set
rf_most_important.fit(train_important, train_labels)

# Make predictions and determine the error
# NOTE: predictions, errors, mape and accuracy are rebound here and replace
# the values computed earlier for the full-feature model.
predictions = rf_most_important.predict(test_important)
errors = abs(predictions - test_labels)

# Display the performance metrics; the labels are gas amounts, not degrees
print('Mean Absolute Error:', round(np.mean(errors), 2), 'gas units.')
mape = np.mean(100 * (errors / test_labels))

# "Accuracy" is 100 minus the mean absolute percentage error
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')


Mean Absolute Error: 14309.2 degrees.
Accuracy: 81.01 %.
