In [1]:
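# Clean the merged measurements in data/merge_all.csv and write the filtered rows to a csv file,
# then fit tree-based regressors that predict the 'average' column.
# NOTE: a local Spark session is created below, but the csv itself is read with pandas.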
from ast import literal_eval
import os
import random
import string
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark import SparkConf, SparkContext
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Configure Spark to run in local mode under the application name "appName".
conf = SparkConf()
conf.setAppName("appName")
conf.setMaster("local")

# Reuse the running SparkContext if there is one, otherwise start a new one,
# then wrap it in a SparkSession.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)


# Get the list of files
files = os.listdir('data')

# Load the merged csv; data2 keeps pointing at the unfiltered frame so the
# number of removed rows can be reported afterwards.
data = pd.read_csv('data/merge_all.csv')
data2 = data

# Drop the producer, consumer and batch_size columns
unused_columns = ['producer', 'consumer', 'batch_size']
data = data.drop(columns=unused_columns)

# Keep only the rows whose 'average' is below 30 ...
below_cap = data['average'] < 30
# ... and that do not combine average > 1.3 with flow < 2500
high_average_low_flow = (data['average'] > 1.3) & (data['flow'] < 2500)
data = data[below_cap & ~high_average_low_flow]

# Report how many rows the two filters removed
rows_removed = len(data2) - len(data)
print(f"Number of rows deleted: {rows_removed}")

# Save the filtered data to a csv file (without the index column)
output_path = 'data/' + "data_Kafka_train.csv"
data.to_csv(output_path, index=False)





# Give each distinct machine_kafka label an integer code, numbered in the
# order the labels first appear in the filtered data.
machineKafkaTypeDict = {
    label: code for code, label in enumerate(data['machine_kafka'].unique())
}

print(machineKafkaTypeDict)

# map the machine_kafka to a number
# assign() builds a new frame instead of writing into `data` in place: `data`
# comes from boolean filtering, and an in-place column write on such a frame
# can raise pandas' SettingWithCopyWarning.
data = data.assign(machine_kafka=data['machine_kafka'].map(machineKafkaTypeDict))

# Preview of the encoded frame (only rendered when it is the last expression
# of a notebook cell).
data.head()

# Split the data into train and test: 'average' is the target, every other
# column is a feature. 20% of the rows are held out, with a fixed seed so the
# split is reproducible.
features = data.drop('average', axis=1)
target = data['average']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=10
)

# Models to compare, as (printed label, estimator) pairs. They are fitted and
# reported in this order on the same train/test split.
candidates = [
    (
        "RandomForestRegressor",
        RandomForestRegressor(n_estimators=80, random_state=0, max_depth=8),
    ),
    (
        "GradientBoostingRegressor",
        GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.05, max_depth=7, random_state=0
        ),
    ),
    (
        # splitter='best', max_depth=7; criterion is left at the library default
        "DecisionTreeRegressor",
        DecisionTreeRegressor(random_state=0, splitter='best', max_depth=7),
    ),
]

# `regressor` and `y_pred` stay bound to the last model (the decision tree)
# after the loop, as they were when the three models were written out by hand.
for model_name, regressor in candidates:
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    print(model_name)
    # mean absolute error on the held-out rows
    print(mean_absolute_error(y_test, y_pred))

    # mean squared error on the held-out rows
    print(mean_squared_error(y_test, y_pred))

    # score() on a regressor is the coefficient of determination (R^2),
    # not a classification accuracy
    print(regressor.score(X_test, y_test))

    # Show the feature importance (one value per feature column)
    print(regressor.feature_importances_)



Number of rows deleted: 2973
{'DS3_v2': 0, 'DS2_v2': 1, 'D4s_v3': 2, 'B4ms': 3, 'D2as_v4': 4, 'B2ms': 5}
RandomForestRegressor
0.04385502816808113
0.11055802846084488
0.975889940392223
[0.07721876 0.01316557 0.69977108 0.20984459]
GradientBoostingRegressor
0.049186838197202515
0.14853744707306268
0.9676075382966249
[0.08141106 0.0193642  0.79551868 0.10370605]
DecisionTreeRegressor
0.053548051286809575
0.180472786180476
0.9606432052654241
[0.07136064 0.01078764 0.80870326 0.10914846]
