In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from math import sqrt
from math import floor
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import mean_squared_error

#Instantiate the spark session (getOrCreate hands back an existing session if there is one)
spark = (
    SparkSession.builder
    .appName("GBat_model")
    .getOrCreate()
)

# Number of partitions Spark SQL uses when shuffling data for joins/aggregations
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [2]:
def sMAPE(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``200 * mean(|y_true - y_pred| / (|y_true| + |y_pred|))``.
    Pairs where both values are zero contribute 0 instead of the undefined
    0/0.

    Args:
        y_true: Array-like of observed values (list, tuple, NumPy array,
            pandas Series, ...).
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The sMAPE as a floating-point number. NaN values in either input
        propagate to the result.
    """
    # Plain float arrays: lists/tuples become usable, and pandas inputs are
    # compared position by position rather than re-aligned on index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = np.abs(y_true) + np.abs(y_pred)
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with, so no divide-by-zero warning fires.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    sma = 200 * np.mean(diff)
    return sma

In [3]:
def train_split(grp):
    """Return the first 80% of a group's rows after ordering them by 'cycle'.

    The cut-off is floor(0.80 * number_of_rows), so groups whose size is not
    a multiple of five give the extra rows to the test part.
    """
    ordered = grp.sort_values(by=['cycle'])
    n_train = floor(0.80 * len(ordered))
    return ordered.head(n_train)

#split each group and return rest 20% cycles
def test_split(grp):
    grp = grp.sort_values(by=['cycle'])
    idx = floor(0.80*grp.shape[0])
    return grp.iloc[idx:]

In [4]:
# Load the per-cycle aggregate table: first row is the header, column types are
# inferred from the data, and the result is cached because several cells reuse it.
cyc_agg_DF = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/FileStore/tables/new_cyc_agg_DF.csv')
    .cache()
)

In [5]:
# Preview: pull at most 10 rows into a pandas DataFrame so the notebook renders them as a table
cyc_agg_DF.limit(10).toPandas()

Unnamed: 0,cell_no,protocol,cycle,di,min_ocv,max_ocv,rng_ocv,charge_duration,dur_by_ocv,i0x91,i0x2,i0xcd,i0x28,i0xb1,i0x83,i0x8c,i0x6,i0xa7,i0x2a,i0x8a,i0x94,i0x73,B_65,B_78,T_5a,T_b6,c_const_B_2d,c_const_T_c4,c_const_B_81,c_const_B_40,c_const_T_32,c_const_T_bc,c_const_B_6b,c_const_B_9,c_const_B_3b,c_const_B_c9,c_const_B_b2,c_const_B_14,c_const_B_76,c_const_B_29,c_const_T_5,P_const_30,P_const_9f,P_const_2c
0,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,4,0.858171,3329.0,4223.0,894.0,2309.719999,2.583579,0.434783,0.599933,0.0,0.851064,0.249766,0.653333,0.521605,1.0,0.084746,0.899833,0.253061,0.0,0.0,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,5,0.837832,3439.0,4231.0,792.0,1945.888,2.456929,0.434783,0.599933,0.024512,0.851064,0.249766,0.655652,0.530193,1.0,0.084746,0.882413,0.253061,0.016339,0.019608,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,6,0.695297,3420.0,4221.0,801.0,2355.447,2.940633,0.434783,0.599933,0.004502,0.851064,0.249766,0.717538,0.759402,1.0,0.084746,0.374342,0.092182,0.003001,0.003601,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,7,0.693164,3419.0,4221.0,802.0,2361.902001,2.945015,0.434783,0.599933,0.009505,0.851064,0.249766,0.718182,0.761785,1.0,0.084746,0.369151,0.090847,0.006335,0.007603,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,8,0.702145,3422.0,4220.0,798.0,2316.717,2.903154,0.434783,0.599933,0.01951,0.851064,0.249766,0.71322,0.743409,1.0,0.084746,0.409185,0.101141,0.013004,0.015606,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,9,0.727433,3476.0,4222.0,746.0,1537.844999,2.061454,0.434783,0.599933,0.03952,0.851064,0.249766,0.71814,0.761628,1.0,0.084746,0.369492,0.090935,0.026342,0.031613,1.0,1.0,0.0,0.75,1.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,10,0.696835,3421.0,4220.0,799.0,2336.082,2.923757,0.434783,0.599933,0.014507,0.851064,0.249766,0.715484,0.751792,1.0,0.084746,0.39092,0.096445,0.004636,0.011605,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,11,0.743833,3470.0,4220.0,750.0,1512.025,2.016033,0.434783,0.599933,0.034517,0.851064,0.249766,0.713846,0.745726,1.0,0.084746,0.404135,0.099843,0.023008,0.027611,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,12,0.73115,3451.0,4220.0,769.0,1797.647,2.337642,0.434783,0.599933,0.029515,0.851064,0.249766,0.710909,0.734848,1.0,0.084746,0.427834,0.105937,0.019673,0.023609,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,13,0.711747,3421.0,4221.0,800.0,2297.352,2.87169,0.434783,0.599933,0.002001,0.851064,0.249766,0.710714,0.734127,1.0,0.084746,0.429406,0.106341,0.001334,0.001601,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [6]:
#split the data into train and test
# The pandas helpers run once per (cell_no, protocol) group and return rows
# with the same columns as the input, so the input schema is reused as-is.
myschema = cyc_agg_DF.schema

# First 80% of every group's cycles
train_DF = (
    cyc_agg_DF.select("*")
    .groupBy("cell_no", "protocol")
    .applyInPandas(train_split, schema=myschema)
    .orderBy("cell_no", "protocol", "cycle")
    .cache()
)

# Remaining 20% of every group's cycles
test_DF = (
    cyc_agg_DF.select("*")
    .groupBy("cell_no", "protocol")
    .applyInPandas(test_split, schema=myschema)
    .orderBy("cell_no", "protocol", "cycle")
    .cache()
)

# Row count of each split
print(train_DF.count())
print(test_DF.count())

In [7]:
#columns which are not features (identifiers, both targets, and the raw charge duration)
drop_cols = ['cell_no', 'protocol', 'di', 'dur_by_ocv', 'charge_duration']

#Vector Assembler: every remaining column becomes one slot of the "features" vector.
# The loop variable is deliberately not called `col`, which would read like
# pyspark.sql.functions.col (star-imported at the top of the notebook).
assembler = VectorAssembler(
    inputCols=[name for name in cyc_agg_DF.columns if name not in drop_cols],
    outputCol="features",
)

#Random Forest Regressor for the 'di' target.
# The seed is pinned because the forest is a randomised learner; without an
# explicit seed the fitted model and every metric derived from it are not
# guaranteed to repeat between sessions.
rf = RandomForestRegressor(
    labelCol='di',
    featuresCol="features",
    numTrees=50,
    maxDepth=25,
    seed=42,
)

#pipeline: assemble the feature vector, then fit the forest
pipeline = Pipeline(stages=[assembler, rf])

#Training on the first 80% of each group's cycles
RFModel = pipeline.fit(train_DF)

In [8]:
#Predictions of the 'di' model on the test set and on the train set (in that order)
test_df_with_pred, train_df_with_pred = [
    RFModel.transform(split_DF) for split_DF in (test_DF, train_DF)
]

#convert to pandas for plotting
test_df_pd, train_df_pd = [
    scored_DF.toPandas() for scored_DF in (test_df_with_pred, train_df_with_pred)
]

In [9]:
#Evaluate the 'di' model on the test set with R-Square, Root Mean Square Error(RMSE) and sMAPE
evaluation = RegressionEvaluator(labelCol='di', predictionCol="prediction")

# r2 (coefficient of determination) first, then RMSE; the metric is selected
# per call through a param map
r2, rmse = [
    evaluation.evaluate(test_df_with_pred, {evaluation.metricName: metric})
    for metric in ("r2", "rmse")
]

#sMape result, computed with the notebook's own helper on the pandas copy
smape = sMAPE(test_df_pd['di'], test_df_pd['prediction'])

In [10]:
# Report r2 and RMSE with 3 decimals and sMAPE with the default 6;
# the extra '\n' argument leaves a blank line after each value.
print("r2: %.3f" %r2, '\n')
print("RMSE: %.3f" %rmse, '\n')
print('sMAPE: %f' %smape, '\n' )

In [11]:
# Importance of every assembled feature according to the fitted forest
# (stage 1 of the pipeline; stage 0 is the VectorAssembler)
di_importances = RFModel.stages[1].featureImportances

# Creating DF from the selected features to define importances
di_features_DF = cyc_agg_DF[[col for col in cyc_agg_DF.columns if col not in drop_cols]]

# Mapping the feature importances to their names for easy reading.
# Kept as lists (not one-shot map/zip iterators) so the cell can be re-read
# and the intermediate values inspected after the DataFrame is built.
di_featureNames = [field.name for field in di_features_DF.schema.fields]

# zip() would silently truncate on a mismatch, so fail loudly instead
assert len(di_featureNames) == len(di_importances), \
    "feature names and importances differ in length"

feat_name_importance = [
    (name, float(score))
    for name, score in zip(di_featureNames, di_importances.toArray())
]

#Creating the dataframe of importances.
# Built through the session with explicit column names: no dependency on a
# global `sc` (never defined in this notebook) and no rename step afterwards.
feat_name_importance_DF = spark.createDataFrame(
    feat_name_importance, schema=["features", "importance"]
)

In [12]:
# Render the feature/importance table; `display` is not imported in this notebook,
# so it is expected to be provided by the notebook environment
display(feat_name_importance_DF)

features,importance
cycle,0.0736774852199482
min_ocv,0.0377224312659555
max_ocv,0.0364583710514901
rng_ocv,0.02920016917726
i0x91,0.0531051085043576
i0x2,0.0179884911028711
i0xcd,0.0125561247472411
i0x28,0.072684110679422
i0xb1,0.0114813551614331
i0x83,0.0438218468334748


In [13]:
#Random Forest Regressor for the 'dur_by_ocv' target (same hyper-parameters as the 'di' model).
# The seed is pinned because the forest is a randomised learner; without an
# explicit seed the fitted model and every metric derived from it are not
# guaranteed to repeat between sessions.
rf2 = RandomForestRegressor(
    labelCol='dur_by_ocv',
    featuresCol="features",
    numTrees=50,
    maxDepth=25,
    seed=42,
)

#pipeline: reuses the existing feature assembler, then fits the second forest
pipeline2 = Pipeline(stages=[assembler, rf2])

#Training
RFModel2 = pipeline2.fit(train_DF)

In [14]:
#Predictions of the 'dur_by_ocv' model on the test set and on the train set (in that order)
test_df_with_pred2, train_df_with_pred2 = [
    RFModel2.transform(split_DF) for split_DF in (test_DF, train_DF)
]

#convert to pandas for plotting
test_df_pd2, train_df_pd2 = [
    scored_DF.toPandas() for scored_DF in (test_df_with_pred2, train_df_with_pred2)
]

In [15]:
#Evaluate the 'dur_by_ocv' model on the test set with R-Square, Root Mean Square Error(RMSE) and sMAPE
evaluation = RegressionEvaluator(labelCol='dur_by_ocv', predictionCol="prediction")

# r2 (coefficient of determination) first, then RMSE; the metric is selected
# per call through a param map
r2, rmse = [
    evaluation.evaluate(test_df_with_pred2, {evaluation.metricName: metric})
    for metric in ("r2", "rmse")
]

#sMape result, computed with the notebook's own helper on the pandas copy
smape = sMAPE(test_df_pd2['dur_by_ocv'], test_df_pd2['prediction'])

In [16]:
# Report r2 and RMSE with 3 decimals and sMAPE with the default 6;
# the extra '\n' argument leaves a blank line after each value.
print("r2: %.3f" %r2, '\n')
print("RMSE: %.3f" %rmse, '\n')
print('sMAPE: %f' %smape, '\n' )

In [17]:
# Importance of every assembled feature according to the second fitted forest
# (stage 1 of the pipeline; stage 0 is the VectorAssembler)
dur_by_ocvimportances = RFModel2.stages[1].featureImportances

# Creating DF from the selected features to define importances
dur_by_ocvfeatures_DF = cyc_agg_DF[[col for col in cyc_agg_DF.columns if col not in drop_cols]]

# Mapping the feature importances to their names for easy reading.
# Kept as lists (not one-shot map/zip iterators) so the cell can be re-read
# and the intermediate values inspected after the DataFrame is built.
dur_by_ocvfeatureNames = [field.name for field in dur_by_ocvfeatures_DF.schema.fields]

# zip() would silently truncate on a mismatch, so fail loudly instead
assert len(dur_by_ocvfeatureNames) == len(dur_by_ocvimportances), \
    "feature names and importances differ in length"

feat_name_importance = [
    (name, float(score))
    for name, score in zip(dur_by_ocvfeatureNames, dur_by_ocvimportances.toArray())
]

#Creating the dataframe of importances.
# Built through the session with explicit column names: no dependency on a
# global `sc` (never defined in this notebook) and no rename step afterwards.
feat_name_importance_DF = spark.createDataFrame(
    feat_name_importance, schema=["features", "importance"]
)

In [18]:
# Render the feature/importance table; `display` is not imported in this notebook,
# so it is expected to be provided by the notebook environment
display(feat_name_importance_DF)

features,importance
cycle,0.0102827504650505
min_ocv,0.0179534238680192
max_ocv,0.020781663052004
rng_ocv,0.0214654165245508
i0x91,0.0494130950881071
i0x2,0.0226179685489607
i0xcd,0.0051998435388918
i0x28,0.0845158815493049
i0xb1,0.0193016811245044
i0x83,0.1888032178135073
