In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

import pandas as pd
from autofeat import FeatureSelector, AutoFeatRegressor

# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("Feature_Engineering")
    .getOrCreate()
)

# Use 8 partitions for Spark SQL shuffles
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [2]:
# Load the aggregated data from CSV (first row is the header, column types are
# inferred) and mark the resulting DataFrame for caching
cyc_agg_DF = (
    spark.read
    .csv("/FileStore/tables/agg_clean_DF.csv", header=True, inferSchema=True)
    .cache()
)

# Trigger an action so the cached DataFrame is materialized in memory
cyc_agg_DF.count()

In [3]:
# Convert the cached Spark DataFrame into a pandas DataFrame
cyc_agg_DF_pddf = cyc_agg_DF.toPandas()

In [4]:
# One-hot encode 'cell_no': the column is replaced by indicator columns
# prefixed with 'cell'
cyc_agg_DF_pddf = pd.get_dummies(
    cyc_agg_DF_pddf,
    columns=['cell_no'],
    prefix=['cell'],
)

cyc_agg_DF_pddf.head()

Unnamed: 0,protocol,cycle,di,dur_by_ocv,min_ocv,max_ocv,rng_ocv,charge_duration,i0x91,i0x2,i0xcd,i0x28,i0xb1,i0x83,i0x8c,i0x6,i0xa7,i0x2a,i0x8a,i0x94,i0x73,B_65,B_78,T_5a,T_b6,c_const_B_2d,c_const_T_c4,c_const_B_81,c_const_B_40,c_const_T_32,c_const_T_bc,c_const_B_6b,c_const_B_9,c_const_B_3b,c_const_B_c9,c_const_B_b2,c_const_B_14,c_const_B_76,c_const_B_29,c_const_T_5,P_const_30,P_const_9f,P_const_2c,cell_1,cell_3,cell_4,cell_5,cell_6,cell_7,cell_9,cell_10,cell_11,cell_13,cell_14,cell_18,cell_19,cell_20,cell_23,cell_24,cell_26,cell_27,cell_28,cell_29
0,140f77741820c02177597651dfea9fe881c1a73d8e4002...,4,0.858171,2.583579,3329.0,4223.0,894.0,2309.719999,0.434783,0.599933,0.0,0.851064,0.249766,0.653333,0.521605,1.0,0.084746,0.899833,0.253061,0.0,0.0,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,5,0.837832,2.456929,3439.0,4231.0,792.0,1945.888,0.434783,0.599933,0.024512,0.851064,0.249766,0.655652,0.530193,1.0,0.084746,0.882413,0.253061,0.016339,0.019608,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,140f77741820c02177597651dfea9fe881c1a73d8e4002...,6,0.695297,2.940633,3420.0,4221.0,801.0,2355.447,0.434783,0.599933,0.004502,0.851064,0.249766,0.717538,0.759402,1.0,0.084746,0.374342,0.092182,0.003001,0.003601,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,140f77741820c02177597651dfea9fe881c1a73d8e4002...,7,0.693164,2.945015,3419.0,4221.0,802.0,2361.902001,0.434783,0.599933,0.009505,0.851064,0.249766,0.718182,0.761785,1.0,0.084746,0.369151,0.090847,0.006335,0.007603,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,140f77741820c02177597651dfea9fe881c1a73d8e4002...,8,0.702145,2.903154,3422.0,4220.0,798.0,2316.717,0.434783,0.599933,0.01951,0.851064,0.249766,0.71322,0.743409,1.0,0.084746,0.409185,0.101141,0.013004,0.015606,1.0,1.0,0.0,0.75,0.0,0.0,0.0,1.0,0.5,0.090909,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Model input: everything except the identifier/target columns and 'cell_1'
# NOTE(review): 'cell_1' is presumably dropped as the reference dummy category - confirm
X = cyc_agg_DF_pddf.drop(
    columns=['protocol', 'di', 'charge_duration', 'dur_by_ocv', 'cell_1']
)

# Model outputs (di and dur_by_ocv)
Y_di, Y_dur_by_ocv = cyc_agg_DF_pddf['di'], cyc_agg_DF_pddf['dur_by_ocv']

In [6]:
# Split the input columns into two groups by the 'cell_' prefix:
#   feat_keep - the cell dummies, to be kept during feature selection
#   feat_use  - all remaining columns, to be used for feature engineering
feat_keep = [name for name in X.columns if name.startswith('cell_')]
feat_use = [name for name in X.columns if not name.startswith('cell_')]

In [7]:
# Show the list of 'cell_' dummy column names collected in feat_keep
print(feat_keep)

In [8]:
# Build the autofeat regression model used to engineer features for 'di';
# feature engineering is restricted to the columns listed in feat_use
af_regr_di = AutoFeatRegressor(
    verbose=1,
    n_jobs=6,
    feateng_cols=feat_use,
    feateng_steps=2,
    transformations=(
        "1/", "exp", "log", "abs", "sqrt", "^2", "^3",
        "1+", "1-", "sin", "cos", "exp-", "2^",
    ),
)

# Fit on (X, Y_di) and obtain the DataFrame with the engineered features
new_df_di = af_regr_di.fit_transform(X, Y_di)

# Score the fitted model on the same data it was fitted on and report it
r2_di = af_regr_di.score(X, Y_di)
print("## Final R^2: %.4f" % r2_di)

In [9]:
# Feature selector for 'di'; the dummy columns in feat_keep are passed via
# 'keep' so they are retained by the selection
fsel_di = FeatureSelector(keep=feat_keep, n_jobs=6, verbose=1)

# Create the DataFrame holding the selected features for 'di'
di_FE_cyc_agg_DF = fsel_di.fit_transform(new_df_di, Y_di)

In [10]:
# Preview the first rows of the features selected for 'di'
di_FE_cyc_agg_DF.head()

Unnamed: 0,cell_3,cell_4,cell_5,cell_6,cell_7,cell_9,cell_10,cell_11,cell_13,cell_14,cell_18,cell_19,cell_20,cell_23,cell_24,cell_26,cell_27,cell_28,cell_29,T_b6,cycle,i0x2a,i0x8c,c_const_B_14,c_const_B_6b,cycle*sin(i0x91),T_5a**3*cycle**2,i0x28**3*sin(i0x2a),cycle**2*sqrt(i0x94),i0x8c**2*(1 - i0x8c),sqrt(i0xb1)*(1 - i0xb1),c_const_B_b2*cos(i0x28),c_const_T_c4**3*i0x91**3,cycle**2*exp(c_const_T_32),i0x2,T_5a,i0x73,i0xb1,i0x83,i0x91,i0x8a,rng_ocv,i0x8c**2*(1 - T_5a),i0x28**3*sin(i0x8a),(1 - i0x28)*sin(i0x28),sqrt(i0x91)*(1 - i0x91),c_const_B_b2*rng_ocv**2,sqrt(i0xa7)*exp(c_const_T_5),sqrt(i0xa7)*cos(c_const_T_32),i0x6,i0x28,c_const_T_c4,B_78**3*(1 - i0x28),c_const_T_32**2*i0x2a,max_ocv,min_ocv,c_const_T_32,c_const_B_76,cycle*i0x2a**2,2**B_78*exp(T_5a),exp(i0x2a)*exp(-i0xa7)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,4.0,0.899833,0.521605,0.0,1.0,1.684853,0.0,0.482805,0.0,0.130158,0.374941,0.659184,0.0,26.37954,0.599933,0.0,0.0,0.249766,0.653333,0.434783,0.253061,894.0,0.272072,0.154336,0.111997,0.372693,799236.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.224958,4223.0,3329.0,0.5,0.0,3.238798,2.0,2.259373
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,5.0,0.882413,0.530193,0.0,1.0,2.106066,0.0,0.476056,3.195581,0.132065,0.374941,0.659184,0.0,41.218032,0.599933,0.0,0.019608,0.249766,0.655652,0.434783,0.253061,792.0,0.281105,0.154336,0.111997,0.372693,627264.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.220603,4231.0,3439.0,0.5,0.0,3.893261,2.0,2.220355
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,6.0,0.374342,0.759402,0.0,1.0,2.527279,0.0,0.225405,1.97213,0.138751,0.374941,0.659184,0.0,59.353966,0.599933,0.0,0.003601,0.249766,0.717538,0.434783,0.092182,801.0,0.576691,0.056744,0.111997,0.372693,641601.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.093585,4221.0,3420.0,0.5,0.0,0.840791,2.0,1.335888
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,7.0,0.369151,0.761785,0.0,1.0,2.948493,0.0,0.222424,3.90018,0.13824,0.374941,0.659184,0.0,80.787342,0.599933,0.0,0.007603,0.249766,0.718182,0.434783,0.090847,802.0,0.580316,0.055924,0.111997,0.372693,643204.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.092288,4221.0,3419.0,0.5,0.0,0.953905,2.0,1.328971
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,8.0,0.409185,0.743409,0.0,1.0,3.369706,0.0,0.245255,7.298339,0.141807,0.374941,0.659184,0.0,105.518161,0.599933,0.0,0.015606,0.249766,0.71322,0.434783,0.101141,798.0,0.552656,0.062241,0.111997,0.372693,636804.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.102296,4220.0,3422.0,0.5,0.0,1.339458,2.0,1.383254


In [11]:
# Add 'cell_no', 'protocol', 'cycle', 'di' and 'dur_by_ocv' back to the data.
# Only these five columns are selected before converting to pandas, so the
# whole table is not collected a second time.
info = cyc_agg_DF.select('cell_no', 'protocol', 'cycle', 'di', 'dur_by_ocv').toPandas()

# pd.concat(axis=1) aligns rows on the index; a row-count mismatch would
# silently introduce NaN-padded rows, so fail early instead.
assert len(info) == len(di_FE_cyc_agg_DF), "info and engineered features differ in row count"

# The engineered features can already contain columns that 'info' provides
# (e.g. 'cycle'). Drop those first so the result has no duplicated column
# labels and the values coming from 'info' are the ones kept.
shared_cols = [c for c in di_FE_cyc_agg_DF.columns if c in info.columns]
di_FE_cyc_agg_DF = pd.concat([info, di_FE_cyc_agg_DF.drop(columns=shared_cols)], axis=1)

In [12]:
# Preview the DataFrame with the engineered 'di' features after the
# identifier/target columns have been added back
di_FE_cyc_agg_DF.head()

Unnamed: 0,cell_no,protocol,cycle,di,dur_by_ocv,cell_3,cell_4,cell_5,cell_6,cell_7,cell_9,cell_10,cell_11,cell_13,cell_14,cell_18,cell_19,cell_20,cell_23,cell_24,cell_26,cell_27,cell_28,cell_29,T_b6,i0x2a,i0x8c,c_const_B_14,c_const_B_6b,cycle*sin(i0x91),T_5a**3*cycle**2,i0x28**3*sin(i0x2a),cycle**2*sqrt(i0x94),i0x8c**2*(1 - i0x8c),sqrt(i0xb1)*(1 - i0xb1),c_const_B_b2*cos(i0x28),c_const_T_c4**3*i0x91**3,cycle**2*exp(c_const_T_32),i0x2,T_5a,i0x73,i0xb1,i0x83,i0x91,i0x8a,rng_ocv,i0x8c**2*(1 - T_5a),i0x28**3*sin(i0x8a),(1 - i0x28)*sin(i0x28),sqrt(i0x91)*(1 - i0x91),c_const_B_b2*rng_ocv**2,sqrt(i0xa7)*exp(c_const_T_5),sqrt(i0xa7)*cos(c_const_T_32),i0x6,i0x28,c_const_T_c4,B_78**3*(1 - i0x28),c_const_T_32**2*i0x2a,max_ocv,min_ocv,c_const_T_32,c_const_B_76,cycle*i0x2a**2,2**B_78*exp(T_5a),exp(i0x2a)*exp(-i0xa7)
0,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,4,0.858171,2.583579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.899833,0.521605,0.0,1.0,1.684853,0.0,0.482805,0.0,0.130158,0.374941,0.659184,0.0,26.37954,0.599933,0.0,0.0,0.249766,0.653333,0.434783,0.253061,894.0,0.272072,0.154336,0.111997,0.372693,799236.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.224958,4223.0,3329.0,0.5,0.0,3.238798,2.0,2.259373
1,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,5,0.837832,2.456929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.882413,0.530193,0.0,1.0,2.106066,0.0,0.476056,3.195581,0.132065,0.374941,0.659184,0.0,41.218032,0.599933,0.0,0.019608,0.249766,0.655652,0.434783,0.253061,792.0,0.281105,0.154336,0.111997,0.372693,627264.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.220603,4231.0,3439.0,0.5,0.0,3.893261,2.0,2.220355
2,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,6,0.695297,2.940633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.374342,0.759402,0.0,1.0,2.527279,0.0,0.225405,1.97213,0.138751,0.374941,0.659184,0.0,59.353966,0.599933,0.0,0.003601,0.249766,0.717538,0.434783,0.092182,801.0,0.576691,0.056744,0.111997,0.372693,641601.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.093585,4221.0,3420.0,0.5,0.0,0.840791,2.0,1.335888
3,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,7,0.693164,2.945015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.369151,0.761785,0.0,1.0,2.948493,0.0,0.222424,3.90018,0.13824,0.374941,0.659184,0.0,80.787342,0.599933,0.0,0.007603,0.249766,0.718182,0.434783,0.090847,802.0,0.580316,0.055924,0.111997,0.372693,643204.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.092288,4221.0,3419.0,0.5,0.0,0.953905,2.0,1.328971
4,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,8,0.702145,2.903154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.409185,0.743409,0.0,1.0,3.369706,0.0,0.245255,7.298339,0.141807,0.374941,0.659184,0.0,105.518161,0.599933,0.0,0.015606,0.249766,0.71322,0.434783,0.101141,798.0,0.552656,0.062241,0.111997,0.372693,636804.0,0.291111,0.255474,1.0,0.851064,0.0,0.148936,0.102296,4220.0,3422.0,0.5,0.0,1.339458,2.0,1.383254


In [13]:
# Convert the pandas DataFrame with the 'di' features into a Spark DataFrame
di_df = spark.createDataFrame(di_FE_cyc_agg_DF)

In [14]:
# Write di_df (the engineered 'di' features) as a CSV file to use later.
#
# coalesce(1) first collapses the data into a single partition, then
# sortWithinPartitions orders the rows inside that one partition. A global
# orderBy() placed after coalesce(1) would shuffle the rows again into
# spark.sql.shuffle.partitions partitions, so the output would no longer be a
# single sorted part file.
#
# NOTE(review): no save mode is set, so the write fails if the target path
# already exists - confirm whether overwriting is wanted on re-runs.
di_df.coalesce(1) \
.sortWithinPartitions("cell_no", "protocol", "cycle") \
.write.format("com.databricks.spark.csv") \
.option("header", "true") \
.save("/FileStore/tables/dir_di_FE_cyc_agg_DF.csv")

In [15]:
# Build the autofeat regression model used to engineer features for
# 'dur_by_ocv'; feature engineering is restricted to the columns in feat_use
af_regr_dur_by_ocv = AutoFeatRegressor(
    verbose=1,
    n_jobs=6,
    feateng_cols=feat_use,
    feateng_steps=2,
    transformations=(
        "1/", "exp", "log", "abs", "sqrt", "^2", "^3",
        "1+", "1-", "sin", "cos", "exp-", "2^",
    ),
)

# Fit on (X, Y_dur_by_ocv) and obtain the DataFrame with the engineered features
new_df_dur_by_ocv = af_regr_dur_by_ocv.fit_transform(X, Y_dur_by_ocv)

# Score the fitted model on the same data it was fitted on and report it
r2_dur_by_ocv = af_regr_dur_by_ocv.score(X, Y_dur_by_ocv)
print("## Final R^2: %.4f" % r2_dur_by_ocv)

In [16]:
# Feature selector for 'dur_by_ocv'; the dummy columns in feat_keep are passed
# via 'keep' so they are retained by the selection
fsel_dur_by_ocv = FeatureSelector(keep=feat_keep, n_jobs=6, verbose=1)

# Create the DataFrame holding the selected features for 'dur_by_ocv'
dur_by_ocv_FE_cyc_agg_DF = fsel_dur_by_ocv.fit_transform(new_df_dur_by_ocv, Y_dur_by_ocv)

In [17]:
# Add 'cell_no', 'protocol', 'cycle', 'di' and 'dur_by_ocv' back to the data.
# Only these five columns are selected before converting to pandas, so the
# whole table is not collected again.
info = cyc_agg_DF.select('cell_no', 'protocol', 'cycle', 'di', 'dur_by_ocv').toPandas()

# pd.concat(axis=1) aligns rows on the index; a row-count mismatch would
# silently introduce NaN-padded rows, so fail early instead.
assert len(info) == len(dur_by_ocv_FE_cyc_agg_DF), "info and engineered features differ in row count"

# The engineered features can already contain columns that 'info' provides
# (e.g. 'cycle'). Drop those first so the result has no duplicated column
# labels and the values coming from 'info' are the ones kept.
shared_cols = [c for c in dur_by_ocv_FE_cyc_agg_DF.columns if c in info.columns]
dur_by_ocv_FE_cyc_agg_DF = pd.concat(
    [info, dur_by_ocv_FE_cyc_agg_DF.drop(columns=shared_cols)],
    axis=1,
)

In [18]:
# Preview the DataFrame with the engineered 'dur_by_ocv' features
dur_by_ocv_FE_cyc_agg_DF.head()

Unnamed: 0,cell_no,protocol,cycle,di,dur_by_ocv,cell_3,cell_4,cell_5,cell_6,cell_7,cell_9,cell_10,cell_11,cell_13,cell_14,cell_18,cell_19,cell_20,cell_23,cell_24,cell_26,cell_27,cell_28,cell_29,i0x6,i0xb1,i0x83,i0x2a,i0x28,i0x91,i0x8c,max_ocv,rng_ocv,P_const_2c,c_const_B_9,c_const_T_32,c_const_B_b2,P_const_2c/cycle,c_const_B_6b*cycle,min_ocv**3*rng_ocv,(1 - i0xcd)/rng_ocv,i0x2a**2*sin(i0xcd),cycle**3*sin(i0xcd),sqrt(i0x8a)/rng_ocv,i0xa7**3*(1 - i0x28),min_ocv**3*cos(i0xa7),rng_ocv**3*sin(i0xb1),(1 - i0x8c)*exp(-B_78),(1 - i0x28)*sin(i0x28),P_const_2c*c_const_B_3b,sqrt(i0x91)*(1 - i0x91),P_const_30*cos(max_ocv),sqrt(i0xb1)*(1 - i0xb1),c_const_B_c9*(1 - i0x28),c_const_B_29*cos(max_ocv),sqrt(i0xcd)*sin(c_const_T_5),exp(c_const_T_c4)*sin(i0x91),rng_ocv**3*exp(-c_const_T_32),i0x8a,i0x94,i0x28**3*sin(i0x8a),sqrt(i0x8a)*exp(i0x94),c_const_B_6b*sqrt(c_const_T_5),c_const_B_76*exp(-c_const_T_c4),log(rng_ocv)/rng_ocv,c_const_T_32**2/cycle,c_const_B_9*sqrt(i0x91),c_const_B_6b*rng_ocv**3
0,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,4,0.858171,2.583579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.249766,0.653333,0.899833,0.851064,0.434783,0.521605,4223.0,894.0,0.0,0.0,0.5,1.0,0.0,4.0,32982150000000.0,0.001119,0.0,0.0,0.000563,9.1e-05,36760380000.0,176612000.0,0.175992,0.111997,0.0,0.372693,0.0,0.374941,0.148936,0.765181,0.0,0.421213,433376500.0,0.253061,0.0,0.154336,0.503052,0.0,0.0,0.007601,0.0625,0.0,714516984.0
1,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,5,0.837832,2.456929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.249766,0.655652,0.882413,0.851064,0.434783,0.530193,4231.0,792.0,0.0,0.0,0.5,1.0,0.0,5.0,32212300000000.0,0.001232,0.019085,3.063725,0.000635,9.1e-05,40526130000.0,122795700.0,0.172832,0.111997,0.0,0.372693,-0.0,0.374941,0.148936,-0.748298,0.0,0.421213,301320200.0,0.253061,0.016339,0.154336,0.511339,0.0,0.0,0.008427,0.05,0.0,496793088.0
2,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,6,0.695297,2.940633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.249766,0.717538,0.374342,0.851064,0.434783,0.759402,4221.0,801.0,0.0,0.0,0.5,1.0,0.0,6.0,32041350000000.0,0.001243,0.000631,0.972483,0.000379,9.1e-05,39858130000.0,127029700.0,0.088511,0.111997,0.0,0.372693,0.0,0.374941,0.148936,0.266992,0.0,0.421213,311709700.0,0.092182,0.003001,0.056744,0.304528,0.0,0.0,0.008347,0.041667,0.0,513922401.0
3,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,7,0.693164,2.945015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.249766,0.718182,0.369151,0.851064,0.434783,0.761785,4221.0,802.0,0.0,0.0,0.5,1.0,0.0,7.0,32053220000000.0,0.001235,0.001295,3.260081,0.000376,9.1e-05,39823180000.0,127506100.0,0.087635,0.111997,0.0,0.372693,0.0,0.374941,0.148936,0.266992,0.0,0.421213,312878600.0,0.090847,0.006335,0.055924,0.303324,0.0,0.0,0.008338,0.035714,0.0,515849608.0
4,1,140f77741820c02177597651dfea9fe881c1a73d8e4002...,8,0.702145,2.903154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.249766,0.71322,0.409185,0.851064,0.434783,0.743409,4220.0,798.0,0.0,0.0,0.5,1.0,0.0,8.0,31977380000000.0,0.001229,0.003266,9.988361,0.000399,9.1e-05,39928100000.0,125607700.0,0.094395,0.111997,0.0,0.372693,-0.0,0.374941,0.148936,-0.666668,0.0,0.421213,308220400.0,0.101141,0.013004,0.062241,0.32219,0.0,0.0,0.008374,0.03125,0.0,508169592.0


In [19]:
# Convert the pandas DataFrame with the 'dur_by_ocv' features into a Spark DataFrame
dur_by_ocv_df = spark.createDataFrame(dur_by_ocv_FE_cyc_agg_DF)

In [20]:
# Write dur_by_ocv_df (the engineered 'dur_by_ocv' features) as a CSV file to
# use later.
#
# coalesce(1) first collapses the data into a single partition, then
# sortWithinPartitions orders the rows inside that one partition. A global
# orderBy() placed after coalesce(1) would shuffle the rows again into
# spark.sql.shuffle.partitions partitions, so the output would no longer be a
# single sorted part file.
#
# NOTE(review): no save mode is set, so the write fails if the target path
# already exists - confirm whether overwriting is wanted on re-runs.
dur_by_ocv_df.coalesce(1) \
.sortWithinPartitions("cell_no", "protocol", "cycle") \
.write.format("com.databricks.spark.csv") \
.option("header", "true") \
.save("/FileStore/tables/dir_dur_by_ocv_FE_cyc_agg_DF.csv")