In [1]:
#all spark imports
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

#create (or reuse) the Spark session used throughout this notebook
session_builder = SparkSession.builder.appName("EDA_summary")
spark = session_builder.getOrCreate()

#number of shuffle partitions; per the author's note, 8 equals the number of cpu cores available
n_shuffle_partitions = 8
spark.conf.set("spark.sql.shuffle.partitions", n_shuffle_partitions)

In [2]:
#load the complete dataset; the first line holds the column names and Spark infers the column types
csv_path = "/FileStore/tables/GBatteries_alldata.csv"
raw_DF = spark.read.csv(csv_path, header = True, inferSchema = True)

#replace the unix-time "time" column with a timestamp-typed one and mark the result for caching
time_as_timestamp = raw_DF["time"].cast(TimestampType())
all_DF = raw_DF.withColumn("time", time_as_timestamp).cache()

In [3]:
#cache() is lazy, and limit(5) on its own only computes the partitions needed to return 5 rows,
#so run a full count() first to populate the cache with every partition of all_DF
all_DF.count()

#display top 5 rows
display(all_DF.limit(5))

time,ocv,di,i0x2d,i0xc4,i0x91,i0x81,i0x40,i0x32,i0x65,i0x2,i0xbc,i0x30,i0x9f,i0x6b,i0x9,i0x8f,i0x3b,i0xc9,i0xb2,i0x14,i0x76,i0x29,i0x2c,i0xcd,i0x28,i0xb1,i0x83,i0x8c,i0x6,i0x5a,i0x78,i0xa7,i0x2a,i0x8a,i0xb6,i0x5,i0x94,i0x73,average_di,charge_duration,cell#,protocol,cycle
2019-11-21T22:40:00.271+0000,2831.0,0.6350710900473934,0.0,0.0,0.2608695652173913,0.0,1.0,0.0,1.0,0.7382897149524921,0.0909090909090909,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0045022511255627,0.7446808510638298,0.0309471709909346,0.652,0.4722222222222222,0.9473684210526316,0.0,1.0,0.0847457627118644,0.031719532554257,0.0204081632653061,0.75,0.0,0.0030010003334444,0.0036014405762304,0.5627556737878638,599.9833333333332,5,fc8e420058ea073a58debc64048ea686c6aefdd3888d8d6005dd66d9ef5c2576,5
2019-11-21T22:40:10.544+0000,2883.0,0.6066350710900474,0.0,0.0,0.2608695652173913,0.0,1.0,0.0,1.0,0.7382897149524921,0.0909090909090909,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0045022511255627,0.7446808510638298,0.0309471709909346,0.652,0.4722222222222222,0.9473684210526316,0.0,1.0,0.0847457627118644,0.031719532554257,0.0204081632653061,0.75,0.0,0.0030010003334444,0.0036014405762304,0.5627556737878638,599.9833333333332,5,fc8e420058ea073a58debc64048ea686c6aefdd3888d8d6005dd66d9ef5c2576,5
2019-11-21T22:40:20.635+0000,2916.0,0.6018957345971564,0.0,0.0,0.2608695652173913,0.0,1.0,0.0,1.0,0.7382897149524921,0.0909090909090909,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0045022511255627,0.7446808510638298,0.0309471709909346,0.652,0.4722222222222222,0.9473684210526316,0.0,1.0,0.0847457627118644,0.031719532554257,0.0204081632653061,0.75,0.0,0.0030010003334444,0.0036014405762304,0.5627556737878638,599.9833333333332,5,fc8e420058ea073a58debc64048ea686c6aefdd3888d8d6005dd66d9ef5c2576,5
2019-11-21T22:40:30.726+0000,2944.0,0.6066350710900474,0.0,0.0,0.2608695652173913,0.0,1.0,0.0,1.0,0.7382897149524921,0.0909090909090909,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0045022511255627,0.7446808510638298,0.0309471709909346,0.652,0.4722222222222222,0.9473684210526316,0.0,1.0,0.0847457627118644,0.031719532554257,0.0204081632653061,0.75,0.0,0.0030010003334444,0.0036014405762304,0.5627556737878638,599.9833333333332,5,fc8e420058ea073a58debc64048ea686c6aefdd3888d8d6005dd66d9ef5c2576,5
2019-11-21T22:40:40.817+0000,2965.0,0.5876777251184834,0.0,0.0,0.2608695652173913,0.0,1.0,0.0,1.0,0.7382897149524921,0.0909090909090909,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0045022511255627,0.7446808510638298,0.0309471709909346,0.652,0.4722222222222222,0.9473684210526316,0.0,1.0,0.0847457627118644,0.031719532554257,0.0204081632653061,0.75,0.0,0.0030010003334444,0.0036014405762304,0.5627556737878638,599.9833333333332,5,fc8e420058ea073a58debc64048ea686c6aefdd3888d8d6005dd66d9ef5c2576,5


# Examination at cell and protocol level

## How many protocols were tested on each cell?

In [6]:
#per cell: the number of distinct protocols and the set of protocol ids seen for that cell
protocols_per_cell = all_DF.groupBy('cell#').agg(
    countDistinct("protocol"),
    collect_set("protocol"),
)
display(protocols_per_cell.orderBy("cell#"))

cell#,count(protocol),collect_set(protocol)
1,1,List(140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029)
3,5,"List(3c0dc4773fd8e5a688c248825e7a34367e8e5e8f8befd9b8211c6c387d22e116, 8e01bf0c25fbfb5416f4a1bece6392e1cbf8b1e356b144389531e946c27f6437, e4615c5798e4279178bd1cfde95118076e87e25239e39b43291a6356b351bc37, 84731643bd512e8095f1428a4275ed6e77b50eb9cc30cafd687112b37a25d711, 140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029)"
4,3,"List(84731643bd512e8095f1428a4275ed6e77b50eb9cc30cafd687112b37a25d711, f38daef78503f7c81cef066904fe29c4b2acf6acd96a0153230d80f21cd0905d, 140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029)"
5,7,"List(e85e6a6ab35f0bb2dea14e02ec68693d072c8d23e1323372a1c33cb95de06fe3, 3c0dc4773fd8e5a688c248825e7a34367e8e5e8f8befd9b8211c6c387d22e116, f94a27ee4db290ffb8d001b3c38117e1cdc43e6815b40d91cb62ff581abf6b48, e4615c5798e4279178bd1cfde95118076e87e25239e39b43291a6356b351bc37, fc8e420058ea073a58debc64048ea686c6aefdd3888d8d6005dd66d9ef5c2576, 09942314d31dd2553f1e7f827d9e57ce8d811a8b9b7d8fe75fd372c4910b06db, f38daef78503f7c81cef066904fe29c4b2acf6acd96a0153230d80f21cd0905d)"
6,6,"List(e85e6a6ab35f0bb2dea14e02ec68693d072c8d23e1323372a1c33cb95de06fe3, 8e01bf0c25fbfb5416f4a1bece6392e1cbf8b1e356b144389531e946c27f6437, 846b5e27147c9578f42ca206a07dc88943471d40792009223c2483a0e46964ff, f94a27ee4db290ffb8d001b3c38117e1cdc43e6815b40d91cb62ff581abf6b48, 01fd02718c6b7aa23bdfb4c39aee1e7bd4355d4ff22bc01e752f2a1ba82d7b3c, f38daef78503f7c81cef066904fe29c4b2acf6acd96a0153230d80f21cd0905d)"
7,1,List(01fd02718c6b7aa23bdfb4c39aee1e7bd4355d4ff22bc01e752f2a1ba82d7b3c)
9,4,"List(f89b0ae3c67c80e30c07089eb90ab50ad0c817d39f05861fa9d9962ff45647d5, cfa5974c7523fda0f6a27de37fb58a02d6281df22714f3cc73c24d5a45f674d1, 6ab1b145d36867fb9afcce0d16c1f42c48d4153f763b94e3bc1c2c9a7212b3ee, 0ee15df0e1233198be555cea609b7726e1dc914dfa3664a9ed67eca4dd6fb625)"
10,1,List(3c0dc4773fd8e5a688c248825e7a34367e8e5e8f8befd9b8211c6c387d22e116)
11,2,"List(5c766564ef71e08d02be2a27c64853040eebc414195eecec918b4e3465e90d6b, f36836ee54508cbbc99e07d64683255a11bb1f3176c56bdd80bba3a09d68f8ab)"
13,1,List(0622d7ec92353d2cb635403db5f450c3f2b625ab2f2926661d687cf768ba8d4e)


## How many cycles are recorded for each cell-protocol combination?

In [8]:
#number of distinct cycles for every cell-protocol pair.
#The aggregate is aliased explicitly: later cells refer to this column as "count(cycle)",
#and the auto-generated name of an un-aliased countDistinct is not a guaranteed, stable name.
cycles_per_pair = all_DF.groupBy('cell#', 'protocol').agg(countDistinct("cycle").alias("count(cycle)"))

#sort for readable output and cache, since several later cells aggregate this small DF again
panel_DF = cycles_per_pair.orderBy("cell#", "protocol").cache()
display(panel_DF)

cell#,protocol,count(cycle)
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,17
3,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,29
3,3c0dc4773fd8e5a688c248825e7a34367e8e5e8f8befd9b8211c6c387d22e116,4
3,84731643bd512e8095f1428a4275ed6e77b50eb9cc30cafd687112b37a25d711,36
3,8e01bf0c25fbfb5416f4a1bece6392e1cbf8b1e356b144389531e946c27f6437,10
3,e4615c5798e4279178bd1cfde95118076e87e25239e39b43291a6356b351bc37,162
4,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,27
4,84731643bd512e8095f1428a4275ed6e77b50eb9cc30cafd687112b37a25d711,35
4,f38daef78503f7c81cef066904fe29c4b2acf6acd96a0153230d80f21cd0905d,673
5,09942314d31dd2553f1e7f827d9e57ce8d811a8b9b7d8fe75fd372c4910b06db,1


## How many total cycles are performed on each cell?

In [10]:
#add up the per-protocol cycle counts of every cell
#(sum is pyspark.sql.functions.sum from the star import, not the Python builtin)
cycles_per_cell = panel_DF.groupBy('cell#').agg(
    sum("count(cycle)").alias("cell_total_cycles")
)
display(cycles_per_cell.orderBy("cell#"))

cell#,cell_total_cycles
1,17
3,241
4,735
5,91
6,1662
7,1223
9,491
10,345
11,65
13,47


## How many total cycles are performed for each protocol?

In [12]:
#add up the per-cell cycle counts of every protocol
#(sum is pyspark.sql.functions.sum from the star import, not the Python builtin)
cycles_per_protocol = panel_DF.groupBy('protocol').agg(
    sum("count(cycle)").alias("protocol_total_cycles")
)
display(cycles_per_protocol.orderBy("protocol"))

protocol,protocol_total_cycles
01fd02718c6b7aa23bdfb4c39aee1e7bd4355d4ff22bc01e752f2a1ba82d7b3c,1234
0622d7ec92353d2cb635403db5f450c3f2b625ab2f2926661d687cf768ba8d4e,47
09942314d31dd2553f1e7f827d9e57ce8d811a8b9b7d8fe75fd372c4910b06db,1
0ee15df0e1233198be555cea609b7726e1dc914dfa3664a9ed67eca4dd6fb625,50
140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,73
2683681f89ae95b058300629399b1bf3a89afa82e8a7588551b3fd1ce0155a99,2
3c0dc4773fd8e5a688c248825e7a34367e8e5e8f8befd9b8211c6c387d22e116,359
3d35a4d150a34c742054791a565a9b416c774d783eb4c7aae54a1952290c11c6,2454
5c766564ef71e08d02be2a27c64853040eebc414195eecec918b4e3465e90d6b,99
5f2ed209ddb21e76d6889ddee126c3df653f11488da0bc8ee9a641ba33a8155b,53


## In total, how many cycles are recorded in the data?

In [14]:
#grand total of recorded cycles: summing the per cell-protocol counts directly gives the same
#value as first summing per protocol, so the intermediate groupBy/orderBy (an extra shuffle
#and sort) is not needed.
#(sum is pyspark.sql.functions.sum from the star import, not the Python builtin)
display(panel_DF.select(sum("count(cycle)").alias("total_cycles")))

total_cycles
8354


In [15]:
#drop the cached cell-protocol cycle counts from the Spark cache now that the
#cycle-count questions above have been answered
panel_DF.unpersist()

# Examination of features

## What types of features do we have? How many unique values can they take?
### Binary features: 16;
### Ternary features: 6;
### Constant feature: 1;

In [18]:
#one-row DF holding, for every column of all_DF, the number of distinct values it takes;
#each count keeps the name of the column it was computed from
distinct_exprs = [countDistinct(col(name)).alias(name) for name in all_DF.columns]
distinct_cnt_DF = all_DF.agg(*distinct_exprs).cache()
display(distinct_cnt_DF)

time,ocv,di,i0x2d,i0xc4,i0x91,i0x81,i0x40,i0x32,i0x65,i0x2,i0xbc,i0x30,i0x9f,i0x6b,i0x9,i0x8f,i0x3b,i0xc9,i0xb2,i0x14,i0x76,i0x29,i0x2c,i0xcd,i0x28,i0xb1,i0x83,i0x8c,i0x6,i0x5a,i0x78,i0xa7,i0x2a,i0x8a,i0xb6,i0x5,i0x94,i0x73,average_di,charge_duration,cell#,protocol,cycle
1275382,1214,211,2,3,18,2,2,3,2,259,3,2,2,2,2,1,2,2,2,2,2,2,2,70,12,16980,17,16,20,3,2,5,6,9,3,3,70,72,8043,394,20,28,1278


### list all types of features

In [20]:
#fetch the single row of distinct counts once and reuse it for all three lists
#(previously the DF was collected three separate times)
distinct_counts = distinct_cnt_DF.collect()[0].asDict()

#columns taking exactly 2, 3 and 1 distinct value(s) respectively
bin_ftr = [name for (name, n_distinct) in distinct_counts.items() if n_distinct == 2]
ter_ftr = [name for (name, n_distinct) in distinct_counts.items() if n_distinct == 3]
uni_ftr = [name for (name, n_distinct) in distinct_counts.items() if n_distinct == 1]

print('Binary features:', bin_ftr,'count:',len(bin_ftr), '\n')
print('Ternary features:', ter_ftr,'count:',len(ter_ftr), '\n')
print('Constant features:', uni_ftr,'count:',len(uni_ftr), '\n')

In [21]:
#drop the one-row distinct-count DF from the Spark cache; the feature lists built from it
#are already held as plain Python lists
distinct_cnt_DF.unpersist()

## What distinct values does each of the features take in the data?

In [23]:
#only the charging parameters are of interest here, so every other column is excluded by name
excluded_cols = {'time','di','ocv','average_di', 'charge_duration',  'cycle', 'cell#', 'protocol'}
col_i0 = [name for name in all_DF.columns if name not in excluded_cols]

#collect the set of distinct values of every charging parameter into a one-row pandas frame
value_sets = [collect_set(name) for name in col_i0]
all_DF.select(value_sets).toPandas()

Unnamed: 0,collect_set(i0x2d),collect_set(i0xc4),collect_set(i0x91),collect_set(i0x81),collect_set(i0x40),collect_set(i0x32),collect_set(i0x65),collect_set(i0x2),collect_set(i0xbc),collect_set(i0x30),collect_set(i0x9f),collect_set(i0x6b),collect_set(i0x9),collect_set(i0x8f),collect_set(i0x3b),collect_set(i0xc9),collect_set(i0xb2),collect_set(i0x14),collect_set(i0x76),collect_set(i0x29),collect_set(i0x2c),collect_set(i0xcd),collect_set(i0x28),collect_set(i0xb1),collect_set(i0x83),collect_set(i0x8c),collect_set(i0x6),collect_set(i0x5a),collect_set(i0x78),collect_set(i0xa7),collect_set(i0x2a),collect_set(i0x8a),collect_set(i0xb6),collect_set(i0x5),collect_set(i0x94),collect_set(i0x73)
0,"[0.0, 1.0]","[0.0, 1.0, 0.42857142857142855]","[0.39130434782608703, 0.6086956521739131, 0.47...","[0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0, 0.5]","[0.0, 1.0]","[0.0, 0.5000833472245374, 0.5757626271045174, ...","[0.0, 0.09090909090909093, 1.0]","[0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0]",[0.0],"[0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0]","[0.0, 1.0]","[0.0, 0.034517258629314665, 0.0006503251625812...","[0.0, 0.9574468085106383, 0.851063829787234, 1...","[0.0, 0.4480150046889653, 0.2406064395123476, ...","[0.0, 1.0, 0.44, 0.652, 0.632, 0.6920000000000...","[0.0, 1.0, 0.9305555555555556, 0.9402777777777...","[0.0, 0.917464114832536, 0.9138755980861244, 0...","[0.0, 0.75, 1.0]","[0.0, 1.0]","[0.0, 0.0847457627118644, 1.0, 0.8305084745762...","[0.0, 1.0, 0.0651085141903172, 0.0484140233722...","[0.0, 0.020408163265306117, 0.0244897959183673...","[0.0, 0.75, 1.0]","[0.0, 1.0, 0.5]","[0.0, 0.008002667555851951, 0.0225408469489829...","[0.0, 0.0156062424969988, 0.025610244097639057..."


## Which of the features remain constant for a cell-protocol combination?
### Features i0x30, i0x9f, and i0x2c remain constant for a cell-protocol combination (as does i0x8f, which is constant across the whole dataset)

In [25]:
#for every cell-protocol pair, count how many different values each charging parameter takes
pair_keys = ['cell#', 'protocol']
pair_counts = [countDistinct(name) for name in col_i0]
dst_cnt_cell_protocol_DF = all_DF.groupBy(*pair_keys).agg(*pair_counts).orderBy(*pair_keys).cache()
display(dst_cnt_cell_protocol_DF)

cell#,protocol,count(i0x2d),count(i0xc4),count(i0x91),count(i0x81),count(i0x40),count(i0x32),count(i0x65),count(i0x2),count(i0xbc),count(i0x30),count(i0x9f),count(i0x6b),count(i0x9),count(i0x8f),count(i0x3b),count(i0xc9),count(i0xb2),count(i0x14),count(i0x76),count(i0x29),count(i0x2c),count(i0xcd),count(i0x28),count(i0xb1),count(i0x83),count(i0x8c),count(i0x6),count(i0x5a),count(i0x78),count(i0xa7),count(i0x2a),count(i0x8a),count(i0xb6),count(i0x5),count(i0x94),count(i0x73)
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,17,1,1,2,2,1,1,1,1,3,2,2,1,18,17
3,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,1,1,1,1,1,1,1,20,1,1,1,1,1,1,1,1,1,1,1,1,1,9,1,2,2,2,1,1,2,1,2,3,1,1,10,10
3,3c0dc4773fd8e5a688c248825e7a34367e8e5e8f8befd9b8211c6c387d22e116,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,2,1,2,1,1,1,1,1,1,1,1,2,2,2,2
3,84731643bd512e8095f1428a4275ed6e77b50eb9cc30cafd687112b37a25d711,1,1,1,1,1,1,1,21,1,1,1,1,1,1,1,1,1,1,1,1,1,14,1,1,2,2,1,1,2,1,2,2,1,1,15,14
3,8e01bf0c25fbfb5416f4a1bece6392e1cbf8b1e356b144389531e946c27f6437,1,2,2,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1
3,e4615c5798e4279178bd1cfde95118076e87e25239e39b43291a6356b351bc37,2,1,5,1,2,1,2,27,2,1,1,1,1,1,2,1,1,1,1,1,1,2,2,7,3,4,5,1,2,1,2,2,2,1,2,2
4,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,1,1,1,1,1,1,2,18,1,1,1,1,1,1,1,1,1,1,1,1,1,8,1,1,2,2,2,1,2,1,2,2,1,1,8,8
4,84731643bd512e8095f1428a4275ed6e77b50eb9cc30cafd687112b37a25d711,1,1,1,1,1,1,1,22,1,1,1,1,1,1,1,1,1,1,1,1,1,9,2,2,3,3,2,1,2,2,3,3,2,1,10,10
4,f38daef78503f7c81cef066904fe29c4b2acf6acd96a0153230d80f21cd0905d,1,1,1,1,1,1,2,50,2,1,1,1,1,1,2,1,1,1,1,1,1,1,2,2,3,2,2,1,1,1,3,3,2,1,2,2
5,09942314d31dd2553f1e7f827d9e57ce8d811a8b9b7d8fe75fd372c4910b06db,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [26]:
#gather, per column, the set of values seen across all cell-protocol rows;
#a feature whose set is just [1] has only one unique value within every cell_protocol combination
pair_count_sets = [collect_set(name) for name in dst_cnt_cell_protocol_DF.columns]
dst_cnt_cell_protocol_DF.select(*pair_count_sets).toPandas()

Unnamed: 0,collect_set(cell#),collect_set(protocol),collect_set(count(i0x2d)),collect_set(count(i0xc4)),collect_set(count(i0x91)),collect_set(count(i0x81)),collect_set(count(i0x40)),collect_set(count(i0x32)),collect_set(count(i0x65)),collect_set(count(i0x2)),collect_set(count(i0xbc)),collect_set(count(i0x30)),collect_set(count(i0x9f)),collect_set(count(i0x6b)),collect_set(count(i0x9)),collect_set(count(i0x8f)),collect_set(count(i0x3b)),collect_set(count(i0xc9)),collect_set(count(i0xb2)),collect_set(count(i0x14)),collect_set(count(i0x76)),collect_set(count(i0x29)),collect_set(count(i0x2c)),collect_set(count(i0xcd)),collect_set(count(i0x28)),collect_set(count(i0xb1)),collect_set(count(i0x83)),collect_set(count(i0x8c)),collect_set(count(i0x6)),collect_set(count(i0x5a)),collect_set(count(i0x78)),collect_set(count(i0xa7)),collect_set(count(i0x2a)),collect_set(count(i0x8a)),collect_set(count(i0xb6)),collect_set(count(i0x5)),collect_set(count(i0x94)),collect_set(count(i0x73))
0,"[9, 1, 24, 3, 18, 10, 4, 11, 26, 27, 19, 13, 5...",[e85e6a6ab35f0bb2dea14e02ec68693d072c8d23e1323...,"[1, 2]","[1, 2]","[1, 5, 2, 6, 3, 4]","[1, 2]","[1, 2]","[1, 2, 3]","[1, 2]","[27, 1, 20, 2, 50, 42, 21, 18, 22, 8, 23]","[1, 2]",[1],[1],"[1, 2]","[1, 2]",[1],"[1, 2]","[1, 2]","[1, 2]","[1, 2]","[1, 2]","[1, 2]",[1],"[15, 9, 1, 13, 2, 17, 50, 3, 10, 14, 11, 8]","[1, 2, 3]","[706, 2121, 1, 611, 1403, 10767, 205, 2, 286, ...","[1, 2, 3, 4]","[1, 5, 2, 3, 4]","[1, 5, 2, 3, 4]","[1, 2]","[1, 2]","[1, 2, 3]","[1, 2, 3]","[1, 2, 3]","[1, 2]","[1, 2]","[15, 1, 13, 2, 50, 3, 18, 10, 11, 8]","[15, 12, 1, 13, 2, 17, 50, 3, 10, 14, 8]"


In [27]:
#drop the per cell-protocol distinct counts from the Spark cache; the analysis of
#cell-protocol combinations above is complete
dst_cnt_cell_protocol_DF.unpersist()

## Which features remain constant for a protocol?
### Features i0x9f and i0x2c remain constant for a given protocol

In [29]:
#for every protocol, count how many different values each charging parameter takes
protocol_counts = [countDistinct(name) for name in col_i0]
counts_by_protocol = all_DF.groupBy('protocol').agg(*protocol_counts)
dst_cnt_protocol_DF = counts_by_protocol.orderBy('protocol').cache()
display(dst_cnt_protocol_DF)

protocol,count(i0x2d),count(i0xc4),count(i0x91),count(i0x81),count(i0x40),count(i0x32),count(i0x65),count(i0x2),count(i0xbc),count(i0x30),count(i0x9f),count(i0x6b),count(i0x9),count(i0x8f),count(i0x3b),count(i0xc9),count(i0xb2),count(i0x14),count(i0x76),count(i0x29),count(i0x2c),count(i0xcd),count(i0x28),count(i0xb1),count(i0x83),count(i0x8c),count(i0x6),count(i0x5a),count(i0x78),count(i0xa7),count(i0x2a),count(i0x8a),count(i0xb6),count(i0x5),count(i0x94),count(i0x73)
01fd02718c6b7aa23bdfb4c39aee1e7bd4355d4ff22bc01e752f2a1ba82d7b3c,2,2,2,1,1,3,2,2,2,1,1,2,2,1,2,2,2,1,1,2,1,3,2,2,3,3,4,1,2,2,2,2,2,1,3,3
0622d7ec92353d2cb635403db5f450c3f2b625ab2f2926661d687cf768ba8d4e,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1
09942314d31dd2553f1e7f827d9e57ce8d811a8b9b7d8fe75fd372c4910b06db,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0ee15df0e1233198be555cea609b7726e1dc914dfa3664a9ed67eca4dd6fb625,1,1,1,2,1,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,1,50,2,1,1,2,1,1,1,1,1,1,1,1,50,50
140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,2,1,1,1,1,1,2,34,1,1,1,1,1,1,1,1,1,1,1,1,1,18,1,2,2,2,2,1,2,1,3,3,2,1,19,19
2683681f89ae95b058300629399b1bf3a89afa82e8a7588551b3fd1ce0155a99,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,286,2,2,2,1,1,1,1,2,1,1,1,1
3c0dc4773fd8e5a688c248825e7a34367e8e5e8f8befd9b8211c6c387d22e116,1,2,1,1,1,3,2,1,1,2,1,1,2,1,1,2,1,1,2,2,1,3,3,3,2,1,4,1,2,2,1,2,3,2,2,3
3d35a4d150a34c742054791a565a9b416c774d783eb4c7aae54a1952290c11c6,2,2,2,1,2,2,2,2,2,1,1,2,2,1,2,2,2,1,2,2,1,2,2,2,2,2,2,1,1,2,2,2,2,1,2,2
5c766564ef71e08d02be2a27c64853040eebc414195eecec918b4e3465e90d6b,1,1,3,1,2,3,2,2,2,1,1,2,1,1,2,1,1,2,2,2,1,3,2,3897,1,1,3,1,1,1,1,2,2,2,3,3
5f2ed209ddb21e76d6889ddee126c3df653f11488da0bc8ee9a641ba33a8155b,1,1,3,1,1,2,2,2,2,1,1,1,1,1,2,1,1,1,1,1,1,3,2,2121,1,1,2,1,1,1,1,2,2,2,3,3


In [30]:
#gather, per column, the set of values seen across all protocol rows;
#a feature whose set is just [1] has only one unique value within every protocol
protocol_count_sets = [collect_set(name) for name in dst_cnt_protocol_DF.columns]
dst_cnt_protocol_DF.select(*protocol_count_sets).toPandas()

Unnamed: 0,collect_set(protocol),collect_set(count(i0x2d)),collect_set(count(i0xc4)),collect_set(count(i0x91)),collect_set(count(i0x81)),collect_set(count(i0x40)),collect_set(count(i0x32)),collect_set(count(i0x65)),collect_set(count(i0x2)),collect_set(count(i0xbc)),collect_set(count(i0x30)),collect_set(count(i0x9f)),collect_set(count(i0x6b)),collect_set(count(i0x9)),collect_set(count(i0x8f)),collect_set(count(i0x3b)),collect_set(count(i0xc9)),collect_set(count(i0xb2)),collect_set(count(i0x14)),collect_set(count(i0x76)),collect_set(count(i0x29)),collect_set(count(i0x2c)),collect_set(count(i0xcd)),collect_set(count(i0x28)),collect_set(count(i0xb1)),collect_set(count(i0x83)),collect_set(count(i0x8c)),collect_set(count(i0x6)),collect_set(count(i0x5a)),collect_set(count(i0x78)),collect_set(count(i0xa7)),collect_set(count(i0x2a)),collect_set(count(i0x8a)),collect_set(count(i0xb6)),collect_set(count(i0x5)),collect_set(count(i0x94)),collect_set(count(i0x73))
0,[e85e6a6ab35f0bb2dea14e02ec68693d072c8d23e1323...,"[1, 2]","[1, 2]","[1, 5, 2, 3, 8]","[1, 2]","[1, 2]","[1, 2, 3]","[1, 2]","[45, 1, 34, 37, 2, 28, 40, 8, 92]","[1, 2]","[1, 2]",[1],"[1, 2]","[1, 2]",[1],"[1, 2]","[1, 2]","[1, 2]","[1, 2]","[1, 2]","[1, 2]",[1],"[15, 1, 13, 2, 50, 3, 18, 10, 14]","[1, 2, 3]","[2121, 1, 2, 286, 1955, 10826, 3, 1977, 1523, ...","[1, 2, 3, 4]","[1, 2, 6, 3, 4]","[1, 5, 2, 3, 4]","[1, 2]","[1, 2]","[1, 2, 3]","[1, 2, 3]","[1, 2, 3]","[1, 2, 3]","[1, 2]","[15, 1, 19, 13, 2, 50, 3, 11]","[15, 1, 19, 2, 50, 3, 10, 14]"


In [31]:
#drop the per-protocol distinct counts from the Spark cache; the protocol-level
#analysis above is complete
dst_cnt_protocol_DF.unpersist()

## Which of the features remain constant during a cycle?

### Features constant in a cycle (total 19):
### ['i0x2d', 'i0xc4', 'i0x81', 'i0x40', 'i0x32', 'i0xbc', 'i0x30', 'i0x9f', 'i0x6b', 'i0x9', 'i0x8f', 'i0x3b', 'i0xc9', 'i0xb2', 'i0x14', 'i0x76', 'i0x29', 'i0x2c', 'i0x5']
### (16 cycle-specific, plus the 3 from above that also remain constant for a cell-protocol combination)

In [34]:
#for every (cell, protocol, cycle) group, count how many different values each charging
#parameter takes; each count keeps the name of the feature it was computed from
cycle_keys = ['cell#', 'protocol', 'cycle']
cycle_counts = [countDistinct(name).alias(name) for name in col_i0]
dst_cnt_cycle_DF = all_DF.groupBy(*cycle_keys).agg(*cycle_counts).orderBy(*cycle_keys).cache()

#show only the first 10 groups
display(dst_cnt_cycle_DF.limit(10))

cell#,protocol,cycle,i0x2d,i0xc4,i0x91,i0x81,i0x40,i0x32,i0x65,i0x2,i0xbc,i0x30,i0x9f,i0x6b,i0x9,i0x8f,i0x3b,i0xc9,i0xb2,i0x14,i0x76,i0x29,i0x2c,i0xcd,i0x28,i0xb1,i0x83,i0x8c,i0x6,i0x5a,i0x78,i0xa7,i0x2a,i0x8a,i0xb6,i0x5,i0x94,i0x73
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,1,1,1,1,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,1,1,1,1,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,2,1,1,1,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,7,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,2,1,1,1,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,2,1,1,1,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,9,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,2,1,1,1,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,10,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,2,1,1,2,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,11,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,2,1,1,1,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,12,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,2,1,1,1,1
1,140f77741820c02177597651dfea9fe881c1a73d8e4002a87d0148967cc0f029,13,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,1,1,1,1,2,2,1,1,1,1


In [35]:
#gather, per column, the set of values seen across all cycle rows;
#a feature whose set is just [1] has only one unique value within every cycle
cycle_level_sets = [collect_set(name) for name in dst_cnt_cycle_DF.columns]
dst_cnt_cycle_DF.select(*cycle_level_sets).toPandas()

Unnamed: 0,collect_set(cell#),collect_set(protocol),collect_set(cycle),collect_set(i0x2d),collect_set(i0xc4),collect_set(i0x91),collect_set(i0x81),collect_set(i0x40),collect_set(i0x32),collect_set(i0x65),collect_set(i0x2),collect_set(i0xbc),collect_set(i0x30),collect_set(i0x9f),collect_set(i0x6b),collect_set(i0x9),collect_set(i0x8f),collect_set(i0x3b),collect_set(i0xc9),collect_set(i0xb2),collect_set(i0x14),collect_set(i0x76),collect_set(i0x29),collect_set(i0x2c),collect_set(i0xcd),collect_set(i0x28),collect_set(i0xb1),collect_set(i0x83),collect_set(i0x8c),collect_set(i0x6),collect_set(i0x5a),collect_set(i0x78),collect_set(i0xa7),collect_set(i0x2a),collect_set(i0x8a),collect_set(i0xb6),collect_set(i0x5),collect_set(i0x94),collect_set(i0x73)
0,"[9, 1, 24, 3, 18, 10, 4, 11, 26, 27, 19, 13, 5...",[e85e6a6ab35f0bb2dea14e02ec68693d072c8d23e1323...,"[843, 356, 437, 793, 306, 387, 743, 256, 206, ...",[1],[1],"[1, 2]",[1],[1],[1],"[1, 2]","[1, 2]",[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],[1],"[1, 2]","[1, 2, 3]","[102, 52, 2, 83, 156, 106, 98, 77, 48, 27, 121...","[1, 2, 3, 4]","[1, 2, 3, 4]","[1, 2, 3]","[1, 2]","[1, 2]","[1, 2]","[1, 2]","[1, 2]","[1, 2]",[1],"[1, 2]","[1, 2]"


In [36]:
#collapse the per-cycle distinct counts into one set per feature. Only the charging-parameter
#columns (col_i0) are inspected, so a grouping key (cell#, protocol, cycle) whose values happen
#to be just [1] can never be reported as a constant feature.
cycle_count_sets = dst_cnt_cycle_DF.select(*[collect_set(c).alias(c) for c in col_i0]).collect()[0].asDict()

#create a list of features that remain constant within a cycle:
#every cycle must show exactly one distinct value, i.e. the collected set is [1]
const_in_cycle = [name for (name, counts) in cycle_count_sets.items() if counts == [1]]

#list the names of features that are constant within a cycle
print('features constant in a cycle:', const_in_cycle)

In [37]:
#drop both the per-cycle distinct counts and the full dataset from the Spark cache;
#these are the last statements of the visible notebook
dst_cnt_cycle_DF.unpersist()
all_DF.unpersist()