In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder


from user_definition import *

# 1. Load data

In [2]:
# Build (or reuse) the Spark session. The PostgreSQL JDBC driver jar is put on
# the driver classpath; the memory settings apply to the driver and executors.
ss = (
    SparkSession.builder
    .config('spark.driver.extraClassPath', 'postgresql-42.2.18.jar')
    .config("spark.driver.memory", "15g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)
sc = ss.sparkContext

## 1.1. Read activity_code file from AWS RDS

In [3]:
# Load the activity/code lookup table over JDBC. `url`, `table` and `properties`
# are not defined in this notebook (presumably from user_definition — confirm).
activity_code = ss.read.jdbc(url=url, table=table, properties=properties)

In [4]:
activity_code.show()

+--------------------+----+
|            activity|code|
+--------------------+----+
|             Walking|   A|
|             Jogging|   B|
|              Stairs|   C|
|             Sitting|   D|
|            Standing|   E|
|              Typing|   F|
|      Brushing Teeth|   G|
|         Eating Soup|   H|
|        Eating Chips|   I|
|        Eating Pasta|   J|
|   Drinking from Cup|   K|
|     Eating Sandwich|   L|
| Kicking Soccer Ball|   M|
|Playing Catch w/T...|   O|
|Dribblinlg Basket...|   P|
|             Writing|   Q|
|            Clapping|   R|
|     Folding Clothes|   S|
+--------------------+----+



## 1.2. Read sensor files from local machine

In [5]:
# Read the raw sensor files into an RDD (helper defined outside this notebook).
files_rdd = file_rdd(ss, files)

# Column layout of one sensor reading; every field is declared non-nullable.
sensor_fields = [
    ("subject_id", IntegerType()),
    ("sensor", StringType()),
    ("device", StringType()),
    ("activity_code", StringType()),
    ("timestamp", LongType()),
    ("x", FloatType()),
    ("y", FloatType()),
    ("z", FloatType()),
]
schema = StructType(
    [StructField(field_name, field_type, False) for field_name, field_type in sensor_fields]
)

# Build the sensor-reading DataFrame from the RDD using the schema above.
df_record = create_activity_df(ss, files_rdd, schema)

In [6]:
df_record.show()



+----------+------+------+-------------+---------------+------------+------------+------------+
|subject_id|sensor|device|activity_code|      timestamp|           x|           y|           z|
+----------+------+------+-------------+---------------+------------+------------+------------+
|      1613|  gyro| phone|            A|178468071944614|-0.020240024|-0.004261058|-0.023435818|
|      1613|  gyro| phone|            A|178468104194617|  -2.5750105|  0.18109496|   1.3864417|
|      1613|  gyro| phone|            A|178468142811857|  -1.5739282|   0.6668556|    1.320928|
|      1613|  gyro| phone|            A|178468183987271|  -1.5041534|   1.7973675|    0.824781|
|      1613|  gyro| phone|            A|178468225406856| -0.50786483|   1.6002935|  0.45833004|
|      1613|  gyro| phone|            A|178468263750919|   0.8072041|   1.4295849|    0.406931|
|      1613|  gyro| phone|            A|178468303909407|   2.7057717|   1.1065434|  0.22610238|
|      1613|  gyro| phone|            A|

In [7]:
def f2(x):
    """Return True when activity name ``x`` contains any entry of ``eating_strings``.

    Only ``x`` is lower-cased before the substring test; the entries of
    ``eating_strings`` are compared as-is.

    Args:
        x: Activity name (str), or None for a null column value.

    Returns:
        bool: True if at least one entry of ``eating_strings`` is a substring
        of the lower-cased name; False otherwise, including when ``x`` is None.
    """
    # A null column value reaches a Python UDF as None; treat it as "no match"
    # instead of raising AttributeError on .lower().
    if x is None:
        return False
    x_lower = x.lower()
    return any(s in x_lower for s in eating_strings)
            
# Wrap f2 as a Spark UDF with a boolean return type so it can be used as a filter predicate.
check_activity_w_eating = udf(f2, BooleanType())

# 2. Return the codes of the activities whose name includes an eating string

In [8]:
# Codes of the activities whose name matches an eating string,
# de-duplicated and sorted by code.
activity_code_w_eating = (
    activity_code
    .filter(check_activity_w_eating('activity'))
    .select('code')
    .distinct()
    .orderBy('code')
)

activity_code_w_eating.show()

+----+
|code|
+----+
|   H|
|   I|
|   J|
|   L|
+----+



# 3. Create a target variable that is 1 if the activity name includes an eating string, otherwise 0

In [9]:
cols_repartition_3 = ['subject_id', 'timestamp', 'device', 'sensor']

# Attach the activity name to every sensor reading via its activity code, then
# repartition on the reading's key columns and cache the result for reuse.
df_record_activity = (
    df_record
    .join(activity_code, df_record.activity_code == activity_code.code)
    .repartition(*cols_repartition_3)
    .cache()
)
df_record_activity.show()

+----------+------+------+-------------+---------------+-------------+-------------+-------------+-----------------+----+
|subject_id|sensor|device|activity_code|      timestamp|            x|            y|            z|         activity|code|
+----------+------+------+-------------+---------------+-------------+-------------+-------------+-----------------+----+
|      1613|  gyro| phone|            K|175770089551403| -0.009321064|  -0.00479369|-0.0026631611|Drinking from Cup|   K|
|      1613|  gyro| phone|            K|175771247834215| -0.006657903| -0.008255799|  -0.00479369|Drinking from Cup|   K|
|      1613|  gyro| phone|            K|175781381711294| -0.010386328|-0.0031957934|-7.9894834E-4|Drinking from Cup|   K|
|      1613|  gyro| phone|            K|175789282129781|-0.0053263223| -0.005060006| 0.0010652645|Drinking from Cup|   K|
|      1613|  gyro| phone|            K|175796666590885| -0.006924219| -0.007190535|-0.0013315806|Drinking from Cup|   K|
|      1613|  gyro| phon

In [10]:
def f3(x):
    """Return 1 when activity name ``x`` contains any entry of ``eating_strings``, else 0.

    Only ``x`` is lower-cased before the substring test; the entries of
    ``eating_strings`` are compared as-is.

    Args:
        x: Activity name (str), or None for a null column value.

    Returns:
        int: 1 if at least one entry of ``eating_strings`` is a substring of
        the lower-cased name; 0 otherwise, including when ``x`` is None.
    """
    # A null column value reaches a Python UDF as None; label it 0 instead of
    # raising AttributeError on .lower().
    if x is None:
        return 0
    x_lower = x.lower()
    return int(any(s in x_lower for s in eating_strings))
            
# Wrap f3 as a Spark UDF with an integer return type; it produces the 0/1 target column.
binary_encoder = udf(f3, IntegerType())

In [11]:
# Sensor-reading columns kept from the joined frame (the lookup's `activity`
# and `code` columns are not in this list).
cols_select_3 = ['subject_id', 'sensor', 'device', 'activity_code', 'timestamp',  'x', 'y', 'z' ]
# Sort keys for the labelled frame.
cols_orderBy_3 = ['subject_id', 'timestamp', 'device', 'sensor']


In [12]:
# Keep the sensor columns and add the 0/1 `eating` target computed from the
# activity name. The UDF column is named with .alias() directly, rather than
# renaming Spark's auto-generated 'f3(activity)' column afterwards: that name
# depends on the Python function's name, and withColumnRenamed is a silent
# no-op when the source name does not match.
df_record_w_eating = (
    df_record_activity
    .select(*cols_select_3, binary_encoder('activity').alias('eating'))
    .orderBy(*cols_orderBy_3)
)

df_record_w_eating.show(n)

+----------+------+------+-------------+--------------+----------+----------+----------+------+
|subject_id|sensor|device|activity_code|     timestamp|         x|         y|         z|eating|
+----------+------+------+-------------+--------------+----------+----------+----------+------+
|      1600| accel| watch|            D|79817308824838|-0.1666963| 1.5316905| 10.057592|     0|
|      1600| accel| watch|            D|79817358500488|  3.613748|-1.0540473| 11.779023|     0|
|      1600|  gyro| watch|            D|79817358500488| -1.365979|-1.5444704|-1.6969953|     0|
|      1600| accel| watch|            D|79817408176138| 2.0886416|-3.4386723|  12.97373|     0|
|      1600|  gyro| watch|            D|79817408176138|-1.9071333|-1.2696322|-1.8173702|     0|
+----------+------+------+-------------+--------------+----------+----------+----------+------+
only showing top 5 rows



In [13]:
df_record_w_eating.count()

15630426

In [14]:
df_record.count()

15630426

In [15]:
df_record_w_eating.printSchema()

root
 |-- subject_id: integer (nullable = false)
 |-- sensor: string (nullable = false)
 |-- device: string (nullable = false)
 |-- activity_code: string (nullable = false)
 |-- timestamp: long (nullable = false)
 |-- x: float (nullable = false)
 |-- y: float (nullable = false)
 |-- z: float (nullable = false)
 |-- eating: integer (nullable = true)



# 4. Create features for the coordinates for accelerometer and gyroscope

In [16]:
# Accelerometer readings only; the x/y/z columns get an `accel_` prefix so they
# stay distinguishable once combined with the gyroscope readings.
df_accel = df_record_w_eating.filter("sensor == 'accel'")
for axis in ('x', 'y', 'z'):
    df_accel = df_accel.withColumnRenamed(axis, 'accel_' + axis)
df_accel.show(5)

+----------+------+------+-------------+--------------+----------+----------+---------+------+
|subject_id|sensor|device|activity_code|     timestamp|   accel_x|   accel_y|  accel_z|eating|
+----------+------+------+-------------+--------------+----------+----------+---------+------+
|      1600| accel| watch|            D|79817308824838|-0.1666963| 1.5316905|10.057592|     0|
|      1600| accel| watch|            D|79817358500488|  3.613748|-1.0540473|11.779023|     0|
|      1600| accel| watch|            D|79817408176138| 2.0886416|-3.4386723| 12.97373|     0|
|      1600| accel| watch|            D|79817457851788| 1.7319057|-2.5504234| 9.820566|     0|
|      1600| accel| watch|            D|79817507527438| 3.0319571|-2.7802668|14.082246|     0|
+----------+------+------+-------------+--------------+----------+----------+---------+------+
only showing top 5 rows



In [17]:
# Gyroscope readings only; the x/y/z columns get a `gyro_` prefix so they stay
# distinguishable once combined with the accelerometer readings.
df_gyro = df_record_w_eating.filter("sensor == 'gyro'")
for axis in ('x', 'y', 'z'):
    df_gyro = df_gyro.withColumnRenamed(axis, 'gyro_' + axis)
df_gyro.show(5)


+----------+------+------+-------------+--------------+-----------+-----------+------------+------+
|subject_id|sensor|device|activity_code|     timestamp|     gyro_x|     gyro_y|      gyro_z|eating|
+----------+------+------+-------------+--------------+-----------+-----------+------------+------+
|      1600|  gyro| watch|            D|79817358500488|  -1.365979| -1.5444704|  -1.6969953|     0|
|      1600|  gyro| watch|            D|79817408176138| -1.9071333| -1.2696322|  -1.8173702|     0|
|      1600|  gyro| watch|            D|79817457851788| -1.4416127|-0.42487752| -0.28658515|     0|
|      1600|  gyro| watch|            D|79817507527438|  1.1480451| -1.4720324|  0.23645967|     0|
|      1600|  gyro| watch|            D|79817557203088|-0.67249185|0.048099883|-0.059683837|     0|
+----------+------+------+-------------+--------------+-----------+-----------+------------+------+
only showing top 5 rows



In [18]:
# Pair each accelerometer reading with the gyroscope reading that has the same
# activity code, device and timestamp.
# NOTE(review): subject_id is not part of the join condition — confirm that
# timestamps cannot coincide across subjects for the same activity and device.
cond = (df_accel.activity_code == df_gyro.activity_code)\
        & (df_accel.device == df_gyro.device)\
        & (df_accel.timestamp == df_gyro.timestamp)

# Column order of the combined frame; the last six entries are the readings.
cols_rearrange_4= ['activity_code', 'subject_id', 'timestamp', 'device', 'eating', 'accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z']

# Inner join on the condition above; distinct() removes exact duplicate rows.
df_same_activity_device_time = df_accel.join(df_gyro, cond).distinct()
df_same_activity_device_time.count()

5901089

In [19]:
# The join leaves two copies of the shared columns (one from each side). Drop
# the gyroscope-side copies, then put the remaining columns in the
# cols_rearrange_4 order.
df_same_ag = df_same_activity_device_time.drop(df_gyro.subject_id)\
                                        .drop(df_gyro.activity_code)\
                                        .drop(df_gyro.device)\
                                        .drop(df_gyro.eating)\
                                        .drop(df_gyro.timestamp)\
                                        .select(cols_rearrange_4)

df_same_ag.show(5)

+-------------+----------+-------------+------+------+-----------+----------+----------+-----------+----------+-----------+
|activity_code|subject_id|    timestamp|device|eating|    accel_x|   accel_y|   accel_z|     gyro_x|    gyro_y|     gyro_z|
+-------------+----------+-------------+------+------+-----------+----------+----------+-----------+----------+-----------+
|            A|      1623|2520873744873| phone|     0| -2.7174225|-2.9451294| 0.4654541| 0.65600586|-0.7966461| -0.4618988|
|            A|      1623|2525808428600| phone|     0|   4.735489| -13.04805|-1.6248474|   -1.61586|-0.5095062|-0.42541504|
|            A|      1623|2529887102917| phone|     0|-0.13652039| -9.755417|-1.0368042|  1.0764465|  0.411911| 0.55474854|
|            A|      1623|2541569242466| phone|     0|  2.7370605| -11.87529|-1.1391144|  1.6962128| 1.2458191|  0.1650238|
|            A|      1623|2546604630192| phone|     0| -1.4176025|-3.7265472|  0.475235|-0.96173096|-1.7243805| -0.5609436|
+-------

# 5. Create features for the coordinate readings 1 to `window_size` rows after the current row

In [20]:
# Each (subject, activity, device) combination is its own window partition.
cols_partitionBy_5 = ['subject_id', 'activity_code', 'device']
# Sort keys; within a partition only `timestamp` varies, so rows are ordered by time.
cols_orderBy_5 = ['subject_id', 'activity_code', 'device', 'timestamp']

# Window used by the lead() features computed in a later cell.
windowsSpec = Window.partitionBy(cols_partitionBy_5).orderBy(cols_orderBy_5)

In [21]:
# Names of the six reading columns (accel_x .. gyro_z) that get lead features.
# They are taken from cols_rearrange_4 — the list df_same_ag was selected with —
# instead of df_same_ag.columns[-6:], so re-running this cell after the lead
# columns have been added still yields the original six names.
# cols_orderBy_5 is already defined next to the window spec and is not
# redefined here.
current_ord_columns = cols_rearrange_4[-6:]

In [22]:
# For every offset 1..window_size and every reading column, add a
# `lead_<offset>_<column>` feature holding the value `offset` rows ahead within
# the same window partition.
# The loop variable is deliberately not called `col`: that name would overwrite
# pyspark.sql.functions.col brought in by the star import.
for i in range(1, window_size + 1):
    for coord in current_ord_columns:
        df_same_ag = df_same_ag.withColumn(f'lead_{i}_{coord}', lead(coord, i).over(windowsSpec))

df_same_ag = df_same_ag.orderBy(cols_orderBy_5)
df_same_ag.show(n)

+-------------+----------+---------------+------+------+-----------+---------+-----------+-----------+-----------+-----------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+
|activity_code|subject_id|      timestamp|device|eating|    accel_x|  accel_y|    accel_z|     gyro_x|     gyro_y|     gyro_z|lead_1_accel_x|lead_1_accel_y|lead_1_accel_z|lead_1_gyro_x|lead_1_gyro_y|lead_1_gyro_z|lead_2_accel_x|lead_2_accel_y|lead_2_accel_z|lead_2_gyro_x|lead_2_gyro_y|lead_2_gyro_z|lead_3_accel_x|lead_3_accel_y|lead_3_accel_z|lead_3_gyro_x|lead_3_gyro_y|lead_3_gyro_z|
+-------------+----------+---------------+------+------+-----------+---------+-----------+-----------+-----------+-----------+--------------+--------------+--------------+-------------+-------------+-------------+-----------

In [23]:
# Copy of the lead-feature frame without the activity_code column.
# NOTE(review): the later device-encoding cell reads df_same_ag, not this
# frame — confirm whether df_same_ag_w_lead is meant to be used downstream.
df_same_ag_w_lead = df_same_ag.drop('activity_code')

df_same_ag_w_lead.show(n)

+----------+---------------+------+------+-----------+---------+-----------+-----------+-----------+-----------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+
|subject_id|      timestamp|device|eating|    accel_x|  accel_y|    accel_z|     gyro_x|     gyro_y|     gyro_z|lead_1_accel_x|lead_1_accel_y|lead_1_accel_z|lead_1_gyro_x|lead_1_gyro_y|lead_1_gyro_z|lead_2_accel_x|lead_2_accel_y|lead_2_accel_z|lead_2_gyro_x|lead_2_gyro_y|lead_2_gyro_z|lead_3_accel_x|lead_3_accel_y|lead_3_accel_z|lead_3_gyro_x|lead_3_gyro_y|lead_3_gyro_z|
+----------+---------------+------+------+-----------+---------+-----------+-----------+-----------+-----------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+--------

# 6. One-hot encode device feature

In [24]:
def indexStringColumns(df, cols):
    """Replace each column named in ``cols`` with its StringIndexer index.

    For every column a StringIndexer is fitted on the current frame, its
    output is written to a temporary ``<name>-num`` column, the original
    column is dropped, and the temporary column is renamed back to the
    original name.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to index.

    Returns:
        The DataFrame with each listed column replaced by its index column;
        ``df`` itself when ``cols`` is empty.
    """
    indexed = df
    for name in cols:
        tmp_name = name + "-num"
        model = StringIndexer(inputCol=name, outputCol=tmp_name).fit(indexed)
        indexed = (
            model.transform(indexed)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return indexed

# Replace the string `device` column with its StringIndexer index.
dfnumeric = indexStringColumns(df_same_ag, ["device"])

In [25]:
def oneHotEncodeColumns(df, cols):
    """Replace each column named in ``cols`` with its one-hot encoded vector.

    For every column a OneHotEncoder (``dropLast=False``, so no category is
    dropped from the encoded vector) is fitted on the current frame, its
    output is written to a temporary ``<name>-onehot`` column, the original
    column is dropped, and the temporary column is renamed back to the
    original name.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to encode.

    Returns:
        The DataFrame with each listed column replaced by its one-hot vector
        column; ``df`` itself when ``cols`` is empty.
    """
    encoded = df
    for name in cols:
        tmp_name = name + "-onehot"
        encoder = OneHotEncoder(inputCol=name, outputCol=tmp_name, dropLast=False)
        fitted = encoder.fit(encoded)
        encoded = (
            fitted.transform(encoded)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return encoded

# One-hot encode the indexed `device` column.
dfhot = oneHotEncodeColumns(dfnumeric, ["device"])  

In [26]:
# Positional selection from dfhot.columns: indices 1-2, then 'device' by name,
# then index 4 up to (but excluding) the last column. Indices 0 and 3 are left
# out; per the displayed frames these are activity_code and eating, and the
# last column is device — this depends on the current column order.
dfhot_columns = dfhot.columns[1:3] + ['device'] + dfhot.columns[4:-1]
# Sort keys for the rearranged frame.
cols_orderBy_6 = ['subject_id', 'timestamp', 'device']


In [27]:
# Sort by subject, timestamp and device, then keep the columns chosen in dfhot_columns.
dfhot_rearrange = dfhot.orderBy(cols_orderBy_6).select(dfhot_columns)

dfhot_rearrange.show(n)

+----------+--------------+-------------+---------+----------+---------+-----------+-----------+------------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+
|subject_id|     timestamp|       device|  accel_x|   accel_y|  accel_z|     gyro_x|     gyro_y|      gyro_z|lead_1_accel_x|lead_1_accel_y|lead_1_accel_z|lead_1_gyro_x|lead_1_gyro_y|lead_1_gyro_z|lead_2_accel_x|lead_2_accel_y|lead_2_accel_z|lead_2_gyro_x|lead_2_gyro_y|lead_2_gyro_z|lead_3_accel_x|lead_3_accel_y|lead_3_accel_z|lead_3_gyro_x|lead_3_gyro_y|lead_3_gyro_z|
+----------+--------------+-------------+---------+----------+---------+-----------+-----------+------------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+---

# 7. Create a feature column which assembles the coordinate features

In [28]:
# Columns assembled into the feature vector: index 4 up to (but excluding) the
# last column of dfhot. Per the displayed frames these are the accel_*/gyro_*
# readings and their lead_* columns — this depends on the current column order.
input_cols_va = dfhot.columns[4:-1]

In [29]:
# Assemble the selected columns into a single `features` vector column.
# handleInvalid='skip' drops rows containing invalid values (presumably the
# rows whose lead_* values are null at the end of a window partition — confirm).
va = VectorAssembler(outputCol="features", inputCols=input_cols_va, handleInvalid='skip')
lpoints = va.transform(dfhot)
lpoints.show(n)

+-------------+----------+---------------+------+-----------+---------+-----------+-----------+-----------+-----------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+-------------+--------------------+
|activity_code|subject_id|      timestamp|eating|    accel_x|  accel_y|    accel_z|     gyro_x|     gyro_y|     gyro_z|lead_1_accel_x|lead_1_accel_y|lead_1_accel_z|lead_1_gyro_x|lead_1_gyro_y|lead_1_gyro_z|lead_2_accel_x|lead_2_accel_y|lead_2_accel_z|lead_2_gyro_x|lead_2_gyro_y|lead_2_gyro_z|lead_3_accel_x|lead_3_accel_y|lead_3_accel_z|lead_3_gyro_x|lead_3_gyro_y|lead_3_gyro_z|       device|            features|
+-------------+----------+---------------+------+-----------+---------+-----------+-----------+-----------+-----------+--------------+--------------+--------------+----

In [30]:
def standard_scaler(input_df):
    """Standardize the ``features`` vector column of ``input_df``.

    A StandardScaler with ``withMean=True`` and ``withStd=True`` is fitted on
    ``input_df`` and applied to it, so each feature is centered and scaled to
    unit standard deviation. The scaled vector replaces the original
    ``features`` column under the same name.

    Args:
        input_df: DataFrame with a ``features`` vector column.

    Returns:
        DataFrame in which ``features`` holds the standardized vectors.
    """
    fitted_scaler = StandardScaler(
        inputCol="features",
        outputCol="features_scaled",
        withMean=True,
        withStd=True,
    ).fit(input_df)

    scaled = fitted_scaler.transform(input_df)
    # Swap the raw vector for the scaled one while keeping the column name.
    return scaled.drop("features").withColumnRenamed("features_scaled", "features")


# Standardize the assembled features and cache the result for reuse.
# NOTE(review): the scaler is fitted on the full frame, before the
# train/validation split made in a later cell, so validation rows contribute
# to the fitted mean/std — confirm this is acceptable.
lpoints_scaled = standard_scaler(lpoints).cache()

In [31]:
lpoints_scaled.show(n)

+-------------+----------+---------------+------+-----------+---------+-----------+-----------+-----------+-----------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+--------------+--------------+--------------+-------------+-------------+-------------+-------------+--------------------+
|activity_code|subject_id|      timestamp|eating|    accel_x|  accel_y|    accel_z|     gyro_x|     gyro_y|     gyro_z|lead_1_accel_x|lead_1_accel_y|lead_1_accel_z|lead_1_gyro_x|lead_1_gyro_y|lead_1_gyro_z|lead_2_accel_x|lead_2_accel_y|lead_2_accel_z|lead_2_gyro_x|lead_2_gyro_y|lead_2_gyro_z|lead_3_accel_x|lead_3_accel_y|lead_3_accel_z|lead_3_gyro_x|lead_3_gyro_y|lead_3_gyro_z|       device|            features|
+-------------+----------+---------------+------+-----------+---------+-----------+-----------+-----------+-----------+--------------+--------------+--------------+----

In [32]:
# Sort keys for the cleaned frame.
cols_orderBy_7 = ['subject_id', 'activity_code', 'device', 'timestamp']
# Columns kept for modelling: the target, the one-hot device vector and the scaled features.
cols_select_7 = ['eating', 'device', 'features']


In [33]:
# Keep only the modelling columns. The sort keys include columns that are not
# in the selection (subject_id, activity_code, timestamp); they come from
# lpoints_scaled.
lpoints_scaled_clean = lpoints_scaled.select(cols_select_7).orderBy(cols_orderBy_7)
lpoints_scaled_clean.show(n)

+------+-------------+--------------------+
|eating|       device|            features|
+------+-------------+--------------------+
|     0|(2,[0],[1.0])|[0.69546612359028...|
|     0|(2,[0],[1.0])|[0.42894076955912...|
|     0|(2,[0],[1.0])|[0.35878297556336...|
|     0|(2,[0],[1.0])|[0.50912109499279...|
|     0|(2,[0],[1.0])|[0.64720949074113...|
+------+-------------+--------------------+
only showing top 5 rows



# 8. Assemble features and device columns

In [34]:
lpoints_scaled_clean.show(5)

+------+-------------+--------------------+
|eating|       device|            features|
+------+-------------+--------------------+
|     0|(2,[0],[1.0])|[0.69546612359028...|
|     0|(2,[0],[1.0])|[0.42894076955912...|
|     0|(2,[0],[1.0])|[0.35878297556336...|
|     0|(2,[0],[1.0])|[0.50912109499279...|
|     0|(2,[0],[1.0])|[0.64720949074113...|
+------+-------------+--------------------+
only showing top 5 rows



In [46]:
# Inputs of the final feature vector: the scaled features followed by the one-hot device vector.
input_cols_va_w_device = ['features', 'device']
# Sort keys for the final modelling frame.
cols_orderBy_8 = ['device', 'features_w_device']


In [47]:
# Append the one-hot device vector to the scaled features.
va_w_device = VectorAssembler(
    outputCol="features_w_device",
    inputCols=input_cols_va_w_device,
    handleInvalid='skip',
)
lpoints_w_device = va_w_device.transform(lpoints_scaled_clean)

# Final modelling frame: sorted, reduced to two columns, and renamed to
# `features` / `label`.
lpoints_w_device_ordered = (
    lpoints_w_device
    .orderBy(cols_orderBy_8)
    .select('features_w_device', 'eating')
    .withColumnRenamed("eating", "label")
    .withColumnRenamed("features_w_device", "features")
)

In [48]:
lpoints_w_device_ordered.show(n)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[-8.9744994255888...|    0|
|[-8.0677067776661...|    0|
|[-7.7097425602733...|    0|
|[-7.4654059199349...|    0|
|[-7.3354397099700...|    0|
+--------------------+-----+
only showing top 5 rows



# 9. Divide dataset into training and validation set

In [52]:
# Divide the dataset into training (80%) and validation (20%) sets; the seed
# makes the split repeatable.
splits = lpoints_w_device_ordered.randomSplit([0.8, 0.2], seed=1)

# cache(): the algorithm is iterative, so the training and validation sets are
# going to be reused many times.
df_train = splits[0].cache()
df_valid = splits[1].cache()

In [53]:
df_train.show(n)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[-3.1091863058572...|    0|
|[-3.0893114309134...|    0|
|[-3.0384874887138...|    0|
|[-3.0241532585882...|    0|
|[-3.0091802692393...|    0|
+--------------------+-----+
only showing top 5 rows



In [54]:
df_valid.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[-3.0236777378331...|    0|
|[-2.6475242601134...|    0|
|[-2.6327949455794...|    0|
|[-2.6230195636882...|    0|
|[-2.5604235255809...|    0|
+--------------------+-----+
only showing top 5 rows



# 10. Apply logistic regression

In [55]:
# Logistic regression estimator (with an intercept term) and the evaluator
# used to score candidate models during cross-validation.
lr = LogisticRegression(fitIntercept=True)
bceval = BinaryClassificationEvaluator()

In [56]:
# Hyper-parameter grid: every combination of the maxIter and regParam candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, max_iter)
    .addGrid(lr.regParam, reg_params)
    .build()
)

# n_fold-fold cross-validation of the logistic regression over the grid.
# The seed is set explicitly (same value as the randomSplit seed) so the fold
# assignment — and therefore the selected model — does not change between
# kernel restarts.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=bceval,
    numFolds=n_fold,
    seed=1,
)
cvmodel = cv.fit(df_train)

In [57]:
# Report the selected model: coefficients, intercept, and the maxIter /
# regParam values it was trained with, each followed by a blank line.
best_lr = cvmodel.bestModel
for reported_value in (
    best_lr.coefficients,
    best_lr.intercept,
    best_lr.getMaxIter(),
    best_lr.getRegParam(),
):
    print(reported_value, end='\n\n')

[-0.1420104292974205,-0.028754637780759415,0.03205202527762005,0.016504085688310938,-0.02658310800532583,-0.01168053162195633,-0.04933311128935741,-0.01945816686196624,0.009915791784279102,0.0024073506195827913,-0.0016158406727496992,-0.0021049854223065398,-0.05218408383305372,-0.019138349361056406,0.012156846026440776,0.0047212330188986075,-0.00533516513410685,-0.003013190235412855,-0.13697195909181054,-0.030898560285575177,0.03792992303560347,0.002467125150825503,0.017233916734098045,-0.023384387467520428,-0.0623642538810752,0.033567559727533704]
-1.268723597413594
100
0.001


In [58]:
# Score the best cross-validated model on the held-out validation set, using a
# fresh evaluator with its default settings.
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(df_valid))

0.6112212064143737

In [None]:
# Shut down the SparkContext now that the analysis is finished.
sc.stop()