In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,DoubleType

In [2]:
spark = SparkSession.builder.appName("PARKINSON").getOrCreate()

In [3]:
schema1 = StructType([
    StructField("MDVP:Fo(Hz)", DoubleType(), True),
    StructField("MDVP:Fhi(Hz)", DoubleType(), True),
    StructField("MDVP:Flo(Hz)", DoubleType(), True),
    StructField("MDVP:Jitter(%)", DoubleType(), True),
    StructField("MDVP:Jitter(Abs)", DoubleType(), True),
    StructField("MDVP:RAP", DoubleType(), True),
    StructField("MDVP:PPQ", DoubleType(), True),
    StructField("Jitter:DDP", DoubleType(), True),
    StructField("MDVP:Shimmer", DoubleType(), True),
    StructField("MDVP:Shimmer(dB)", DoubleType(), True),
    StructField("Shimmer:APQ3", DoubleType(), True),
    StructField("Shimmer:APQ5", DoubleType(), True),
    StructField("Shimmer:DDA", DoubleType(), True),
    StructField("NHR", DoubleType(), True),
    StructField("HNR", DoubleType(), True),
    StructField("RPDE", DoubleType(), True),
    StructField("DFA", DoubleType(), True),
    StructField("spread1", DoubleType(), True),
    StructField("spread2", DoubleType(), True),
    StructField("D2", DoubleType(), True),
    StructField("PPE", DoubleType(), True)

])

In [4]:
schema2 = StructType([
    StructField("MDVP:Fo(Hz)", DoubleType(), True),
    StructField("MDVP:Fhi(Hz)", DoubleType(), True),
    StructField("MDVP:Flo(Hz)", DoubleType(), True),
    StructField("MDVP:Jitter(%)", DoubleType(), True),
    StructField("MDVP:Jitter(Abs)", DoubleType(), True),
    StructField("MDVP:RAP", DoubleType(), True),
    StructField("MDVP:PPQ", DoubleType(), True),
    StructField("Jitter:DDP", DoubleType(), True),
    StructField("MDVP:Shimmer", DoubleType(), True),
    StructField("MDVP:Shimmer(dB)", DoubleType(), True),
    StructField("Shimmer:APQ3", DoubleType(), True),
    StructField("Shimmer:APQ5", DoubleType(), True),
    StructField("Shimmer:DDA", DoubleType(), True),
    StructField("NHR", DoubleType(), True),
    StructField("HNR", DoubleType(), True),
    StructField("status", IntegerType(), True),
    StructField("RPDE", DoubleType(), True),
    StructField("DFA", DoubleType(), True),
    StructField("spread1", DoubleType(), True),
    StructField("spread2", DoubleType(), True),
    StructField("D2", DoubleType(), True),
    StructField("PPE", DoubleType(), True)

])

In [9]:
train_df = spark.read.csv('train.csv', header=True)
test_df = spark.read.csv('test.csv', header=True)

In [13]:
test_df

DataFrame[status: string]

In [11]:
train_df.limit(5).toPandas()

Unnamed: 0,MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.0349,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [12]:
test_df.limit(5).toPandas()

Unnamed: 0,status
0,1
1,1
2,1
3,1
4,1


In [24]:
test_df.printSchema()

root
 |-- MDVP:Fo(Hz): double (nullable = true)
 |-- MDVP:Fhi(Hz): double (nullable = true)
 |-- MDVP:Flo(Hz): double (nullable = true)
 |-- MDVP:Jitter(%): double (nullable = true)
 |-- MDVP:Jitter(Abs): double (nullable = true)
 |-- MDVP:RAP: double (nullable = true)
 |-- MDVP:PPQ: double (nullable = true)
 |-- Jitter:DDP: double (nullable = true)
 |-- MDVP:Shimmer: double (nullable = true)
 |-- MDVP:Shimmer(dB): double (nullable = true)
 |-- Shimmer:APQ3: double (nullable = true)
 |-- Shimmer:APQ5: double (nullable = true)
 |-- Shimmer:DDA: double (nullable = true)
 |-- NHR: double (nullable = true)
 |-- HNR: double (nullable = true)
 |-- RPDE: double (nullable = true)
 |-- DFA: double (nullable = true)
 |-- spread1: double (nullable = true)
 |-- spread2: double (nullable = true)
 |-- D2: double (nullable = true)
 |-- PPE: double (nullable = true)



In [25]:
#Unimos las variables numéricas en una sola columna
continuous_variables = ['MDVP:Fo(Hz)','MDVP:Fhi(Hz)','MDVP:Flo(Hz)','MDVP:Jitter(%)','MDVP:Jitter(Abs)','MDVP:RAP','MDVP:PPQ','Jitter:DDP','MDVP:Shimmer','MDVP:Shimmer(dB)','Shimmer:APQ3','Shimmer:APQ5','Shimmer:DDA','NHR','HNR','RPDE','DFA','spread1','spread2','D2','PPE']
assembler = VectorAssembler(
    inputCols=continuous_variables,
    outputCol='features'
)
train_df = assembler.transform(train_df)
test_df = assembler.transform(test_df)

In [26]:
#Finalmente, creamos la columna Label como status
indexer = StringIndexer(inputCol='status', outputCol='label')
train_df = indexer.fit(train_df).transform(train_df)
test_df = indexer.fit(test_df).transform(test_df)
train_df.limit(10).toPandas()['label']

IllegalArgumentException: 'Field "status" does not exist.\nAvailable fields: MDVP:Fo(Hz), MDVP:Fhi(Hz), MDVP:Flo(Hz), MDVP:Jitter(%), MDVP:Jitter(Abs), MDVP:RAP, MDVP:PPQ, Jitter:DDP, MDVP:Shimmer, MDVP:Shimmer(dB), Shimmer:APQ3, Shimmer:APQ5, Shimmer:DDA, NHR, HNR, RPDE, DFA, spread1, spread2, D2, PPE, features'