In [47]:
from pyspark.sql import SparkSession
import numpy as np
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName('LR').getOrCreate()

In [7]:
data = spark.read.format('libsvm').load('/home/sai/ex/ML/Log/sample_libsvm_data.txt')

In [8]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [14]:
#Split data
tr_data, ts_data = data.randomSplit([0.8,0.2])

In [15]:
cls = LogisticRegression()

In [16]:
model = cls.fit(tr_data)

In [17]:
summary_ = model.summary

In [18]:
summary_.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [19]:
summary_.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[21.5749925205941...|[0.99999999957332...|       0.0|
|  0.0|(692,[98,99,100,1...|[29.8250065929515...|[0.99999999999988...|       0.0|
|  0.0|(692,[100,101,102...|[19.1943368105082...|[0.99999999538676...|       0.0|
|  0.0|(692,[121,122,123...|[22.5776751718222...|[0.99999999984345...|       0.0|
|  0.0|(692,[122,123,124...|[19.7073348559742...|[0.99999999723806...|       0.0|
|  0.0|(692,[122,123,148...|[20.5397585586786...|[0.99999999879857...|       0.0|
|  0.0|(692,[123,124,125...|[26.0194291472572...|[0.99999999999498...|       0.0|
|  0.0|(692,[123,124,125...|[30.1981782262855...|[0.99999999999992...|       0.0|
|  0.0|(692,[124,125,126...|[24.4077147799599...|[0.99999999997488...|       0.0|
|  0.0|(692,[124

In [26]:
pred = model.evaluate(ts_data)

In [28]:
pred.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[123,124,125...|[22.5797751085149...|[0.99999999984378...|       0.0|
|  0.0|(692,[124,125,126...|[30.6159567744561...|[0.99999999999994...|       0.0|
|  0.0|(692,[124,125,126...|[25.1933770889735...|[0.99999999998855...|       0.0|
|  0.0|(692,[126,127,128...|[15.4626736332726...|[0.99999980740452...|       0.0|
|  0.0|(692,[127,128,129...|[14.7087661115097...|[0.99999959067915...|       0.0|
|  0.0|(692,[128,129,130...|[13.6517897293408...|[0.99999882211595...|       0.0|
|  0.0|(692,[129,130,131...|[12.6739728325224...|[0.99999686842963...|       0.0|
|  0.0|(692,[150,151,152...|[27.1519711378645...|[0.99999999999838...|       0.0|
|  0.0|(692,[153,154,155...|[27.7802704814869...|[0.99999999999913...|       0.0|
|  1.0|(692,[99,

In [23]:
pred.areaUnderROC

1.0

In [30]:
eval_m = BinaryClassificationEvaluator()

In [31]:
roc = eval_m.evaluate(pred.predictions)

In [32]:
roc

1.0

# Titanic dataset

In [35]:
data = spark.read.csv('/home/sai/ex/ML/Log/titanic.csv', inferSchema=True, header=True)

In [36]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [37]:
#select columns
cols = data.select(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])

In [38]:
df_na = cols.na.drop()

In [40]:
gender_ind = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVector')

In [41]:
embark_ind = StringIndexer(inputCol='Embarked', outputCol='EmbIndex')
embark_encoder = OneHotEncoder(inputCol='EmbIndex', outputCol='EmbVector')

In [42]:
assemble = VectorAssembler(inputCols= ['Pclass', 'SexVector','EmbVector', 'Age', 'SibSp', 'Parch', 'Fare'], outputCol='features')

In [45]:
#pipeline
log_reg = LogisticRegression(featuresCol='features', labelCol='Survived')

In [48]:
pipeline = Pipeline(stages=[gender_ind, embark_ind,
                            gender_encoder, embark_encoder,
                            assemble, log_reg])

In [50]:
model = pipeline.fit(tr_data)

In [51]:
pred = model.transform(ts_data)

In [56]:
eval_m = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')

In [57]:
roc = eval_m.evaluate(pred)

In [58]:
roc

0.7710253456221198