# Spark - Logistic Regression

**Imports**

In [34]:
import os

import findspark

# Locate the Spark installation through the SPARK_HOME environment variable when it
# is set, so the notebook is not tied to a single machine; the original install path
# is kept as the fallback. This call is made before the pyspark imports below.
findspark.init(os.environ.get('SPARK_HOME', '/home/sedat/spark-3.3.2-bin-hadoop3'))
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

**Create spark session and read data**

In [4]:
# Build (or reuse) the Spark session for this notebook under the application name 'logreg'.
session_builder = SparkSession.builder.appName('logreg')
spark = session_builder.getOrCreate()

In [6]:
# Read the LIBSVM-formatted sample file into a DataFrame.
libsvm_reader = spark.read.format('libsvm')
df = libsvm_reader.load('sample_libsvm_data.txt')

23/03/22 18:24:22 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [7]:
# Preview the first 20 rows; long cell values are truncated in the printed table.
df.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



**Creating logistic regression model**

In [17]:
# Estimator with default settings; the column names passed here are the defaults, written out.
log_model = LogisticRegression(featuresCol='features', labelCol='label')

In [19]:
# Fit the estimator on the full dataset; the fitted model is kept under its own name.
logModel = log_model.fit(dataset=df)

**Model summary**

In [20]:
# Keep a handle on the fitted model's summary; its `predictions` DataFrame is inspected in the following cells.
log_summary = logModel.summary

In [23]:
# Print the column layout of the predictions DataFrame attached to the summary.
summary_predictions = log_summary.predictions
summary_predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [24]:
# Show the first 20 prediction rows from the summary (long values truncated).
log_summary.predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

**Split data (train and test)**

In [25]:
# Hold out roughly 30% of the rows for testing. randomSplit assigns rows at random,
# so a fixed seed is passed to keep the split (and every metric computed from it)
# repeatable when the notebook is re-run from a fresh kernel.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

**Final Model**

In [27]:
# Fresh estimator for the train/test workflow; the default column names are written out.
final_model = LogisticRegression(featuresCol='features', labelCol='label')

In [29]:
# Fit a new estimator directly instead of calling .fit() on `final_model` itself:
# after the first run `final_model` holds the fitted model, which has no .fit(),
# so the self-referential assignment failed whenever this cell was re-run.
final_model = LogisticRegression().fit(train_df)

In [30]:
# Evaluate the fitted model on the held-out rows. Despite the variable name, the result
# is a summary object; its `predictions` DataFrame is what the later cells use.
prediction_and_label = final_model.evaluate(dataset=test_df)

In [32]:
# Show the first 20 test-set prediction rows (long values truncated).
prediction_and_label.predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[19.6193042142768...|[0.99999999698390...|       0.0|
|  0.0|(692,[122,123,124...|[15.8537324988603...|[0.99999986973990...|       0.0|
|  0.0|(692,[123,124,125...|[29.1591611096007...|[0.99999999999978...|       0.0|
|  0.0|(692,[124,125,126...|[20.1052937178151...|[0.99999999814483...|       0.0|
|  0.0|(692,[125,126,127...|[21.4422487291871...|[0.99999999951275...|       0.0|
|  0.0|(692,[126,127,128...|[17.3117052567317...|[0.99999996968748...|       0.0|
|  0.0|(692,[151,152,153...|[22.2137945481152...|[0.99999999977474...|       0.0|
|  0.0|(692,[152,153,154...|[34.7252161409324...|[0.99999999999999...|       0.0|
|  0.0|(692,[153,154,155...|[7.44128003893592...|[0.99941380988983...|       0.0|
|  0.0|(692,[155

**Model evaluation**

In [35]:
# Evaluator with its default settings written out: area under the ROC curve,
# computed from the 'rawPrediction' and 'label' columns.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [38]:
# Score the test-set predictions with the evaluator.
test_predictions = prediction_and_label.predictions
final_eval_roc = my_eval.evaluate(test_predictions)

In [39]:
# Display the metric value returned by the evaluator in the previous cell.
final_eval_roc

1.0

This result is not realistic. In this example our data is perfect, so we get a ROC AUC score of 1.0!