In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ML project').getOrCreate()

In [3]:
# load data 
df = spark.read.csv('./bank.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [4]:
df.show()

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|
| 56|     admin.| married|secondary|     no|     45|     no|  no|unknown|  5|  may|    1467|       1|   -1|       0| unknown|    yes|
| 41| technician| married|secondary|     no|   1270|    yes|  no|unknown|  5|  may|    1389|       1|   -1|       0| unknown|    yes|
| 55|   services| married|secondary|     no|   2476|    yes|  no|unknown|  5|  may|     579|       1|   -1|       0| unknown|    yes|
| 54|     admin.| married| tertiary|     no|    184|     no|  

In [5]:
# Show Dataset in DataFrame
import pandas as pd
pd.DataFrame(df.take(5), columns=df.columns).transpose()

Unnamed: 0,0,1,2,3,4
age,59,56,41,55,54
job,admin.,admin.,technician,services,admin.
marital,married,married,married,married,married
education,secondary,secondary,secondary,secondary,tertiary
default,no,no,no,no,no
balance,2343,45,1270,2476,184
housing,yes,no,yes,yes,no
loan,no,no,no,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


In [6]:
# Data grouping by class 
class_name = df.columns[len(df.columns)-1]
df.groupby(class_name).count().toPandas()

Unnamed: 0,deposit,count
0,no,5873
1,yes,5289


In [7]:
# convert string to numeric 
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()
indexer.setInputCol(class_name).setOutputCol("label")
df_new = indexer.fit(df).transform(df)


In [8]:

indexer.setInputCol("job").setOutputCol("job_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("marital").setOutputCol("marital_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("education").setOutputCol("education_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("default").setOutputCol("default_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("housing").setOutputCol("housing_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("loan").setOutputCol("loan_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("contact").setOutputCol("contact_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("poutcome").setOutputCol("poutcome_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("deposit").setOutputCol("deposit_new")
df_new = indexer.fit(df_new).transform(df_new)
indexer.setInputCol("month").setOutputCol("month_new")
df_new = indexer.fit(df_new).transform(df_new)

In [9]:
df_new.show()

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----+-------+-----------+-------------+-----------+-----------+--------+-----------+------------+-----------+---------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|label|job_new|marital_new|education_new|default_new|housing_new|loan_new|contact_new|poutcome_new|deposit_new|month_new|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----+-------+-----------+-------------+-----------+-----------+--------+-----------+------------+-----------+---------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|  1.0|    3.0|        0.0|          0.0|        0.0|        1.0|     0.0|        1.0|         0.0|       

In [11]:
# features and class 
class_name = 'label'
feature_names = ['job_new','marital_new','marital_new','default_new','housing_new','loan_new','contact_new','poutcome_new','deposit_new','month_new','balance','day','duration','campaign','pdays','previous']


In [12]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler()
assembler.setInputCols(feature_names).setOutputCol('features')

transformed_data = assembler.transform(df_new)

In [13]:
transformed_data.show()

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----+-------+-----------+-------------+-----------+-----------+--------+-----------+------------+-----------+---------+--------------------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|label|job_new|marital_new|education_new|default_new|housing_new|loan_new|contact_new|poutcome_new|deposit_new|month_new|            features|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----+-------+-----------+-------------+-----------+-----------+--------+-----------+------------+-----------+---------+--------------------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|  1.0|    3.0|        0.0|          0.0|  

In [14]:
# Split the data
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2])

In [15]:
from pyspark.ml.classification import LogisticRegression
model = LogisticRegression(featuresCol = 'features',labelCol=class_name, maxIter=30)

In [16]:
Model = model.fit(training_data)

In [17]:
# Predict with the test dataset
predictions = Model.transform(test_data)

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'label', metricName = 'accuracy')
print('Logistic Regression Accuracy:', multi_evaluator.evaluate(predictions))


Logistic Regression Accuracy: 1.0


In [24]:
predictions.select("label","prediction").show()

+-----+----------+
|label|prediction|
+-----+----------+
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 20 rows



In [25]:
predictions.show()

+---+-------------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+-----+-------+-----------+-------------+-----------+-----------+--------+-----------+------------+-----------+---------+--------------------+--------------------+--------------------+----------+
|age|          job|marital|education|default|balance|housing|loan|  contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|label|job_new|marital_new|education_new|default_new|housing_new|loan_new|contact_new|poutcome_new|deposit_new|month_new|            features|       rawPrediction|         probability|prediction|
+---+-------------+-------+---------+-------+-------+-------+----+---------+---+-----+--------+--------+-----+--------+--------+-------+-----+-------+-----------+-------------+-----------+-----------+--------+-----------+------------+-----------+---------+--------------------+--------------------+--------------------+----------+
| 19|  