<a href="https://colab.research.google.com/github/ChetanKnowIt/pyspark-colab/blob/main/spark_5_ML_DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=908ebfe16034d9e0f0ab2cf52f5dd2c6c2dc3cbdc2754140aaa024e8ca93a287
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier  
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Create the Spark session for this notebook (or reuse one that already exists),
# using the "local" master and the application name "dl".
spark = (
    SparkSession.builder
    .master("local")
    .appName("dl")
    .getOrCreate()
)

In [None]:
# Keep a handle on the session's SparkContext.
# None of the cells visible in this notebook refer to `sc` afterwards.
sc = spark.sparkContext

In [None]:
# Load the banknote-authentication CSV: the first row is used as the header
# and Spark is asked to infer the column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data_banknote_authentication.csv")
)

In [None]:
# Preview the first three rows of the loaded data.
df.show(3)

+---------+---------+---------+---------+------+
|feature_1|feature_2|feature_3|feature_4|Class |
+---------+---------+---------+---------+------+
|   3.6216|   8.6661|  -2.8073| -0.44699|     0|
|   4.5459|   8.1674|  -2.4586|  -1.4621|     0|
|    3.866|  -2.6383|   1.9242|  0.10645|     0|
+---------+---------+---------+---------+------+
only showing top 3 rows



In [None]:
# Inspect the inferred column types. In the recorded output feature_1..feature_3
# are reported as string; the next cell casts every column to double.
df.printSchema()

root
 |-- feature_1: string (nullable = true)
 |-- feature_2: string (nullable = true)
 |-- feature_3: string (nullable = true)
 |-- feature_4: double (nullable = true)
 |-- Class : integer (nullable = true)



In [None]:
# Cast every column to double in one projection. The recorded schema shows
# feature_1..feature_3 as string and `Class ` as integer, so all of them are converted.
# A single select() replaces the per-column withColumn() loop: each withColumn() call
# adds another projection to the plan, and the loop also left a variable named `col`
# behind in the notebook namespace.
df = df.select([df[name].cast('double').alias(name) for name in df.columns])

In [None]:
# Confirm the cast: every column should now be reported as double.
df.printSchema()

root
 |-- feature_1: double (nullable = true)
 |-- feature_2: double (nullable = true)
 |-- feature_3: double (nullable = true)
 |-- feature_4: double (nullable = true)
 |-- Class : double (nullable = true)



In [None]:
# Input (feature) columns: every column except the last one,
# which is the class label in this dataset.
input_cols = df.columns[:-1]

In [None]:
# Selecting the feature columns yields a new DataFrame; evaluated bare like this,
# the cell shows only its column names and types, not the rows.
df.select(input_cols)

DataFrame[feature_1: double, feature_2: double, feature_3: double, feature_4: double]

In [None]:
# Preview the first three rows of just the feature columns.
df.select(input_cols).show(3)

+---------+---------+---------+---------+
|feature_1|feature_2|feature_3|feature_4|
+---------+---------+---------+---------+
|   3.6216|   8.6661|  -2.8073| -0.44699|
|   4.5459|   8.1674|  -2.4586|  -1.4621|
|    3.866|  -2.6383|   1.9242|  0.10645|
+---------+---------+---------+---------+
only showing top 3 rows



In [None]:
from pyspark.ml.feature import Imputer, MinMaxScaler

# Names for the imputed copies of the feature columns: f_1, f_2, ...
# The count is taken from input_cols instead of a hard-coded 4, so the list always
# has one output name per input column (the Imputer below pairs them up).
imputed_col = ['f_{}'.format(position) for position in range(1, len(input_cols) + 1)]

In [None]:
# Show the generated output column names.
imputed_col

['f_1', 'f_2', 'f_3', 'f_4']

In [None]:
# Fill missing feature values with the column mean, writing the results to the
# f_* columns named in imputed_col.
# The original call also passed missingValue=None. pyspark ignores parameters given
# as None, so that argument had no effect (the default placeholder stayed in force)
# and only suggested a configuration that was never applied; it is dropped here.
# NOTE(review): the imputer is fitted on the whole of df, i.e. before the train/test
# split made further down, so rows that end up in the test set contribute to the means.
imputer = Imputer(
    strategy='mean',
    inputCols=input_cols,
    outputCols=imputed_col,
)
model = imputer.fit(df)

In [None]:
# Apply the fitted imputer; the result keeps the original columns and adds the f_* columns.
impute_data = model.transform(df)

In [None]:
# Display the imputed data. In the recorded output the null in feature_1 is filled in f_1.
impute_data.show()

+---------+---------+---------+---------+------+------------------+------------------+--------+--------+
|feature_1|feature_2|feature_3|feature_4|Class |               f_1|               f_2|     f_3|     f_4|
+---------+---------+---------+---------+------+------------------+------------------+--------+--------+
|   3.6216|   8.6661|  -2.8073| -0.44699|   0.0|            3.6216|            8.6661| -2.8073|-0.44699|
|   4.5459|   8.1674|  -2.4586|  -1.4621|   0.0|            4.5459|            8.1674| -2.4586| -1.4621|
|    3.866|  -2.6383|   1.9242|  0.10645|   0.0|             3.866|           -2.6383|  1.9242| 0.10645|
|   3.4566|   9.5228|  -4.0112|  -3.5944|   0.0|            3.4566|            9.5228| -4.0112| -3.5944|
|  0.32924|  -4.4552|   4.5718|  -0.9888|   0.0|           0.32924|           -4.4552|  4.5718| -0.9888|
|     null|   9.6718|  -3.9606|  -3.1625|   0.0|0.4308653338439095|            9.6718| -3.9606| -3.1625|
|   3.5912|   3.0129|  0.72888|  0.56421|   0.0|       

In [None]:
# Combine the imputed feature columns into a single vector column.
assemble = VectorAssembler(
    inputCols=imputed_col,
    outputCol='assembled_features',
)

In [None]:
# Add the assembled_features vector column to the imputed data.
a_data = assemble.transform(impute_data)

In [None]:
# Preview the first ten rows, including the new vector column.
a_data.show(10)

+---------+---------+---------+---------+------+------------------+-------+--------+--------+--------------------+
|feature_1|feature_2|feature_3|feature_4|Class |               f_1|    f_2|     f_3|     f_4|  assembled_features|
+---------+---------+---------+---------+------+------------------+-------+--------+--------+--------------------+
|   3.6216|   8.6661|  -2.8073| -0.44699|   0.0|            3.6216| 8.6661| -2.8073|-0.44699|[3.6216,8.6661,-2...|
|   4.5459|   8.1674|  -2.4586|  -1.4621|   0.0|            4.5459| 8.1674| -2.4586| -1.4621|[4.5459,8.1674,-2...|
|    3.866|  -2.6383|   1.9242|  0.10645|   0.0|             3.866|-2.6383|  1.9242| 0.10645|[3.866,-2.6383,1....|
|   3.4566|   9.5228|  -4.0112|  -3.5944|   0.0|            3.4566| 9.5228| -4.0112| -3.5944|[3.4566,9.5228,-4...|
|  0.32924|  -4.4552|   4.5718|  -0.9888|   0.0|           0.32924|-4.4552|  4.5718| -0.9888|[0.32924,-4.4552,...|
|     null|   9.6718|  -3.9606|  -3.1625|   0.0|0.4308653338439095| 9.6718| -3.9

In [None]:
# Rescale the assembled feature vector into the [0, 1] range;
# the scaled vector is written to the 'features' column.
scaler = MinMaxScaler(
    inputCol='assembled_features',
    outputCol='features',
    min=0.0,
    max=1.0,
)

In [None]:
# Fit the scaler on a_data, then apply it to that same data.
# NOTE(review): the scaler is fitted before the train/test split made further down,
# so rows that end up in the test set influence the scaling ranges.
scaler_model = scaler.fit(a_data)
s_data = scaler_model.transform(a_data)

In [None]:
# Preview the first ten rows, now with the scaled 'features' column.
s_data.show(10)

+---------+---------+---------+---------+------+------------------+-------+--------+--------+--------------------+--------------------+
|feature_1|feature_2|feature_3|feature_4|Class |               f_1|    f_2|     f_3|     f_4|  assembled_features|            features|
+---------+---------+---------+---------+------+------------------+-------+--------+--------+--------------------+--------------------+
|   3.6216|   8.6661|  -2.8073| -0.44699|   0.0|            3.6216| 8.6661| -2.8073|-0.44699|[3.6216,8.6661,-2...|[0.76900388695382...|
|   4.5459|   8.1674|  -2.4586|  -1.4621|   0.0|            4.5459| 8.1674| -2.4586| -1.4621|[4.5459,8.1674,-2...|[0.83565901535310...|
|    3.866|  -2.6383|   1.9242|  0.10645|   0.0|             3.866|-2.6383|  1.9242| 0.10645|[3.866,-2.6383,1....|[0.78662859038429...|
|   3.4566|   9.5228|  -4.0112|  -3.5944|   0.0|            3.4566| 9.5228| -4.0112| -3.5944|[3.4566,9.5228,-4...|[0.75710504871312...|
|  0.32924|  -4.4552|   4.5718|  -0.9888|   0.0|

In [None]:
# Rename the target column to 'label'. The source header carries a trailing space
# ('Class '), so the old name has to include it.
s_data = s_data.withColumnRenamed('Class ', 'label')

In [None]:
# Show the column names and types after the rename.
s_data

DataFrame[feature_1: double, feature_2: double, feature_3: double, feature_4: double, label: double, f_1: double, f_2: double, f_3: double, f_4: double, assembled_features: vector, features: vector]

In [None]:
# Keep only the label and the scaled feature vector, then split 75/25 into
# training and test sets. A fixed seed is passed to randomSplit.
model_input = s_data.select('label', 'features')
train_df, test_df = model_input.randomSplit([0.75, 0.25], seed=0)

In [None]:
# Actual number of rows in the training and test splits.
train_df.count(),test_df.count()

(1013, 359)

In [None]:
# Row counts an exact 75/25 split would give, for comparison with the actual split sizes.
# The total is counted once and reused; the original expression ran s_data.count()
# twice, repeating the same count over the data.
total_rows = s_data.count()
total_rows * 0.75, total_rows * 0.25

(1029.0, 343.0)

In [None]:
# Layer sizes from input to output: 4 inputs (one per feature column),
# one hidden layer of 32 units, and 2 outputs for the two class labels.
layer_sizes = [4, 32, 2]

# Multilayer perceptron trained on the 'features' vector against 'label',
# using the 'gd' solver for up to 500 iterations with a fixed seed.
mlpc = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='label',
    layers=layer_sizes,
    maxIter=500,
    blockSize=8,
    solver='gd',
    seed=0,
)

In [None]:
# Train the multilayer perceptron on the training split.
ann = mlpc.fit(train_df)

In [None]:
# Run the trained model over the held-out test split; pred is evaluated below.
pred = ann.transform(test_df)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Evaluator that scores predictions against the 'label' column using accuracy.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the
# kernel session; it is left unchanged here because the following cell refers to it.
eval = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy')

In [None]:
# Score the test-set predictions and report the accuracy as a percentage.
accuracy_pct = eval.evaluate(pred) * 100
print('Accuracy: ', accuracy_pct)

Accuracy:  83.56545961002786
