In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

In [None]:
# Let findspark set up the pyspark import path before pyspark is imported.
import findspark
findspark.init()
findspark.find()  # return value is not used here
from pyspark.sql import SparkSession

# Create (or reuse) the session: "local" master, app name "Colab",
# Spark UI port set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [None]:
from pyspark.sql.functions import col
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType, StringType, ArrayType
from pyspark.sql.functions import monotonically_increasing_id

In [None]:
# Mount Google Drive into the Colab filesystem so the dataset folder is reachable.
from google.colab import drive
drive.mount('/content/gdrive')
# List the data folder as a quick check that the expected files are present.
!ls "/content/gdrive/My Drive/Bigdata/final-exercises/data"
# Location of the iris CSV; this is the path handed to spark.read later on.
DATA_PATH = "/content/gdrive/My Drive/Bigdata/final-exercises/data/iris.csv"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
followers.txt  foodmart.csv  iris.csv  movies.json  mushrooms.csv  users.txt


In [None]:
# Load the iris CSV: first row is the header, fields are comma-separated,
# and column types are inferred by Spark instead of being declared.
df = (
    spark.read
    .format("csv")
    .options(header=True, delimiter=",", inferSchema=True)
    .load(DATA_PATH)
)
# Preview the first five rows.
df.show(n=5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [None]:
# Split the rows with weights 0.8 / 0.2. The seed is pinned so the same rows
# land in train/test on every run; without it each execution of this cell
# produces a different split.
(train, test) = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import IndexToString,StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

from sklearn.metrics import confusion_matrix, precision_score, recall_score

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import StandardScaler

In [None]:
# The four measurement columns that make up the feature vector.
numericCols = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
]

# Stage 1: pack the numeric columns into a single "features" vector column.
assembler = VectorAssembler().setInputCols(numericCols).setOutputCol("features")

# Stage 2: scale "features" into "standardized". No withMean/withStd options
# are passed, so the StandardScaler defaults apply.
scale = StandardScaler().setInputCol('features').setOutputCol('standardized')

pipeline = Pipeline(stages=[assembler, scale])

# Fit both stages on the full dataset, then apply them to that same dataset.
data_transform = pipeline.fit(df)
data_scale_output = data_transform.transform(df)

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette score for each candidate k, in the order k = 2..9.
silhouette_score = []

# Scores the 'prediction' column against the standardized feature vectors
# using the silhouette metric with squared Euclidean distance.
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='standardized',
                                metricName='silhouette', distanceMeasure='squaredEuclidean')

for i in range(2, 10):
    # The seed is pinned so centroid initialisation, and therefore the score
    # reported for each k, is the same on every run. Without it the numbers
    # used to choose k can change between executions.
    KMeans_algo = KMeans(featuresCol='standardized', k=i, seed=42)

    KMeans_fit = KMeans_algo.fit(data_scale_output)

    # transform() adds the cluster assignment that the evaluator reads.
    output = KMeans_fit.transform(data_scale_output)

    score = evaluator.evaluate(output)
    silhouette_score.append(score)

    print("Silhouette Score k = {}: {}".format(i, score))

Silhouette Score k = 2: 0.7714149126311811
Silhouette Score k = 3: 0.6797395814522242
Silhouette Score k = 4: 0.5879625797757644
Silhouette Score k = 5: 0.5762515303108666
Silhouette Score k = 6: 0.5174337263422971
Silhouette Score k = 7: 0.5722506895660261
Silhouette Score k = 8: 0.5592844328432806
Silhouette Score k = 9: 0.5765294347682386


In [None]:
# Fit the final model with k=2 on the standardized features.
# Cluster ids are arbitrary labels, so the seed is pinned to keep the 0/1
# assignment stable between runs; the later cells filter on
# `prediction == 0` / `prediction == 1` and would otherwise describe
# different clusters from one execution to the next.
KMeans_algo = KMeans(featuresCol='standardized', k=2, seed=42)

KMeans_fit = KMeans_algo.fit(data_scale_output)

# Attach the cluster assignment as the 'prediction' column and preview it.
output = KMeans_fit.transform(data_scale_output)
output.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|        standardized|prediction|
+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|[6.15892840883878...|         1|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|[5.9174018045706,...|         1|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|[5.67587520030241...|         1|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|[5.55511189816831...|         1|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|[6.03816510670469...|         1|
|  6|          5.4|     

In [None]:
# Number of rows of each species that were assigned to cluster 0,
# printed in the order versicolor, setosa, virginica.
for species in ("Iris-versicolor", "Iris-setosa", "Iris-virginica"):
    condition = 'prediction == 0 AND Species == "{}"'.format(species)
    print(output.filter(condition).count())

50
0
50


In [None]:
# Number of rows of each species that were assigned to cluster 1,
# printed in the order versicolor, setosa, virginica.
for species in ("Iris-versicolor", "Iris-setosa", "Iris-virginica"):
    condition = 'prediction == 1 AND Species == "{}"'.format(species)
    print(output.filter(condition).count())

0
50
0
