<a href="https://colab.research.google.com/github/Sayandeep27/Pyspark/blob/main/pyspark_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=e153c8ed66af103db906a15ff7d074ec34aa29418e6d443dc960656cf848f4bd
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession


In [None]:
# Reuse the active Spark session if one exists; otherwise start a new one
# under the application name "IrisClassifier".
spark = (
    SparkSession.builder
    .appName("IrisClassifier")
    .getOrCreate()
)


In [None]:
# Column names for the DataFrame; the last column holds the species name.
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]

# Five hand-entered samples per species. Each tuple lists the four measurements
# in the same order as the first four entries of `columns`.
samples_by_species = {
    "setosa": [
        (5.1, 3.5, 1.4, 0.2),
        (4.9, 3.0, 1.4, 0.2),
        (4.7, 3.2, 1.3, 0.2),
        (4.6, 3.1, 1.5, 0.2),
        (5.0, 3.6, 1.4, 0.2),
    ],
    "versicolor": [
        (7.0, 3.2, 4.7, 1.4),
        (6.4, 3.2, 4.5, 1.5),
        (6.9, 3.1, 4.9, 1.5),
        (5.5, 2.3, 4.0, 1.3),
        (6.5, 2.8, 4.6, 1.5),
    ],
    "virginica": [
        (6.3, 3.3, 6.0, 2.5),
        (5.8, 2.7, 5.1, 1.9),
        (7.1, 3.0, 5.9, 2.1),
        (6.3, 2.9, 5.6, 1.8),
        (6.5, 3.0, 5.8, 2.2),
    ],
}

# Flatten to one tuple per sample with the species name appended as the last
# field. Dict insertion order keeps the rows grouped setosa, versicolor, virginica.
data = [
    (*measurements, species)
    for species, rows in samples_by_species.items()
    for measurements in rows
]

df = spark.createDataFrame(data, schema=columns)


In [None]:
# Prepare the model inputs: a numeric "label" column and a single "features"
# vector column.
# NOTE: this cell rebinds `df` to the prepared frame, which no longer has the raw
# columns. Re-run the cell that builds `df` before executing this cell again.

# Encode each species string as a numeric class index.
indexer = StringIndexer(inputCol="species", outputCol="label")
indexed_df = indexer.fit(df).transform(df)

# Pack the four measurement columns into one vector column.
assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features",
)

# Keep only the two columns the classifier reads.
df = assembler.transform(indexed_df).select("features", "label")
df.show()


+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[7.0,3.2,4.7,1.4]|  1.0|
|[6.4,3.2,4.5,1.5]|  1.0|
|[6.9,3.1,4.9,1.5]|  1.0|
|[5.5,2.3,4.0,1.3]|  1.0|
|[6.5,2.8,4.6,1.5]|  1.0|
|[6.3,3.3,6.0,2.5]|  2.0|
|[5.8,2.7,5.1,1.9]|  2.0|
|[7.1,3.0,5.9,2.1]|  2.0|
|[6.3,2.9,5.6,1.8]|  2.0|
|[6.5,3.0,5.8,2.2]|  2.0|
+-----------------+-----+



In [None]:
# Hold out roughly 20% of the rows for evaluation. The weights are sampling
# probabilities, not exact proportions. A fixed seed makes the split repeatable
# across runs instead of reshuffling on every execution.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)


In [None]:
# Fit a decision tree on the training split, reading the assembled vector from
# "features" and the indexed class from "label".
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
)
model = dt.fit(dataset=train_df)


In [None]:
# Score the held-out rows. The result keeps "features" and "label" and adds the
# rawPrediction, probability and prediction columns.
predictions = model.transform(dataset=test_df)
predictions.show()


+-----------------+-----+-------------+-------------+----------+
|         features|label|rawPrediction|  probability|prediction|
+-----------------+-----+-------------+-------------+----------+
|[4.7,3.2,1.3,0.2]|  0.0|[2.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.9,3.0,1.4,0.2]|  0.0|[2.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[5.1,3.5,1.4,0.2]|  0.0|[2.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[6.9,3.1,4.9,1.5]|  1.0|[0.0,4.0,0.0]|[0.0,1.0,0.0]|       1.0|
+-----------------+-----+-------------+-------------+----------+



In [None]:
# Compare the "prediction" column against "label" on the scored test rows and
# report the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(dataset=predictions)
print(f"Accuracy: {accuracy}")


Accuracy: 1.0
