In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 52.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=25787b27df3de1cfdf7541020be7d991d1652b1a44e576dc510bb0dc6dae81cb
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [24]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('random').getOrCreate()

In [25]:
df = spark.read.csv('/content/dog_food.csv', inferSchema=True, header=True)

In [28]:
df.printSchema()
df.show(10)

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
+---+---+----+---+-------+
only showing top 10 rows



In [60]:
x = df.select(['D','Spoiled'])
y = x.filter(x['Spoiled'] == 1)
y.count()

140

In [52]:
print(f"Total rows: {df.count()}")
print("------------------------------")
print(f" Spoiled with substance A: {df.select(['A','Spoiled']).filter(df['Spoiled'] == 1).count()}")
print(f" Spoiled with substance B: {df.select(['B','Spoiled']).filter(df['Spoiled'] == 1).count()}")
print(f" Spoiled with substance C: {df.select(['C','Spoiled']).filter(df['Spoiled'] == 1).count()}")
print(f" Spoiled with substance D: {df.select(['D','Spoiled']).filter(df['Spoiled'] == 1).count()}")

Total rows: 490
------------------------------
 Spoiled with substance A: 140
 Spoiled with substance B: 140
 Spoiled with substance C: 140
 Spoiled with substance D: 140


In [29]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['A','B','C','D'], outputCol='features')
output = assembler.transform(df)
final_data = output.select(['features','Spoiled'])
final_data.show(5)

+------------------+-------+
|          features|Spoiled|
+------------------+-------+
|[4.0,2.0,12.0,3.0]|    1.0|
|[5.0,6.0,12.0,7.0]|    1.0|
|[6.0,2.0,13.0,6.0]|    1.0|
|[4.0,2.0,12.0,1.0]|    1.0|
|[4.0,2.0,12.0,3.0]|    1.0|
+------------------+-------+
only showing top 5 rows



In [30]:
train, test = final_data.randomSplit([0.7,0.3])

In [33]:
from pyspark.ml.classification import RandomForestClassifier

rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')
rfc_model = rfc.fit(train)
rfc_preds = rfc_model.transform(test)

In [72]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

bin_eval = BinaryClassificationEvaluator(labelCol='Spoiled')
acc_eval = MulticlassClassificationEvaluator(labelCol='Spoiled', metricName='accuracy')
print(f"RFC: {bin_eval.evaluate(rfc_preds)}\n")
print(f"RFC accuracy: {acc_eval.evaluate(rfc_preds)}\n")
print(f"Feature Importance: A: {round(rfc_model.featureImportances [0],2)*100}% B: {round(rfc_model.featureImportances [1])*100}% C: {round(rfc_model.featureImportances [2])*100}% D: {round(rfc_model.featureImportances [3])*100}%")

RFC: 0.9708074534161488

RFC accuracy: 0.9681528662420382

Feature Importance: A: 2.0% B: 0% C: 100% D: 0%


In [37]:
results = rfc_model.transform(test)
results.show()

+-------------------+-------+--------------------+--------------------+----------+
|           features|Spoiled|       rawPrediction|         probability|prediction|
+-------------------+-------+--------------------+--------------------+----------+
| [1.0,1.0,12.0,4.0]|    1.0|[0.85714285714285...|[0.04285714285714...|       1.0|
|  [1.0,4.0,9.0,3.0]|    0.0|[19.8258897987146...|[0.99129448993573...|       0.0|
| [1.0,5.0,8.0,10.0]|    0.0|[17.2287492132528...|[0.86143746066264...|       0.0|
|  [1.0,6.0,8.0,1.0]|    0.0|[19.9633238646578...|[0.99816619323289...|       0.0|
|  [1.0,7.0,7.0,2.0]|    0.0|[19.9468491313528...|[0.99734245656764...|       0.0|
|  [1.0,7.0,8.0,4.0]|    0.0|[19.9378401223438...|[0.99689200611719...|       0.0|
|[1.0,7.0,11.0,10.0]|    1.0|[0.29090909090909...|[0.01454545454545...|       1.0|
|  [1.0,8.0,8.0,6.0]|    0.0|[19.9444626389000...|[0.99722313194500...|       0.0|
| [1.0,8.0,12.0,1.0]|    1.0|          [0.0,20.0]|           [0.0,1.0]|       1.0|
|  [