In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [0]:
!wget -q www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz

In [0]:
!pip install -q findspark

In [0]:
!tar -xvf spark-2.4.4-bin-hadoop2.7.tgz

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
import pyspark

In [0]:
from pyspark.sql import functions as F

In [0]:
df = spark.read.csv('iris.csv', inferSchema=True, header=True).withColumnRenamed('sepal.length', 'sepal_length').withColumnRenamed('sepal.width', 'sepal_width').withColumnRenamed('petal.length', 'petal_length').withColumnRenamed('petal.width', 'petal_width')

In [133]:
df.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         4.6|        3.4|         1.4|        0.3| Setosa|
|         5.0|        3.4|         1.5|        0.2| Setosa|
|         4.4|        2.9|         1.4|        0.2| Setosa|
|         4.9|        3.1|         1.5|        0.1| Setosa|
|         5.4|        3.7|         1.5|        0.2| Setosa|
|         4.8|        3.4|         1.6|        0.2| Setosa|
|         4.8|        3.0|         1.4|        0.1| Setosa|
|         4.3|        3.0|         1.1| 

Здесь нужно сделать вектора со всеми параметрами в одном столбце (чтоб вмечто 4 читать 1)

In [0]:
feature_cols = df.columns[:-1]
transformer = pyspark.ml.feature.VectorAssembler(inputCols=feature_cols, outputCol='4in1')
df1 = transformer.transform(df)

In [148]:
df1.show()

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|             4in1|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| Setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| Setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| Setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| Setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| Setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3| Setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2| Setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2| Setosa|[4.4,2.9,1.4,0.2]|
|         4.9|        3.1|         1.5|        0.1| Setosa|[4.9,

Убираем уже ненужные столбццы

In [150]:
df1 = df1.select(['4in1', 'variety'])
df1.show()

+-----------------+-------+
|             4in1|variety|
+-----------------+-------+
|[5.1,3.5,1.4,0.2]| Setosa|
|[4.9,3.0,1.4,0.2]| Setosa|
|[4.7,3.2,1.3,0.2]| Setosa|
|[4.6,3.1,1.5,0.2]| Setosa|
|[5.0,3.6,1.4,0.2]| Setosa|
|[5.4,3.9,1.7,0.4]| Setosa|
|[4.6,3.4,1.4,0.3]| Setosa|
|[5.0,3.4,1.5,0.2]| Setosa|
|[4.4,2.9,1.4,0.2]| Setosa|
|[4.9,3.1,1.5,0.1]| Setosa|
|[5.4,3.7,1.5,0.2]| Setosa|
|[4.8,3.4,1.6,0.2]| Setosa|
|[4.8,3.0,1.4,0.1]| Setosa|
|[4.3,3.0,1.1,0.1]| Setosa|
|[5.8,4.0,1.2,0.2]| Setosa|
|[5.7,4.4,1.5,0.4]| Setosa|
|[5.4,3.9,1.3,0.4]| Setosa|
|[5.1,3.5,1.4,0.3]| Setosa|
|[5.7,3.8,1.7,0.3]| Setosa|
|[5.1,3.8,1.5,0.3]| Setosa|
+-----------------+-------+
only showing top 20 rows



Меняем текст в названии цветка на цифровое значение и оставляет только нужные значения


In [241]:
variety_to_num = pyspark.ml.feature.StringIndexer(inputCol='variety', outputCol='labels').fit(data)
df2 = variety_to_num.transform(df1)
df3 = data.select(['features', 'label'])
df3.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  2.0|
|[4.9,3.0,1.4,0.2]|  2.0|
|[4.7,3.2,1.3,0.2]|  2.0|
|[4.6,3.1,1.5,0.2]|  2.0|
|[5.0,3.6,1.4,0.2]|  2.0|
+-----------------+-----+
only showing top 5 rows



Разбиваем на тестовый и тренировочный и пишем модель

In [0]:
train, test = data.randomSplit([0.8, 0.2])
lr = pyspark.ml.classification.LogisticRegression(maxIter=10)
model = lr.fit(train)

Делаем предсказание

In [243]:
predict = model.transform(test)
predict.show()

+-----------------+----------+-----+--------------------+--------------------+----------+
|         features|   variety|label|       rawPrediction|         probability|prediction|
+-----------------+----------+-----+--------------------+--------------------+----------+
|[4.4,3.2,1.3,0.2]|    Setosa|  2.0|[-14.379588841225...|[4.40618809199734...|       2.0|
|[4.6,3.1,1.5,0.2]|    Setosa|  2.0|[-13.047392729976...|[1.50901368969699...|       2.0|
|[4.6,3.2,1.4,0.2]|    Setosa|  2.0|[-13.962795914238...|[1.66233028677295...|       2.0|
|[4.7,3.2,1.3,0.2]|    Setosa|  2.0|[-14.087883338089...|[1.55175382811232...|       2.0|
|[4.9,2.5,4.5,1.7]| Virginica|  0.0|[7.45475545546086...|[0.93772434746212...|       0.0|
|[4.9,3.0,1.4,0.2]|    Setosa|  2.0|[-12.284929225703...|[1.81554775573664...|       2.0|
|[5.0,2.3,3.3,1.0]|Versicolor|  1.0|[1.89188273284863...|[0.03213504687866...|       1.0|
|[5.0,3.4,1.6,0.4]|    Setosa|  2.0|[-13.264400395101...|[8.81087943461337...|       2.0|
|[5.0,3.5,

In [0]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)

In [246]:
print("Точноть = %g" % accuracy)

Точноть = 0.571429
