<a href="https://colab.research.google.com/github/Poznyakova/-7/blob/main/%D0%94%D0%97_%E2%84%9613_%D0%9F%D0%BE%D0%B7%D0%BD%D1%8F%D0%BA%D0%BE%D0%B2%D0%B0_%D0%9E%D0%BB%D1%8C%D0%B3%D0%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

In [None]:
# загружаем библиотеки
import pandas as pd
import numpy as np

from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType, IntegerType, DateType

from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession #to define a SparkSession

# Create the session through the builder: getOrCreate() reuses an already
# running local Spark, so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master('local').getOrCreate()  # open (or reuse) the Spark session
sc = spark.sparkContext  # underlying SparkContext, kept under its original name

Загружаем данные

In [None]:
# Read iris.csv as comma-separated text with a header row.
# quote="" is passed as the quote character, so double-quote characters are
# not consumed by the parser here; they are stripped in a later cell.
csv_reader = spark.read.format("com.databricks.spark.csv")
csv_reader = csv_reader.options(sep=",", header=True, quote="")
df = csv_reader.csv('iris.csv')

In [None]:
# Preview the raw frame; the arguments spell out show()'s defaults.
df.show(n=20, truncate=True)

Определяем тип данных

In [None]:
# Print the column names and types of the freshly loaded frame.
# No schema is supplied or inferred when reading, and the columns are
# explicitly cast to double in a later cell.
df.printSchema()

Преобразуем датафрейм (переименовываем колонки, убираем кавычки, изменяем тип данных)

In [None]:
# New column names; toDF() assigns them to the existing columns positionally.
newColumns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "variety"]
df_new = df.toDF(*newColumns)

# Remove every double-quote character from the first and last columns.
for quoted_col in ("sepal_length", "variety"):
    df_new = df_new.withColumn(quoted_col, regexp_replace(col(quoted_col), '"', ""))

# Cast the four measurement columns to double.
for measure_col in ("sepal_length", "sepal_width", "petal_length", "petal_width"):
    df_new = df_new.withColumn(measure_col, col(measure_col).cast(DoubleType()))

Выводим статистику по датафрейму

In [None]:
# Compute summary statistics for the cleaned frame, then display them.
summary_stats = df_new.summary()
summary_stats.show()

Векторизация признаков. Pipeline

In [None]:
# Stage 1: index the string column 'variety' into the column 'varietyInd'.
label_indexer = StringIndexer(inputCol='variety', outputCol='varietyInd')

# Stage 2: assemble the four measurement columns into one vector column 'Features'.
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
feature_assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')

# Run the stages in this order.
pipeline = Pipeline(stages=[label_indexer, feature_assembler])

In [None]:
# Fit the preprocessing pipeline on the cleaned frame.
pipelineTrained = pipeline.fit(dataset=df_new)

In [None]:
# Apply the fitted pipeline and preview the result, including the added
# 'varietyInd' and 'Features' columns.
transformed_preview = pipelineTrained.transform(df_new)
transformed_preview.show()

In [None]:
# Keep the pipeline output (original columns plus 'varietyInd' and 'Features')
# as the modelling frame.
df_features = pipelineTrained.transform(dataset=df_new)

Делим датафрейм на обучающую и тестовую выборки (80/20)

In [None]:
# Random 80/20 split; the fixed seed keeps the partition repeatable.
split_weights = [0.8, 0.2]
train, test = df_features.randomSplit(weights=split_weights, seed=12345)

In [None]:
# Preview the training partition; the arguments spell out show()'s defaults.
train.show(n=20, truncate=True)

Создание модели и ее обучение

In [None]:
# Logistic regression reading the assembled vector column and the indexed label.
lr = LogisticRegression(
    labelCol='varietyInd',
    featuresCol='Features',
)
# Fit on the training partition only.
lrModel = lr.fit(dataset=train)

In [None]:
# Score both partitions with the fitted model.
train_res, test_res = (lrModel.transform(part) for part in (train, test))

In [None]:
# Preview the model output on the training partition (show()'s defaults made explicit).
train_res.show(n=20, truncate=True)

In [None]:
# Preview the model output on the test partition (show()'s defaults made explicit).
test_res.show(n=20, truncate=True)

MulticlassClassificationEvaluator

In [None]:
# The evaluator's default metric is F1, but the cells below report the number
# as accuracy ("Точность"), so request the accuracy metric explicitly.
# predictionCol is left at its default.
ev = MulticlassClassificationEvaluator(labelCol='varietyInd', metricName='accuracy')

In [None]:
# Report the evaluator's score on the held-out test predictions, as a percentage.
# The label previously said "training set" ("обучающей") although test_res is
# what gets evaluated here.
print('Точность предсказания на тестовой выборке:', ev.evaluate(test_res)*100, '%')

In [None]:
# Score the predictions made on the training partition and report them as a percentage.
train_score_pct = ev.evaluate(train_res) * 100
print('Точность предсказания на обучающей выборке:', train_score_pct, '%')