Выполните анализ Вами выбранного датасета с помощью двух алгоритмов машинного
обучения в соответствии с индивидуальным вариантом:

Вариант: 1  
Задача регрессии: RandomForest  
Задача бинарной классификации: LogisticRegression

Датасет: 5. Датасет исторических данных по фотоэлектричеству и нагрузке.

In [56]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType, IntegerType, DoubleType
import pyspark.sql.functions as F

In [57]:
MAX_MEMORY = '20G'
# Initialize a spark session.
conf = pyspark.SparkConf().setMaster("local[*]") \
        .set('spark.executor.heartbeatInterval', 10000) \
        .set('spark.network.timeout', 10000) \
        .set("spark.core.connection.ack.wait.timeout", "3600") \
        .set("spark.executor.memory", MAX_MEMORY) \
        .set("spark.driver.memory", MAX_MEMORY)
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Pyspark guide") \
        .config(conf=conf) \
        .getOrCreate()
    return spark

spark = init_spark()

filename_data = '1.csv'
# Load the main data set into pyspark data frame 
df = spark.read.csv(filename_data, header=True, inferSchema=True, sep=';')
print('Data frame type: ' + str(type(df)))

Data frame type: <class 'pyspark.sql.dataframe.DataFrame'>


In [58]:
# Define column names and data types
columns = [
    ("timestamp", TimestampType()),
    ("site_id", IntegerType()),
    ("period_id", IntegerType()),
    ("actual_consumption", DoubleType()),
    ("actual_pv", DoubleType()),
    ("load_00", DoubleType()),
    ("load_01", DoubleType()),
    ("load_02", DoubleType()),
    ("load_03", DoubleType()),
    ("load_04", DoubleType()),
    ("load_05", DoubleType()),
    ("pv_00", DoubleType()),
    ("pv_01", DoubleType()),
    ("pv_02", DoubleType()),
    ("pv_03", DoubleType()),
    ("pv_04", DoubleType()),
    ("pv_05", DoubleType())
]

# Apply the schema to the PySpark DataFrame
for col, data_type in columns:
    df = df.withColumn(col, df['`{}`'.format(col)].cast(data_type))

# Select specific columns from the DataFrame
selected_columns = ["timestamp", "site_id", "period_id", "actual_consumption", "actual_pv",
                    "load_00", "load_01", "load_02", "load_03", "load_04", "load_05",
                    "pv_00", "pv_01", "pv_02", "pv_03", "pv_04", "pv_05"]

selected_df = df.select(*selected_columns)

# Define conditions for filtering zeros and negative numbers
conditions = (
    (F.col("actual_consumption") > 0) &
    (F.col("actual_pv") > 0) &
    (F.col("load_00") > 0) &
    (F.col("load_01") > 0) &
    (F.col("load_02") > 0) &
    (F.col("load_03") > 0) &
    (F.col("load_04") > 0) &
    (F.col("load_05") > 0) &
    (F.col("pv_00") > 0) &
    (F.col("pv_01") > 0) &
    (F.col("pv_02") > 0) &
    (F.col("pv_03") > 0) &
    (F.col("pv_04") > 0) &
    (F.col("pv_05") > 0)
)

# Apply the filter to the DataFrame
filtered_df = selected_df.filter(conditions)

# Specify the columns where you want to handle outliers (assuming these are numeric columns)
numerical_columns = ["actual_consumption", "actual_pv", "load_00", "load_01", "load_02", "load_03", "load_04", "load_05",
                     "pv_00", "pv_01", "pv_02", "pv_03", "pv_04", "pv_05"]

# Define the lower and upper limits based on 3 standard deviations from the mean
std_dev_limits = {col_name: 3 * filtered_df.agg({col_name: "stddev"}).collect()[0][0] for col_name in numerical_columns}
mean_values = {col_name: filtered_df.agg({col_name: "mean"}).collect()[0][0] for col_name in numerical_columns}

# Filter the data, keeping only the rows where values are within 3 standard deviations from the mean
for col_name in numerical_columns:
    lower_limit = mean_values[col_name] - std_dev_limits[col_name]
    upper_limit = mean_values[col_name] + std_dev_limits[col_name]
    filtered_df = filtered_df.filter((filtered_df[col_name] >= lower_limit) & (filtered_df[col_name] <= upper_limit))
    
    
# Show the resulting DataFrame
filtered_df.show()

+-------------------+-------+---------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+
|          timestamp|site_id|period_id|actual_consumption|         actual_pv|           load_00|           load_01|           load_02|           load_03|           load_04|           load_05|              pv_00|              pv_01|              pv_02|             pv_03|             pv_04|              pv_05|
+-------------------+-------+---------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+
|2014-07-19 18:45:00|      1|        0| 51.62570299494799| 22.71248932

In [59]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

# Создайте вектор признаков из выбранных колонок
feature_columns = ["site_id", "period_id", "load_00", "load_01", "load_02", "load_03", "load_04", "load_05",
                   "pv_00", "pv_01", "pv_02", "pv_03", "pv_04", "pv_05"]
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_new")  # Изменено имя колонки
assembled_df = vector_assembler.transform(filtered_df)

# Разделите данные на обучающий и тестовый наборы
(train_data, test_data) = assembled_df.randomSplit([0.8, 0.2], seed=123)

# Инициализируйте модель случайного леса
rf = RandomForestRegressor(featuresCol="features_new", labelCol="actual_consumption")  

# Создайте конвейер для последовательного выполнения шагов
pipeline = Pipeline(stages=[rf])

# Обучите модель на обучающем наборе
model = pipeline.fit(train_data)

# Сделайте прогнозы на тестовом наборе
predictions = model.transform(test_data)

# Оцените модель с использованием метрик регрессии
evaluator = RegressionEvaluator(labelCol="actual_consumption", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 2.5376


In [60]:
import pandas as pd

# Преобразуйте результаты предсказаний в DataFrame
predictions_df = predictions.select("actual_consumption", "prediction").toPandas()

# Выведите первые 20 строк DataFrame
print(predictions_df.head(20))


    actual_consumption  prediction
0            50.719565   53.450827
1            53.116910   53.163483
2            52.925833   53.163483
3            52.271559   53.197487
4            52.951707   53.197487
5            52.280497   53.121900
6            90.469777   89.486880
7            90.177203   89.652281
8            88.147575   88.264002
9            89.307277   88.877156
10           89.937456   89.652281
11           71.385435   70.599884
12           69.325334   60.634874
13           90.764232   92.292888
14           87.251615   89.031593
15           93.231641   91.494730
16           91.196495   93.283110
17           92.045886   92.045176
18           89.515550   84.690037
19           91.700063   92.505897


In [61]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import when

# Создаем новую колонку "label" на основе условия
filtered_df = filtered_df.withColumn("label", when(filtered_df["actual_consumption"] > 0, 1).otherwise(0))

# Примените VectorAssembler и создайте новый DataFrame
assembled_df = VectorAssembler(inputCols=feature_columns, outputCol="features").transform(filtered_df).select("features", "label")

# Разделите данные на обучающий и тестовый наборы
(train_data, test_data) = assembled_df.randomSplit([0.8, 0.2], seed=123)

# Инициализируйте модель Logistic Regression
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Создайте конвейер для последовательного выполнения шагов
pipeline = Pipeline(stages=[lr])

# Обучите модель на обучающем наборе
model = pipeline.fit(train_data)

# Сделайте прогнозы на тестовом наборе
predictions = model.transform(test_data)

# Оцените модель с использованием метрик бинарной классификации
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
print("Area under ROC = %g" % area_under_roc)


Area under ROC = 1


In [62]:
# Преобразуйте результаты предсказаний в Pandas DataFrame
predictions_df = predictions.select("label", "prediction", "probability").toPandas()

# Выведите первые несколько строк DataFrame
print(predictions_df.head(20))


    label  prediction probability
0       1         1.0  [0.0, 1.0]
1       1         1.0  [0.0, 1.0]
2       1         1.0  [0.0, 1.0]
3       1         1.0  [0.0, 1.0]
4       1         1.0  [0.0, 1.0]
5       1         1.0  [0.0, 1.0]
6       1         1.0  [0.0, 1.0]
7       1         1.0  [0.0, 1.0]
8       1         1.0  [0.0, 1.0]
9       1         1.0  [0.0, 1.0]
10      1         1.0  [0.0, 1.0]
11      1         1.0  [0.0, 1.0]
12      1         1.0  [0.0, 1.0]
13      1         1.0  [0.0, 1.0]
14      1         1.0  [0.0, 1.0]
15      1         1.0  [0.0, 1.0]
16      1         1.0  [0.0, 1.0]
17      1         1.0  [0.0, 1.0]
18      1         1.0  [0.0, 1.0]
19      1         1.0  [0.0, 1.0]


In [63]:
# Останавливаем сессию Spark
spark.stop()