<a href="https://colab.research.google.com/github/SkyRanger2010/DE2024_PY/blob/main/HomeWork_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sdv


In [None]:
from pyspark.sql import functions as F
import time
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata
import pandas as pd
from pyspark.sql import SparkSession

path = ''
df = pd.read_csv(path + 'electronic_devices.csv')
df['addons'] = df['addons'].fillna('')
df['addons'] = df['addons'].apply(lambda x: len(x.split(',')))

metadata = Metadata.detect_from_dataframe(
    data=df,
    table_name='electronic_devices')

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(df)

synthetic_data = synthesizer.sample(num_rows=1_000_000)
synthetic_data.to_csv(path + 'generated_electronic_devices.csv', index=False)

In [15]:
# Инициализация SparkSession
spark = SparkSession.builder \
    .appName("Electronic Devices Analysis") \
    .getOrCreate()

spark.sparkContext.setCheckpointDir('/')


# Чтение сгенерированных данных
df_spark = spark.read.csv(
    'generated_electronic_devices.csv', header=True, inferSchema=True)

# Фильтрация данных за 2024-09-03 и с одной дополнительной покупкой
filtered_data = df_spark.filter(
    (df_spark.purchase_date == '2024-09-03') & (df_spark.addons == 1))

# Группировка по полу и возрасту, вычисление разницы между минимальной и максимальной ценой
result = filtered_data.groupBy('gender', 'age').agg(
    (F.max('unit_price') - F.min('unit_price')).alias('price_difference'),
    (F.max('total_price') - F.min('total_price')).alias('order_difference')
)

# Сбор данных в список
result_collected = result.collect()

# Без кэширования
start_time = time.time()
result_no_cache = filtered_data.groupBy('gender', 'age').agg(
    (F.max('unit_price') - F.min('unit_price')).alias('price_difference'),
    (F.max('total_price') - F.min('total_price')).alias('order_difference')
).collect()
no_cache_duration = time.time() - start_time

# С кэшированием
filtered_data.cache()  # Кэширование данных
start_time = time.time()
result_with_cache = filtered_data.groupBy('gender', 'age').agg(
    (F.max('unit_price') - F.min('unit_price')).alias('price_difference'),
    (F.max('total_price') - F.min('total_price')).alias('order_difference')
).collect()
cache_duration = time.time() - start_time

# Сохранение контрольной точки
filtered_data.checkpoint()  # Сохранение контрольной точки
start_time = time.time()
result_with_checkpoint = filtered_data.groupBy('gender', 'age').agg(
    (F.max('unit_price') - F.min('unit_price')).alias('price_difference'),
    (F.max('total_price') - F.min('total_price')).alias('order_difference')
).collect()
checkpoint_duration = time.time() - start_time

print(f"Duration without caching: {no_cache_duration} seconds")
print(f"Duration with caching: {cache_duration} seconds")
print(f"Duration with checkpointing: {checkpoint_duration} seconds")

Duration without caching: 0.7357027530670166 seconds
Duration with caching: 0.35079503059387207 seconds
Duration with checkpointing: 0.5172924995422363 seconds
