In [1]:
#Устанавливаем pySpark
!pip install pyspark >> None

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

In [11]:
# Создаем SparkSession
spark = SparkSession.builder.appName("BookSalesAnalysis").getOrCreate()

# Читаем данные из файла CSV
books_df = spark.read.csv("books.csv", header=True, inferSchema=True)
books_df.show()


+--------------------+--------------------+----------------+------+------+
|               title|              author|           genre| sales|  year|
+--------------------+--------------------+----------------+------+------+
|                1984|       George Orwell| Science Fiction|5000.0|1949.0|
|The Lord of the R...|      J.R.R. Tolkien|         Fantasy|3000.0|1954.0|
|To Kill a Mocking...|          Harper Lee| Southern Gothic|4000.0|1960.0|
|The Catcher in th...|       J.D. Salinger|           Novel|2000.0|1951.0|
|    The Great Gatsby| F. Scott Fitzgerald|           Novel|4500.0|1925.0|
+--------------------+--------------------+----------------+------+------+



In [12]:
# Книги продажи которых превышают 3000
filtered_df = books_df.filter(col("sales") > 3000)
filtered_df.show()

+--------------------+--------------------+----------------+------+------+
|               title|              author|           genre| sales|  year|
+--------------------+--------------------+----------------+------+------+
|                1984|       George Orwell| Science Fiction|5000.0|1949.0|
|To Kill a Mocking...|          Harper Lee| Southern Gothic|4000.0|1960.0|
|    The Great Gatsby| F. Scott Fitzgerald|           Novel|4500.0|1925.0|
+--------------------+--------------------+----------------+------+------+



In [13]:
# Группируем по жанру и суммируем продажи
grouped_df = filtered_df.groupBy("genre").agg(sum("sales").alias("total_sales"))
grouped_df.show()

+----------------+-----------+
|           genre|total_sales|
+----------------+-----------+
|           Novel|     4500.0|
| Southern Gothic|     4000.0|
| Science Fiction|     5000.0|
+----------------+-----------+



In [14]:
# Сортируем по общему объему продаж
sorted_df = grouped_df.orderBy(col("total_sales").desc())
sorted_df.show()

+----------------+-----------+
|           genre|total_sales|
+----------------+-----------+
| Science Fiction|     5000.0|
|           Novel|     4500.0|
| Southern Gothic|     4000.0|
+----------------+-----------+

