In [16]:
pip install findspark

Defaulting to user installation because normal site-packages is not writeable
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Standard library
import os
import random
import zipfile
from decimal import Decimal

# Third-party
import findspark
import requests

# findspark.init() must run before the first pyspark import: it is what makes
# the Spark installation's pyspark package importable in the first place.
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType

# Local two-thread Spark session shared by every cell below.
# NOTE(review): the 6g off-heap allocation is generous for a local[2] run —
# confirm the host actually has that much memory to spare.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Create-DataFrame")
    .config("spark.executorEnv.PYSPARK_PYTHON", "python3")
    .config("spark.executorEnv.PYSPARK_DRIVER_PYTHON", "python3")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "6g")
    .getOrCreate()
)

sc = spark.sparkContext

# Source archive and local paths.
url = "https://archive.ics.uci.edu/static/public/911/recipe+reviews+and+user+feedback+dataset.zip"
archive_path = "recipe_reviews.zip"
data_folder = "./data"

# Download only when no valid archive is cached locally, so Restart & Run All
# does not hit the network every time. is_zipfile() is False for a missing
# file as well as for a corrupt one (e.g. a saved HTTP error page).
if not zipfile.is_zipfile(archive_path):
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of writing the error body to disk.
    response.raise_for_status()
    with open(archive_path, "wb") as file:
        file.write(response.content)

with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall(data_folder)

# Read the extracted CSV, letting Spark infer the column types.
df_reviews = spark.read.csv(
    f'{data_folder}/Recipe Reviews and User Feedback Dataset.csv',
    header=True,
    sep=',',
    inferSchema=True,
)
df_reviews.show(5)
df_reviews.printSchema()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/11 00:19:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+---+-------------+-----------+------------------+--------------------+--------------+----------+---------------+----------+-----------+---------+-----------+-----+----------+--------------------+
|_c0|recipe_number|recipe_code|       recipe_name|          comment_id|       user_id| user_name|user_reputation|created_at|reply_count|thumbs_up|thumbs_down|stars|best_score|                text|
+---+-------------+-----------+------------------+--------------------+--------------+----------+---------------+----------+-----------+---------+-----------+-----+----------+--------------------+
|  0|          001|      14299|Creamy White Chili|sp_aUSaElGf_14299...|u_9iFLIhMa8QaG|   Jeri326|              1|1665619889|          0|        0|          0|    5|       527|I tweaked it a li...|
|  1|          001|      14299|Creamy White Chili|sp_aUSaElGf_14299...|u_Lu6p25tmE77j|   Mark467|             50|1665277687|          0|        7|          0|    5|       724|Bush used to have...|
|  2|          

25/01/11 00:19:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , recipe_number, recipe_code, recipe_name, comment_id, user_id, user_name, user_reputation, created_at, reply_count, thumbs_up, thumbs_down, stars, best_score, text
 Schema: _c0, recipe_number, recipe_code, recipe_name, comment_id, user_id, user_name, user_reputation, created_at, reply_count, thumbs_up, thumbs_down, stars, best_score, text
Expected: _c0 but found: 
CSV file: file:///opt/spark/work-dir/data/Recipe%20Reviews%20and%20User%20Feedback%20Dataset.csv


In [6]:
# Task 1
# Give the auto-named first column ("_c0") a meaningful name.
df_reviews = df_reviews.withColumnRenamed("_c0", "id")

# Ten largest reply_count values.
(
    df_reviews
    .select("reply_count")
    .sort("reply_count", ascending=False)
    .show(10)
)

# Total best_score per recipe_code, highest totals first.
(
    df_reviews
    .groupBy("recipe_code")
    .sum("best_score")
    .sort("sum(best_score)", ascending=False)
    .show(10)
)

# Number of rows per recipe_code, most frequent first.
(
    df_reviews
    .groupBy("recipe_code")
    .count()
    .sort("count", ascending=False)
    .show(10)
)

# Number of rows per star rating, ordered by rating.
(
    df_reviews
    .groupBy("stars")
    .count()
    .sort("stars")
    .show()
)

+-----------+
|reply_count|
+-----------+
|          3|
|          3|
|          3|
|          3|
|          3|
|          3|
|          2|
|          2|
|          2|
|          2|
+-----------+
only showing top 10 rows

+-----------+---------------+
|recipe_code|sum(best_score)|
+-----------+---------------+
|       2832|          98863|
|      14299|          85497|
|      17826|          64880|
|       3309|          64247|
|      21444|          60755|
|      32480|          59867|
|      12540|          59195|
|       2912|          54032|
|      42083|          51975|
|      19731|          47905|
+-----------+---------------+
only showing top 10 rows

+-----------+-----+
|recipe_code|count|
+-----------+-----+
|       2832|  725|
|      14299|  654|
|       3309|  509|
|      42083|  421|
|      32480|  397|
|      21444|  395|
|      12540|  368|
|      17826|  338|
|       2912|  332|
|      19731|  324|
+-----------+-----+
only showing top 10 rows

+-----+-----+
|stars|count

In [7]:
# Schema of the synthetic employee table; only "id" is declared non-nullable.
data_schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("firstname", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("salary", DecimalType(10, 2), True)
])

# Value pools and inclusive ranges the random rows are drawn from.
firstnames = ["Adam", "Katarzyna", "Krzysztof", "Marek", "Aleksandra", "Zbigniew", "Wojciech", "Mieczysław", "Agata", "Wisława"]
lastnames = ["Mieczykowski", "Kowalski", "Malinowski", "Szczaw", "Glut", "Barański", "Brzęczyszczykiewicz", "Wróblewski", "Wlotka", "Pysla"]
age_range = (18, 68)
salary_range = (3200, 12500)

# Fixed seed so that Restart & Run All regenerates the same table.
random.seed(42)


def _random_salary(bounds):
    """Draw a salary uniformly from ``bounds`` as an exact two-decimal Decimal.

    Formatting to two places before building the Decimal avoids the long
    binary-float expansion that ``Decimal(float)`` produces, so the value
    already matches the DecimalType(10, 2) column without implicit rounding.
    """
    return Decimal(f"{random.uniform(*bounds):.2f}")


# 1000 rows with ids 1..1000.
data = [
    (
        i,
        random.choice(firstnames),
        random.choice(lastnames),
        random.randint(*age_range),
        _random_salary(salary_range),
    )
    for i in range(1, 1001)
]

df_employee = spark.createDataFrame(data, schema=data_schema)
df_employee.show(10)

[Stage 13:>                                                         (0 + 1) / 1]

+---+----------+-------------------+---+--------+
| id| firstname|           lastname|age|  salary|
+---+----------+-------------------+---+--------+
|  1|  Zbigniew|         Wróblewski| 48|11138.91|
|  2|Aleksandra|             Szczaw| 52| 7716.22|
|  3|  Zbigniew|         Wróblewski| 40|11314.07|
|  4|Aleksandra|              Pysla| 37| 9796.99|
|  5|      Adam|             Wlotka| 38| 9545.44|
|  6|Aleksandra|Brzęczyszczykiewicz| 67| 8579.71|
|  7|Mieczysław|           Kowalski| 38| 5369.43|
|  8|     Marek|       Mieczykowski| 46| 9552.36|
|  9|      Adam|         Wróblewski| 43| 5876.42|
| 10|  Wojciech|         Malinowski| 19|10499.58|
+---+----------+-------------------+---+--------+
only showing top 10 rows



                                                                                

In [8]:
pip install numpy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
import time

# Time how long counting the employees with salary above 10000 takes.
# perf_counter() is the monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is a wall clock and can jump if the system clock changes.
start_time = time.perf_counter()
df_employee.filter(df_employee["salary"] > 10000).count()
# The message is Polish for "Execution time: ... seconds".
print(f"Czas wykonania: {time.perf_counter() - start_time} sekund")

Czas wykonania: 0.27700138092041016 sekund


In [11]:
def bucketize_age(df, input_col, output_col, splits=None):
    """Add a bucket-index column computed from a numeric column.

    Args:
        df: Spark DataFrame containing ``input_col``.
        input_col: Name of the numeric column to bucketize.
        output_col: Name of the column that receives the bucket index.
        splits: Optional iterable of increasing bucket boundaries. Defaults
            to decade boundaries from 10 to 70, giving six buckets.

    Returns:
        The DataFrame returned by ``Bucketizer.transform``, i.e. ``df`` with
        ``output_col`` added.

    NOTE(review): values outside the outermost boundaries are handled by
    Bucketizer's own invalid-value policy, which is left at its default
    here — confirm that is acceptable for the input data.
    """
    from pyspark.ml.feature import Bucketizer

    if splits is None:
        splits = [10, 20, 30, 40, 50, 60, 70]
    bucketizer = Bucketizer(splits=list(splits), inputCol=input_col, outputCol=output_col)
    return bucketizer.transform(df)

# Attach the age bucket index to every employee row and preview the result.
df_employee = bucketize_age(df=df_employee, input_col="age", output_col="age_bucket")
df_employee.select("age", "age_bucket").show(n=20)

# Number of employees in each bucket, in bucket order.
age_distribution = (
    df_employee
    .groupBy("age_bucket")
    .count()
    .sort("age_bucket")
)
age_distribution.show()

# Stop the SparkContext now that the analysis is finished.
sc.stop()


+---+----------+
|age|age_bucket|
+---+----------+
| 48|       3.0|
| 52|       4.0|
| 40|       3.0|
| 37|       2.0|
| 38|       2.0|
| 67|       5.0|
| 38|       2.0|
| 46|       3.0|
| 43|       3.0|
| 19|       0.0|
| 48|       3.0|
| 26|       1.0|
| 32|       2.0|
| 34|       2.0|
| 31|       2.0|
| 62|       5.0|
| 31|       2.0|
| 20|       1.0|
| 33|       2.0|
| 29|       1.0|
+---+----------+
only showing top 20 rows

+----------+-----+
|age_bucket|count|
+----------+-----+
|       0.0|   39|
|       1.0|  177|
|       2.0|  197|
|       3.0|  211|
|       4.0|  208|
|       5.0|  168|
+----------+-----+

