In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
spark = (
    SparkSession
        .builder
        .master("local")
        .appName("Data Sources")
        .config("spark.jars", "jars/postgresql-42.7.2.jar")
        .config("spark.driver.memory", "16g")
        .getOrCreate()
)

24/03/11 13:55:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Драйвер

In [3]:
driver = "org.postgresql.Driver"
url = "jdbc:postgresql://localhost:5432/spark"
user = "postgres"
password = "postgres"

## Чтение таблицы целиком

### Пример 1

In [4]:
employees_df = spark.read. \
    format("jdbc"). \
    option("driver", driver). \
    option("url", url). \
    option("user", user). \
    option("password", password). \
    option("dbtable", "public.employees"). \
    load()

employees_df.count()

30003

In [5]:
employees_df.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 10010|1963-06-01| Duangkaew|   Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|    Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|    Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|    Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|     Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven| Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|       Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|    Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|  Haraldson|     F|1987-09-21|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



### Пример 2

In [6]:
DBPARAMS = {
    "user": user,
    "password": password,
    "driver": driver
}

In [7]:
df = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS)
df.count()

30003

In [8]:
df.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 10010|1963-06-01| Duangkaew|   Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|    Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|    Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|    Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|     Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven| Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|       Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|    Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|  Haraldson|     F|1987-09-21|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



In [9]:
df.rdd.getNumPartitions()

1

In [10]:
df.agg(F.min(F.col("emp_no")), F.max(F.col("emp_no"))).show()

+-----------+-----------+
|min(emp_no)|max(emp_no)|
+-----------+-----------+
|      10010|     499990|
+-----------+-----------+



### Как распараллелить чтение?

In [11]:
df101 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, numPartitions = 10)
df101.count()

30003

In [12]:
df101.rdd.getNumPartitions()

1

In [13]:
df102 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS,
                        column="emp_no", lowerBound = 10010, upperBound = 499990, numPartitions = 10)
df102.count()

30003

In [14]:
df102.rdd.getNumPartitions()

10

In [15]:
employees_pruned = """(select e.first_name, e.last_name, e.hire_date from public.employees e where e.gender = 'F') as new_emp"""
df_pruned = spark.read.jdbc(url=url, table=employees_pruned, properties=DBPARAMS)
df_pruned.count()

12091

In [16]:
df_pruned.show(10)

+----------+----------+----------+
|first_name| last_name| hire_date|
+----------+----------+----------+
| Duangkaew|  Piveteau|1989-08-24|
|     Weiyi|   Meriste|1993-02-14|
|  Hironobu| Haraldson|1987-09-21|
|    Xuejia|    Ullian|1986-08-22|
|    Armond|Fairtlough|1996-07-06|
|     Yucel|     Auria|1991-03-14|
|  Zhenbing|     Perng|1986-11-16|
|   Kasturi|  Jenevein|1986-01-02|
|      Arve|Fairtlough|1986-06-23|
|      Kish| Fasbender|1992-06-25|
+----------+----------+----------+
only showing top 10 rows



## Предикаты

### Пример 1

In [17]:
pred = ["gender = 'M'", "gender = 'F'"]

df_pred = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred)
df_pred.count()

30003

In [18]:
df_pred.rdd.getNumPartitions()

2

In [19]:
pred1 = ["gender = 'F'"]

df_pred1 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred1)
df_pred1.count()

12091

In [20]:
df_pred1.rdd.getNumPartitions()

1

In [21]:
pred3 = ["gender = 'F'", "gender = 'M'", "gender = 'M'"]

df_pred3 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred3)
df_pred3.count()

47915

In [22]:
df_pred3.rdd.getNumPartitions()

3

In [23]:
df.groupBy(F.col("gender")).agg(F.count(F.col("emp_no"))).show()

+------+-------------+
|gender|count(emp_no)|
+------+-------------+
|     F|        12091|
|     M|        17912|
+------+-------------+



In [24]:
df_pred3.groupBy(F.col("gender")).agg(F.count(F.col("emp_no"))).show()

+------+-------------+
|gender|count(emp_no)|
+------+-------------+
|     F|        12091|
|     M|        35824|
+------+-------------+



### Пример 2

In [25]:
pred2 = ["emp_no > 20000 and emp_no <= 50000", "emp_no >= 50000 and emp_no <= 100000"]

df_pred2 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred2)
df_pred2.count()

8001

In [26]:
df_pred2.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 20010|1961-01-26|    Saniya|     Veccia|     M|1997-06-16|
| 20020|1962-07-22|     Akeel|     Covnot|     F|1996-03-02|
| 20030|1962-05-09|    Nitsan|Hoppenstand|     F|1988-11-18|
| 20040|1962-04-16|   Youjian|    Vingron|     M|1987-01-21|
| 20050|1959-02-27|  Guoxiang|   Greibach|     F|1991-03-18|
| 20060|1954-07-18|    Chrisa|Attimonelli|     F|1985-10-16|
| 20070|1959-04-13|       Tse|    Bellone|     M|1992-07-08|
| 20080|1962-03-22|   Odoardo|  Heiserman|     F|1991-07-01|
| 20090|1957-05-20| Serenella|   Kaltofen|     F|1986-06-24|
| 20100|1965-01-18|       Utz|     Heuter|     F|1986-06-15|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows

