In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

Создаём SparkSession. Добавляем путь к драйверу JDBC.

In [2]:
spark = (
    SparkSession
        .builder
        .master("local")
        .appName("JDBC Data Source")
        .config("spark.jars", "jars/postgresql-42.7.2.jar")
        .config("spark.driver.memory", "8g")
        .config("spark.log.level", "WARN")
        .getOrCreate()
)

25/03/13 16:53:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Задаём свойства подключения.

In [3]:
driver = "org.postgresql.Driver"
url = "jdbc:postgresql://localhost:5432/spark"
user = "postgres"
password = "postgres"

## Чтение таблицы целиком

### Вариант 1

In [4]:
employees_df = spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("dbtable", "public.employees") \
    .load()

employees_df.count()

30003

In [5]:
employees_df.printSchema()

root
 |-- emp_no: integer (nullable = true)
 |-- birth_date: date (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- hire_date: date (nullable = true)



In [6]:
employees_df.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 10010|1963-06-01| Duangkaew|   Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|    Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|    Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|    Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|     Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven| Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|       Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|    Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|  Haraldson|     F|1987-09-21|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



### Вариант 2

In [7]:
DBPARAMS = {
    "user": user,
    "password": password,
    "driver": driver
}

In [8]:
df = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS)
df.count()

30003

In [9]:
df.printSchema()

root
 |-- emp_no: integer (nullable = true)
 |-- birth_date: date (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- hire_date: date (nullable = true)



In [10]:
df.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 10010|1963-06-01| Duangkaew|   Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|    Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|    Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|    Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|     Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven| Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|       Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|    Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|  Haraldson|     F|1987-09-21|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



Проверим количество партиций.

In [11]:
df.rdd.getNumPartitions()

1

## Как распараллелить чтение?

### Партиционирование по столбцам

Добавим количество партиций к параметрам чтения таблицы.

In [12]:
df101 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, numPartitions=10)

print("count = ", df101.count())
print("num partitions = ", df101.rdd.getNumPartitions())

count =  30003
num partitions =  1


Количество партиций не изменилось.

Узнаем минимальное и максимальное значения столбца *emp_no*

In [13]:
df.agg(min(col("emp_no")), max(col("emp_no"))).show()

+-----------+-----------+
|min(emp_no)|max(emp_no)|
+-----------+-----------+
|      10010|     499990|
+-----------+-----------+



In [14]:
min_emp_no = df.agg(min(col("emp_no"))).collect()[0][0]
max_emp_no = df.agg(max(col("emp_no"))).collect()[0][0]

print("min = ", min_emp_no, "\nmax = ", max_emp_no)

min =  10010 
max =  499990


In [15]:
df102 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS,
                        column="emp_no", lowerBound = min_emp_no, upperBound = max_emp_no, numPartitions=10)

print("count = ", df102.count())
print("num partitions = ", df102.rdd.getNumPartitions())

count =  30003
num partitions =  10


In [16]:
df102.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 10010|1963-06-01| Duangkaew|   Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|    Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|    Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|    Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|     Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven| Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|       Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|    Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|  Haraldson|     F|1987-09-21|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



Посмотрим сколько записей попало в каждую партицию

In [17]:
df102.rdd.foreachPartition(lambda p: print("Partition count = ", len(list(p))))

Partition count =  4900                                            (0 + 1) / 10]
Partition count =  4900                                            (1 + 1) / 10]
Partition count =  203
Partition count =  601
Partition count =  4899
Partition count =  0
Partition count =  200
Partition count =  4900
                                                                                

Это также можно получить другим способом

In [18]:
pl = df102.rdd.mapPartitionsWithIndex(lambda p, i: (p, len(list(i)))).collect()
list(zip(pl[::2], pl[1::2]))

                                                                                

[(0, 4900),
 (1, 4900),
 (2, 203),
 (3, 601),
 (4, 4899),
 (5, 4500),
 (6, 0),
 (7, 200),
 (8, 4900),
 (9, 4900)]

Зададим в качестве *lowerBound* и *upperBound* произвольные значения (не min и max)

In [19]:
df103 = spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("dbtable", "public.employees") \
    .option("partitionColumn", "emp_no") \
    .option("lowerBound", "20000") \
    .option("upperBound", "50000") \
    .option("numPartitions", "10") \
    .load()

print("count = ", df103.count())
print("num partitions = ", df103.rdd.getNumPartitions())

count =  30003
num partitions =  10


Посмотрим сколько теперь записей попало в каждую партицию

In [20]:
pl2 = df103.rdd.mapPartitionsWithIndex(lambda p, i: (p, len(list(i)))).collect()
list(zip(pl2[::2], pl2[1::2]))

[(0, 1299),
 (1, 300),
 (2, 300),
 (3, 300),
 (4, 300),
 (5, 300),
 (6, 300),
 (7, 300),
 (8, 300),
 (9, 26304)]

In [21]:
df103.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 10010|1963-06-01| Duangkaew|   Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|    Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|    Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|    Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|     Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven| Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|       Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|    Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|  Haraldson|     F|1987-09-21|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



### Партиционирование по предикатам

### Пример 1

Опредилим **два** предиката по значению столбца *gender*

In [22]:
pred = ["gender = 'M'", "gender = 'F'"]

df_pred = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred)

print("count = ", df_pred.count())
print("num partitions = ", df_pred.rdd.getNumPartitions())

count =  30003
num partitions =  2


Опредилим **один** предиката по одному значению столбца *gender*

In [23]:
pred1 = ["gender = 'F'"]

df_pred1 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred1)

print("count = ", df_pred1.count())
print("num partitions = ", df_pred1.rdd.getNumPartitions())

count =  12091
num partitions =  1


Опредилим **три** предиката по значению столбца *gender*

In [24]:
pred3 = ["gender = 'F'", "gender = 'M'", "gender = 'M'"]

df_pred3 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred3)

print("count = ", df_pred3.count())
print("num partitions = ", df_pred3.rdd.getNumPartitions())

count =  47915
num partitions =  3


Опредилим **четыре** предиката по значению столбца *gender*

In [25]:
pred4 = ["gender = 'F'", "gender = 'F'", "gender = 'M'", "gender = 'M'"]

df_pred4 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred4)

print("count = ", df_pred4.count())
print("num partitions = ", df_pred4.rdd.getNumPartitions())

count =  60006
num partitions =  4


Посмотрим сколько записей для каждого значения столбца *gender* было в исходной таблице

In [26]:
df.groupBy(col("gender")).agg(count(col("emp_no"))).show()

+------+-------------+
|gender|count(emp_no)|
+------+-------------+
|     F|        12091|
|     M|        17912|
+------+-------------+



Сравним с количеством записей при применении трёх и четырёх предикатов

In [27]:
df_pred3.groupBy(col("gender")).agg(count(col("emp_no"))).show()

+------+-------------+
|gender|count(emp_no)|
+------+-------------+
|     F|        12091|
|     M|        35824|
+------+-------------+



In [28]:
df_pred4.groupBy(col("gender")).agg(count(col("emp_no"))).show()

+------+-------------+
|gender|count(emp_no)|
+------+-------------+
|     F|        24182|
|     M|        35824|
+------+-------------+



### Пример 2

Определим **два** предиката по условиям на значения столбца *emp_no* 

In [29]:
pred2 = ["emp_no > 20000 and emp_no <= 50000", "emp_no >= 50000 and emp_no <= 100000"]

df_pred2 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred2)

print("count = ", df_pred2.count())
print("num partitions = ", df_pred2.rdd.getNumPartitions())

count =  8001
num partitions =  2


In [30]:
df_pred2.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 20010|1961-01-26|    Saniya|     Veccia|     M|1997-06-16|
| 20020|1962-07-22|     Akeel|     Covnot|     F|1996-03-02|
| 20030|1962-05-09|    Nitsan|Hoppenstand|     F|1988-11-18|
| 20040|1962-04-16|   Youjian|    Vingron|     M|1987-01-21|
| 20050|1959-02-27|  Guoxiang|   Greibach|     F|1991-03-18|
| 20060|1954-07-18|    Chrisa|Attimonelli|     F|1985-10-16|
| 20070|1959-04-13|       Tse|    Bellone|     M|1992-07-08|
| 20080|1962-03-22|   Odoardo|  Heiserman|     F|1991-07-01|
| 20090|1957-05-20| Serenella|   Kaltofen|     F|1986-06-24|
| 20100|1965-01-18|       Utz|     Heuter|     F|1986-06-15|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



Определим **один** предикат по условию на значения столбца *emp_no* 

In [31]:
pred22 = ["emp_no > 20000 and emp_no <= 50000"]

df_pred22 = spark.read.jdbc(url=url, table="public.employees", properties=DBPARAMS, predicates=pred22)

print("count = ", df_pred22.count())
print("num partitions = ", df_pred22.rdd.getNumPartitions())

count =  3000
num partitions =  1


## Фильтрация

Выполним запрос к базе на выборку значений из таблицы с условием

In [32]:
q = """select * from public.employees where emp_no > 20000 and emp_no <= 50000"""

dfq = spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("query", q) \
    .load()

dfq.count()

3000

In [33]:
dfq.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 20010|1961-01-26|    Saniya|     Veccia|     M|1997-06-16|
| 20020|1962-07-22|     Akeel|     Covnot|     F|1996-03-02|
| 20030|1962-05-09|    Nitsan|Hoppenstand|     F|1988-11-18|
| 20040|1962-04-16|   Youjian|    Vingron|     M|1987-01-21|
| 20050|1959-02-27|  Guoxiang|   Greibach|     F|1991-03-18|
| 20060|1954-07-18|    Chrisa|Attimonelli|     F|1985-10-16|
| 20070|1959-04-13|       Tse|    Bellone|     M|1992-07-08|
| 20080|1962-03-22|   Odoardo|  Heiserman|     F|1991-07-01|
| 20090|1957-05-20| Serenella|   Kaltofen|     F|1986-06-24|
| 20100|1965-01-18|       Utz|     Heuter|     F|1986-06-15|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



## Соединения в базе

Выполним запрос к базе на выборку значений из соединения таблиц

In [34]:
qj = """select e.emp_no, birth_date, first_name, last_name, gender, hire_date, salary, from_date, to_date
from employees e join salaries s on e.emp_no = s.emp_no"""

dfj = spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("query", qj) \
    .load()

dfj.count()

283827

In [35]:
dfj.show()

+------+----------+----------+---------+------+----------+------+----------+----------+
|emp_no|birth_date|first_name|last_name|gender| hire_date|salary| from_date|   to_date|
+------+----------+----------+---------+------+----------+------+----------+----------+
| 10010|1963-06-01| Duangkaew| Piveteau|     F|1989-08-24| 72488|1996-11-24|1997-11-24|
| 10010|1963-06-01| Duangkaew| Piveteau|     F|1989-08-24| 74347|1997-11-24|1998-11-24|
| 10010|1963-06-01| Duangkaew| Piveteau|     F|1989-08-24| 75405|1998-11-24|1999-11-24|
| 10010|1963-06-01| Duangkaew| Piveteau|     F|1989-08-24| 78194|1999-11-24|2000-11-23|
| 10010|1963-06-01| Duangkaew| Piveteau|     F|1989-08-24| 79580|2000-11-23|2001-11-23|
| 10010|1963-06-01| Duangkaew| Piveteau|     F|1989-08-24| 80324|2001-11-23|9999-01-01|
| 10020|1952-12-24|    Mayuko|  Warwick|     M|1991-01-26| 40000|1997-12-30|1998-12-30|
| 10020|1952-12-24|    Mayuko|  Warwick|     M|1991-01-26| 40647|1998-12-30|1999-12-30|
| 10020|1952-12-24|    Mayuko|  

## Запись в таблицу

Посмотрим на таблицу *employees*

In [36]:
employees_df.show(10)

+------+----------+----------+-----------+------+----------+
|emp_no|birth_date|first_name|  last_name|gender| hire_date|
+------+----------+----------+-----------+------+----------+
| 10010|1963-06-01| Duangkaew|   Piveteau|     F|1989-08-24|
| 10020|1952-12-24|    Mayuko|    Warwick|     M|1991-01-26|
| 10030|1958-07-14|     Elvis|    Demeyer|     M|1994-02-17|
| 10040|1959-09-13|     Weiyi|    Meriste|     F|1993-02-14|
| 10050|1958-05-21|   Yinghua|     Dredge|     M|1990-12-25|
| 10060|1961-10-15|  Breannda|Billingsley|     M|1987-11-02|
| 10070|1955-08-20|    Reuven| Garigliano|     M|1985-10-14|
| 10080|1957-12-03|    Premal|       Baek|     M|1985-11-19|
| 10090|1961-05-30|    Kendra|    Hofting|     M|1986-03-14|
| 10100|1953-04-21|  Hironobu|  Haraldson|     F|1987-09-21|
+------+----------+----------+-----------+------+----------+
only showing top 10 rows



Загрузим таблицу *salaries*

In [37]:
salaries_df = spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("dbtable", "public.salaries") \
    .load()

salaries_df.count()

283827

In [38]:
salaries_df.show(10)

+------+------+----------+----------+
|emp_no|salary| from_date|   to_date|
+------+------+----------+----------+
| 10010| 72488|1996-11-24|1997-11-24|
| 10010| 74347|1997-11-24|1998-11-24|
| 10010| 75405|1998-11-24|1999-11-24|
| 10010| 78194|1999-11-24|2000-11-23|
| 10010| 79580|2000-11-23|2001-11-23|
| 10010| 80324|2001-11-23|9999-01-01|
| 10020| 40000|1997-12-30|1998-12-30|
| 10020| 40647|1998-12-30|1999-12-30|
| 10020| 43800|1999-12-30|2000-12-29|
| 10020| 44927|2000-12-29|2001-12-29|
+------+------+----------+----------+
only showing top 10 rows



Сделаем группировку по колонке *emp_no* и найдём максимальное значение колонки *salary*

In [39]:
employees_salaries_df = salaries_df.groupBy(col("emp_no")).agg(max(col("salary")).alias("max_salary"))

employees_salaries_df.show(10)

+------+----------+
|emp_no|max_salary|
+------+----------+
| 12940|     85425|
| 13840|     41453|
| 14450|     75524|
| 14570|     72506|
| 15790|     79009|
| 17420|     97003|
| 18800|     88342|
| 19530|     47864|
| 21220|     86177|
| 21700|     68115|
+------+----------+
only showing top 10 rows



Создадим новый Dataframe как результат соединения *employees_df* и агрегированного *salaries_df*

In [40]:
employees_salaries_df = employees_df.join(employees_salaries_df, "emp_no")

employees_salaries_df.show()

+------+----------+----------+---------+------+----------+----------+
|emp_no|birth_date|first_name|last_name|gender| hire_date|max_salary|
+------+----------+----------+---------+------+----------+----------+
| 12940|1953-10-25|  Odinaldo|   Farrar|     F|1987-12-12|     85425|
| 13840|1954-11-13|     Remco|    Demke|     M|1992-06-09|     41453|
| 14450|1963-08-01|  Fumitaka|Prochazka|     F|1985-04-26|     75524|
| 14570|1963-07-26|    Chinho|     Bala|     F|1994-11-17|     72506|
| 15790|1960-03-20|     Kokou| Schnabel|     M|1991-04-03|     79009|
| 17420|1964-05-22|     Jinpo|Stamatiou|     F|1987-07-31|     97003|
| 18800|1962-05-21|    Baruch|  Rosiles|     F|1990-07-06|     88342|
| 19530|1953-12-26|     Barry|   Dratva|     M|1997-09-02|     47864|
| 21220|1956-05-11|      Mana|  Murtagh|     M|1991-10-29|     86177|
| 21700|1963-07-12|       Urs|  Plesums|     F|1992-03-07|     68115|
| 27760|1959-12-05|    Zhigen|  Schrift|     M|1991-04-03|     63106|
| 28170|1964-05-19| 

Сохраним новый Dataframe в таблицу в базе. Таблицы с таким именем в базе не было. Она будет создана.

In [41]:
employees_salaries_df.write \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("dbtable", "public.employees_salaries") \
    .save()

Если таблица с таким именем существовала в базе, то при сохранении надо использовать режим *overwrite*

In [42]:
employees_salaries_df.write \
    .mode("overwrite") \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("dbtable", "public.employees_salaries") \
    .option("truncate", "true") \
    .save()

In [43]:
spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("dbtable", "public.employees_salaries") \
    .load() \
    .count()

30003

Если использовать режим *append* содержимое Dataframe будет добавлено в таблицу

In [44]:
employees_salaries_df.write \
    .mode("append") \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("dbtable", "public.employees_salaries") \
    .save()

In [45]:
spark.read \
    .format("jdbc") \
    .option("driver", driver) \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("dbtable", "public.employees_salaries") \
    .load() \
    .count()

60006