# Примеры чтения данных из таблицы greenplum

## Чтение средствами psycopg2

In [9]:
import psycopg2
import os
import pandas

In [10]:
# Database connection settings, all read from environment variables.
# A missing variable raises KeyError on the corresponding line.
HOST = os.environ['GREENPLUM_HOST']
PORT = os.environ['GREENPLUM_PORT']
DATABASE = os.environ['GREENPLUM_DATABASE']
SCHEMA = os.environ['GREENPLUM_SCHEMA']
LOGIN = os.environ['GREENPLUM_LOGIN']
PASSWORD = os.environ['GREENPLUM_PASSWORD']

# Open a connection with the settings above.
connection = psycopg2.connect(
    host=HOST,
    port=PORT,
    database=DATABASE,
    user=LOGIN,
    password=PASSWORD,
)

In [11]:
# Name of the table to read (it is combined with SCHEMA in the queries below).
tableName = "table1"

In [12]:
# An earlier cell may already hold an open connection under this name.
# Close it before re-binding `connection`, otherwise that session is leaked.
_previous = globals().get('connection')
if _previous is not None and not _previous.closed:
    _previous.close()

connection = psycopg2.connect(host=HOST, port=PORT, database=DATABASE, user=LOGIN, password=PASSWORD)
sql = f"""
     select * FROM {SCHEMA}.{tableName}
"""
try:
    # `cur` is kept as a notebook-level name on purpose:
    # the pandas cell below reads cur.description to get the column names.
    cur = connection.cursor()
    try:
        cur.execute(sql)
        data = cur.fetchall()  # list of row tuples
        print(data)
        connection.commit()
    finally:
        cur.close()
finally:
    # Release the session even if the query or the fetch fails.
    connection.close()

[(2, 2.0, '2'), (3, 3.0, '3'), (6, 6.0, '6'), (4, 4.0, '4'), (1, 1.0, '1'), (5, 5.0, '5')]


In [13]:
# Convert the fetched rows into a pandas DataFrame.
# The column names are the first item of every entry in cur.description.
# They are passed to the constructor (instead of being assigned afterwards)
# so that an empty result set still yields a correctly labelled, empty frame
# rather than a length-mismatch ValueError.
pdf = pandas.DataFrame(data, columns=[desc[0] for desc in cur.description])
display(pdf)

Unnamed: 0,c1,c2,c3
0,2,2.0,2
1,3,3.0,3
2,6,6.0,6
3,4,4.0,4
4,1,1.0,1
5,5,5.0,5


In [15]:
# Do not forget to close the connection once it is no longer needed.
connection.close()

## Чтение средствами spark

In [14]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, LongType, StringType, IntegerType, DateType, TimestampType, FloatType
from pyspark.sql.functions import col, cast, date_trunc, sum, dayofweek, hour, dayofmonth, lit

In [16]:
from urllib.parse import quote

# Get the active Spark session, or create one for the application named 'fill'.
spark = SparkSession.builder.appName('fill').getOrCreate()

# The credentials are passed in the URL query string, so they are
# percent-encoded: characters such as '&', '=', '#', '%' or spaces in the
# login/password would otherwise be parsed as URL syntax and break the URL.
# NOTE(review): the password is still part of the URL and may show up wherever
# the URL is logged; consider passing 'user'/'password' as separate JDBC options.
jdbcURLGreenplum = (
    f'jdbc:postgresql://{HOST}:{PORT}/{DATABASE}'
    f'?user={quote(LOGIN, safe="")}&password={quote(PASSWORD, safe="")}'
)

https://packages.confluent.io/maven/ added as a remote repository with the name: repo-1
Ivy Default Cache set to: /home/sdl/.ivy2/cache
The jars for the packages stored in: /home/sdl/.ivy2/jars
org.tarantool#connector added as a dependency
org.apache.spark#spark-streaming_2.13 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
za.co.absa#abris_2.13 added as a dependency
com.typesafe#config added as a dependency
org.apache.spark#spark-avro_2.13 added as a dependency
org.apache.spark#spark-streaming-kafka-0-10_2.13 added as a dependency
com.ibm.icu#icu4j added as a dependency
org.apache.commons#commons-lang3 added as a dependency
com.oracle.database.jdbc#ojdbc10 added as a dependency
org.postgresql#postgresql added as a dependency
com.clickhouse#clickhouse-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a4475061-ed0a-47ad-a91d-29d01d5688d7;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.tarantool#connector;1.9.4 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.13;3.4.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;3.4.0 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.9.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.0.4 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
	found za.co.absa#abris_2.13;6.3.0 in central
	found io.confluent#kafka-avro-serializer;6.2.1 in repo-1
	found org.apache.avro#avro;1.10.1 in central
	found com.fasterxml.jackson.core#jackson-core;2.10.5 in central
	found com.fasterxml.ja

In [19]:
# Approach 1: point the JDBC reader at the whole table ('dbtable' option)
# and apply the filter afterwards on the Spark DataFrame.
# NOTE(review): the original note states that the entire table is pulled into
# Spark before the filter is applied; whether the predicate is pushed down to
# the database depends on the Spark JDBC settings — confirm with df1.explain().
df1 = (
    spark.read.format("jdbc")
    .option('url', jdbcURLGreenplum)
    .option('driver', 'org.postgresql.Driver')
    .option('dbtable', f'{SCHEMA}.{tableName}')
    .load()
)
df1.where('c1>4').show()


+---+---+---+
| c1| c2| c3|
+---+---+---+
|  5|5.0|  5|
|  6|6.0|  6|
+---+---+---+



In [20]:
# Approach 2: send the filtering SQL to the database through the 'query'
# option, so Spark receives only the rows selected by the WHERE clause.
query = f"""
SELECT  *
FROM {SCHEMA}.{tableName}
WHERE c1>4
"""
df1 = (
    spark.read.format("jdbc")
    .option('url', jdbcURLGreenplum)
    .option('driver', 'org.postgresql.Driver')
    .option('query', query)
    .load()
)
df1.show()

+---+---+---+
| c1| c2| c3|
+---+---+---+
|  6|6.0|  6|
|  5|5.0|  5|
+---+---+---+



In [21]:
# Convert the Spark DataFrame to pandas and display it.
display(df1.toPandas())

Unnamed: 0,c1,c2,c3
0,6,6.0,6
1,5,5.0,5


In [None]:
# It is advisable to stop the Spark session when finished.
spark.stop()