In [2]:
! pip show apache-flink

Name: apache-flink
Version: 1.19.0
Summary: Apache Flink Python API
Home-page: https://flink.apache.org
Author: Apache Software Foundation
Author-email: dev@flink.apache.org
License: https://www.apache.org/licenses/LICENSE-2.0
Location: /Users/rahulsrivastav/anaconda3/lib/python3.11/site-packages
Requires: apache-beam, apache-flink-libraries, avro-python3, cloudpickle, fastavro, httplib2, numpy, pandas, pemja, protobuf, py4j, pyarrow, python-dateutil, pytz, requests, ruamel.yaml
Required-by: 


In [7]:
from pyflink.table import EnvironmentSettings, TableEnvironment
from faker import Faker




# Create a batch TableEnvironment
env_settings = EnvironmentSettings.in_batch_mode()
table_env = TableEnvironment.create(env_settings)

# Initialize Faker and seed its shared random generator so that every
# Restart & Run All produces the same rows. The `name` column is later used
# as the Elasticsearch document ID, so unseeded runs pile up new documents.
fake = Faker()
Faker.seed(42)

# Generate 10 rows of fake (name, city, state) tuples
data = [(fake.name(), fake.city(), fake.state()) for _ in range(10)]

# Column names, in the same order as the tuple fields above
column_names = ["name", "city", "state"]

# Create a PyFlink table from the in-memory rows, labelling the columns
table = table_env.from_elements(data, schema=column_names)

# Execute the job and print the resulting table
table.execute().print()

+--------------------------------+--------------------------------+--------------------------------+
|                           name |                           city |                          state |
+--------------------------------+--------------------------------+--------------------------------+
|                Dennis Hamilton |                     Jasonville |                  Massachusetts |
|                  Maria Collier |             East Kimberlyhaven |                        Georgia |
|                  Joseph Spence |                       Erinfurt |                         Oregon |
|                  Clinton Berry |                      Smithberg |                       Michigan |
|                     Mary Ellis |                      Haashaven |                   North Dakota |
|                 Cassandra Sims |                     East Kelly |                       Illinois |
|                Michelle Thomas |                     Robertside |                        

In [8]:
# Drop any view left over from a previous execution of this cell so that
# re-running it does not fail on an already-registered name.
table_env.drop_temporary_view('source_table')
# Expose the Table object to SQL under the name `source_table`
table_env.create_temporary_view('source_table', table)

# Plain string literal: the query has no placeholders, so no f-string is needed
table_env.execute_sql("SELECT * FROM source_table ").print()

+--------------------------------+--------------------------------+--------------------------------+
|                           name |                           city |                          state |
+--------------------------------+--------------------------------+--------------------------------+
|                Dennis Hamilton |                     Jasonville |                  Massachusetts |
|                  Maria Collier |             East Kimberlyhaven |                        Georgia |
|                  Joseph Spence |                       Erinfurt |                         Oregon |
|                  Clinton Berry |                      Smithberg |                       Michigan |
|                     Mary Ellis |                      Haashaven |                   North Dakota |
|                 Cassandra Sims |                     East Kelly |                       Illinois |
|                Michelle Thomas |                     Robertside |                        

In [9]:
from pyflink.table.expressions import col

# Table API query: keep all three columns, restrict to rows whose state is
# 'Vermont', then run the job and print the result.
(
    table
    .select(col("name"), col("city"), col("state"))
    .where(col("state") == 'Vermont')
    .execute()
    .print()
)

Empty set


In [10]:
# Project only the name and city columns, run the job and print the rows
(
    table
    .select(col("name"), col("city"))
    .execute()
    .print()
)

+--------------------------------+--------------------------------+
|                           name |                           city |
+--------------------------------+--------------------------------+
|                Dennis Hamilton |                     Jasonville |
|                  Maria Collier |             East Kimberlyhaven |
|                  Joseph Spence |                       Erinfurt |
|                  Clinton Berry |                      Smithberg |
|                     Mary Ellis |                      Haashaven |
|                 Cassandra Sims |                     East Kelly |
|                Michelle Thomas |                     Robertside |
|                    Amanda Wong |                      East Eric |
|               Virginia Bentley |                 Lake Territown |
|                   Jeffery Vega |                  South Timothy |
+--------------------------------+--------------------------------+
10 rows in set


In [11]:
# Register a sink table backed by the `print` connector.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE fails once
# `print_sink` is already registered in the catalog.
table_env.execute_sql("""
    CREATE TABLE IF NOT EXISTS print_sink (
        name STRING,
        city STRING,
        state STRING
    ) WITH (
        'connector' = 'print'
    )
""")

# Copy every row of the source view into the sink and block until the
# insert job has finished.
table_env.execute_sql("""
    INSERT INTO print_sink
        SELECT * FROM source_table
""").wait()

1> +I[Dennis Hamilton, Jasonville, Massachusetts]
1> +I[Maria Collier, East Kimberlyhaven, Georgia]
1> +I[Joseph Spence, Erinfurt, Oregon]
1> +I[Clinton Berry, Smithberg, Michigan]
1> +I[Mary Ellis, Haashaven, North Dakota]
1> +I[Cassandra Sims, East Kelly, Illinois]
1> +I[Michelle Thomas, Robertside, Alaska]
1> +I[Amanda Wong, East Eric, Delaware]
1> +I[Virginia Bentley, Lake Territown, North Carolina]
1> +I[Jeffery Vega, South Timothy, Arizona]


In [12]:
# Collect the Flink table into a pandas DataFrame for use outside Flink
pandas_df = table.to_pandas()
# Bare last expression so the notebook renders the frame
pandas_df

Unnamed: 0,name,city,state
0,Dennis Hamilton,Jasonville,Massachusetts
1,Maria Collier,East Kimberlyhaven,Georgia
2,Joseph Spence,Erinfurt,Oregon
3,Clinton Berry,Smithberg,Michigan
4,Mary Ellis,Haashaven,North Dakota
5,Cassandra Sims,East Kelly,Illinois
6,Michelle Thomas,Robertside,Alaska
7,Amanda Wong,East Eric,Delaware
8,Virginia Bentley,Lake Territown,North Carolina
9,Jeffery Vega,South Timothy,Arizona


In [13]:
from elasticsearch import Elasticsearch, helpers

# Client for the Elasticsearch node listening on localhost:9200
es = Elasticsearch(hosts=["http://localhost:9200"])

# Report whether the node answered the ping
print("Connected to Elasticsearch" if es.ping() else "Could not connect to Elasticsearch")

Connected to Elasticsearch


  if es.ping():


In [14]:
def df_to_elasticsearch(df, index_name, client=None, id_field='name'):
    """Bulk-index every row of a DataFrame into an Elasticsearch index.

    Each row is converted to a dict (column name -> value) and sent as the
    `_source` of one bulk action.

    Args:
        df: pandas DataFrame whose rows become documents.
        index_name: Name of the index the documents are written to.
        client: Elasticsearch client to send the request with. When omitted,
            the notebook-level `es` client is used.
        id_field: Column whose value is used as the document `_id`.
            Pass None to omit `_id` from the actions entirely.

    Returns:
        The value returned by `helpers.bulk` for the request.

    Raises:
        KeyError: If `id_field` is set but a record has no such key.
    """
    # Resolve the client at call time so the global `es` is only required
    # when the caller did not supply one explicitly.
    target = es if client is None else client

    # Prepare one bulk action per DataFrame row
    actions = []
    for record in df.to_dict(orient='records'):
        action = {"_index": index_name, "_source": record}
        if id_field is not None:
            action["_id"] = record[id_field]
        actions.append(action)

    # Bulk index the data and hand the outcome back to the caller
    return helpers.bulk(target, actions)

In [15]:
# Create the index only when it is missing. An explicit existence check is
# used instead of `ignore=400`, which would also hide unrelated 400 errors
# (for example an invalid index name), not just "index already exists".
index_name = 'test-index-pandas-v1'
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)
df_to_elasticsearch(pandas_df, index_name)

# Refresh the index to make the documents searchable
es.indices.refresh(index=index_name)

# Search the index to verify.
# NOTE: the reported total counts every document in the index, including
# documents left behind by earlier runs, while the loop prints only the hits
# returned in this response, so the two numbers can differ.
res = es.search(index=index_name, query={"match_all": {}})
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    print(hit["_source"])

Got 20 Hits:
{'name': 'Bradley Johnston', 'city': 'North Brittany', 'state': 'Tennessee'}
{'name': 'Brittany Terry', 'city': 'Burnettville', 'state': 'Virginia'}
{'name': 'Jack Olson', 'city': 'Tinabury', 'state': 'Alabama'}
{'name': 'Sharon Galvan', 'city': 'Evelynbury', 'state': 'Louisiana'}
{'name': 'Judith Wiggins', 'city': 'Jenkinsborough', 'state': 'Massachusetts'}
{'name': 'Robert Williams', 'city': 'North Gary', 'state': 'Colorado'}
{'name': 'Louis Baker', 'city': 'Kristopherberg', 'state': 'Missouri'}
{'name': 'Erin Alvarez', 'city': 'Port Tom', 'state': 'Vermont'}
{'name': 'Nicholas Martinez', 'city': 'Bensonton', 'state': 'Washington'}
{'name': 'Heather Smith', 'city': 'Kelseyborough', 'state': 'Minnesota'}


  es.indices.create(index=index_name, ignore=400)
  es.indices.create(index=index_name, ignore=400)
  helpers.bulk(es, actions)
  es.indices.refresh(index=index_name)
  res = es.search(index=index_name, query={"match_all": {}})


In [16]:
# Fetch the alias listing for every index; iterating it yields index names
indices = es.indices.get_alias(index="*")
# Header first, then one index name per line
print("List of indices:", *indices, sep="\n")

List of indices:
.kibana_7.17.21_001
.kibana-event-log-7.17.21-000001
.apm-custom-link
.kibana_task_manager_7.17.21_001
.apm-agent-configuration
test-index-pandas-v1


  indices = es.indices.get_alias(index="*")
  indices = es.indices.get_alias(index="*")


In [17]:
import os

In [18]:
CURRENT_DIR = os.getcwd()  # Working directory of the kernel; used further down to locate the connector jar

In [20]:
# Display the resolved directory (note: the rendered output exposes a local absolute path)
CURRENT_DIR

'/Users/rahulsrivastav/Documents/Notebook'

In [21]:
from pathlib import Path

# Build a well-formed file URI for the connector jar. Concatenating
# "file:///" with an absolute path yields four slashes and leaves special
# characters unescaped; Path.as_uri() handles both.
# NOTE(review): the jar name targets Flink 1.17 while `pip show` reports
# apache-flink 1.19.0 - confirm the connector is compatible.
connector_jar = Path(
    CURRENT_DIR,
    "flink_jars",
    "flink-sql-connector-elasticsearch7-3.0.1-1.17.jar",
)

# Register the jar with the pipeline so the 'elasticsearch-7' connector
# referenced by the sink DDL can be loaded.
table_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    connector_jar.as_uri(),
)

<pyflink.common.configuration.Configuration at 0x13b422390>

In [32]:
# from pyflink.datastream import StreamExecutionEnvironment
# from pyflink.table import StreamTableEnvironment, DataTypes
# from elasticsearch import Elasticsearch

In [33]:
# env = StreamExecutionEnvironment.get_execution_environment()
# t_env = StreamTableEnvironment.create(env)

In [34]:
# es_hosts = [{"host": "localhost", "port": 9200}]
# es_index = "your-index"

# # Function to write data to Elasticsearch
# def write_to_es(record):
#     es = Elasticsearch(hosts=es_hosts)
#     # Write record to Elasticsearch
#     es.index(index=es_index, body=record)

# # Stream Table API: Convert your table to a stream and write to Elasticsearch
# t_env.to_append_stream('print_sink', DataTypes.ROW())

# # Execute the job
# env.execute("Write to Elasticsearch")

In [35]:
# Define sink table DDL: three string columns written through the
# 'elasticsearch-7' connector to the index 'demo_kafka_flink_streaming_1' on
# the local node, serialised as JSON.
# NOTE(review): presumably relies on the connector jar registered via
# "pipeline.jars" in an earlier cell - confirm that cell has been run first.
sink_ddl = """
    CREATE OR REPLACE TABLE sink_table2(
        name VARCHAR,
        city VARCHAR,
        state VARCHAR
    ) WITH (        
        'connector' = 'elasticsearch-7',
        'index' = 'demo_kafka_flink_streaming_1',
        'hosts' = 'http://localhost:9200',
        'format' = 'json'
    )
"""

# Execute the DDL statement to register the sink table; the returned
# TableResult is the cell's displayed value.
table_env.execute_sql(sink_ddl)

<pyflink.table.table_result.TableResult at 0x13b535b50>

In [36]:
# Print the tables currently registered with the table environment
(
    table_env
    .execute_sql("show tables")
    .print()
)

+--------------+
|   table name |
+--------------+
|   print_sink |
|  sink_table2 |
| source_table |
+--------------+
3 rows in set


In [43]:
# Re-check the contents of the source view before writing it to the sink
(
    table_env
    .execute_sql("select * from source_table")
    .print()
)

+--------------------------------+--------------------------------+--------------------------------+
|                           name |                           city |                          state |
+--------------------------------+--------------------------------+--------------------------------+
|                Dennis Hamilton |                     Jasonville |                  Massachusetts |
|                  Maria Collier |             East Kimberlyhaven |                        Georgia |
|                  Joseph Spence |                       Erinfurt |                         Oregon |
|                  Clinton Berry |                      Smithberg |                       Michigan |
|                     Mary Ellis |                      Haashaven |                   North Dakota |
|                 Cassandra Sims |                     East Kelly |                       Illinois |
|                Michelle Thomas |                     Robertside |                        

In [44]:
# Copy every row of the source view into the Elasticsearch-backed sink and
# block until the insert job has finished. The statement has no
# placeholders, so a plain string literal is used instead of an f-string.
table_env.execute_sql("insert into sink_table2 SELECT * FROM source_table ").wait()

In [42]:
# table_env.execute_sql("select * from sink_table2").print()