Switch to py 3.10 and spark 3.3.1 (#92)
- Switch our tests to Spark 3.3.1 and Python 3.10
- Drop the `six` package because we don't support Python 2.x anymore
- Replace Travis CI with GitHub Actions
- Make our test/dev env compatible with arm64
- Reduce the number of warnings
drudim committed Dec 17, 2022
1 parent 015dbc8 commit 77e4c05
Showing 21 changed files with 197 additions and 249 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,14 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - run: make test
31 changes: 0 additions & 31 deletions .travis.yml

This file was deleted.

2 changes: 1 addition & 1 deletion Dockerfile
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 
-FROM python:3.7
+FROM python:3.10
 
 LABEL maintainer="dev@tubularlabs.com"
 
4 changes: 2 additions & 2 deletions README.rst
@@ -1,7 +1,7 @@
 Sparkly
 =======
 
-|Sparkly PyPi Version| |Sparkly Build Status| |Documentation Status|
+|Sparkly PyPi Version| |Documentation Status|
 
 Helpers & syntax sugar for PySpark. There are several features to make your life easier:

@@ -35,7 +35,7 @@ and write its content to ElasticSearch index::
         'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11',
         'org.elasticsearch:elasticsearch-spark-20_2.11:6.5.4',
     ]
 
 
 if __name__ == '__main__':
     spark = MySession()
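For orientation, the README example around this hunk boils down to a minimal sketch like the one below. The session class and package coordinates come straight from the README context above; the URLs passed to sparkly's by_url helpers are illustrative assumptions, not part of the diff:

    from sparkly import SparklySession


    class MySession(SparklySession):
        # Maven packages fetched on session start-up (versions as still
        # shown in the README context above).
        packages = [
            'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11',
            'org.elasticsearch:elasticsearch-spark-20_2.11:6.5.4',
        ]


    if __name__ == '__main__':
        spark = MySession()
        # Hypothetical end-to-end flow: read a Cassandra table and write
        # it to an Elasticsearch index via sparkly's URL-based helpers.
        df = spark.read_ext.by_url('cassandra://cassandra.docker/my_keyspace/my_table')
        df.write_ext.by_url('elastic://elastic.docker/my_index')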
24 changes: 9 additions & 15 deletions docker-compose.yml
@@ -47,17 +47,12 @@ services:
         condition: service_healthy
 
   cassandra.docker:
-    image: cassandra:2.1.13
-    mem_limit: 500M
-    memswap_limit: 600M
-    environment:
-      MAX_HEAP_SIZE: 500M
-      HEAP_NEWSIZE: 200M
+    image: cassandra:4.1
     healthcheck:
-      test: ps ax | grep cassandra
+      test: ["CMD-SHELL", "[ $$(nodetool statusgossip) = running ]"]
 
   elastic.docker:
-    image: docker.elastic.co/elasticsearch/elasticsearch:7.3.0
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.8
     environment:
       - xpack.security.enabled=false
       - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
@@ -69,16 +64,15 @@
       retries: 20
 
   mysql.docker:
-    image: mysql:5.7.11
+    image: mysql:8.0
     environment:
       MYSQL_DATABASE: sparkly_test
       MYSQL_USER: root
       MYSQL_ALLOW_EMPTY_PASSWORD: "yes"
     healthcheck:
-      test: ps ax | grep mysql
+      test: ["CMD", "mysqladmin", "ping", "-h", "localhost"]
 
   kafka.docker:
-    image: confluentinc/cp-kafka:6.2.1
+    image: confluentinc/cp-kafka:7.3.0
     depends_on:
       zookeeper.docker:
         condition: service_healthy
@@ -94,11 +88,11 @@
       test: ps ax | grep kafka
 
   redis.docker:
-    image: redis:3.2.4
+    image: redis:7.0
     expose:
-      - 6379
+      - "6379"
     healthcheck:
-      test: ps ax | grep redis
+      test: ["CMD", "redis-cli", "ping"]
 
   zookeeper.docker:
     image: confluent/zookeeper
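The new healthchecks probe each service the way a real client would, instead of grepping the process table. From the test environment, the redis check is equivalent to this sketch (assuming the redis client from requirements_extras.txt and the compose service name as hostname):

    import redis

    # redis-cli ping over the client library: True once the server is up.
    assert redis.StrictRedis(host='redis.docker', port=6379).ping()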
1 change: 0 additions & 1 deletion requirements.txt
@@ -15,4 +15,3 @@
 #
 
 pylru==1.0.9
-six>=1.10.0,<2.0
2 changes: 1 addition & 1 deletion requirements_extras.txt
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 
-cassandra-driver==3.7.1
+cassandra-driver==3.25.0
 PyMySQL==0.9.3
 kafka-python==2.0.2
 redis==2.10.5
12 changes: 10 additions & 2 deletions setup.py
@@ -59,7 +59,7 @@

# See https://pypi.python.org/pypi?%3Aaction=list_classifiers
classifiers=[
'Development Status :: 3 - Alpha',
'Development Status :: 5 - Production/Stable',

# Indicate who your project is intended for
'Intended Audience :: Developers',
@@ -72,6 +72,8 @@
         # that you indicate whether you support Python 2, Python 3 or both.
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.10',
     ],

# What does your project relate to?
@@ -90,6 +92,12 @@
     install_requires=requirements,
     extras_require={
         'redis': ['redis>=2.10,<3', 'ujson>=1.33,<2'],
-        'test': ['cassandra-driver>=3.7,<3.8', 'PyMySQL>=0.7,<0.10', 'kafka-python>=2.0.2,<2.1', 'redis>=2.10,<3', 'ujson>=1.33,<2'],
+        'test': [
+            'cassandra-driver>=3.25,<3.26',
+            'PyMySQL>=0.7,<0.10',
+            'kafka-python>=2.0.2,<2.1',
+            'redis>=2.10,<3',
+            'ujson>=1.33,<2',
+        ],
     },
 )
6 changes: 3 additions & 3 deletions sparkly/catalog.py
@@ -234,15 +234,15 @@ def get_database_properties(self, db_name):
"""
properties = (
self._spark.sql('DESCRIBE DATABASE EXTENDED {}'.format(db_name))
.where(F.col('database_description_item') == 'Properties')
.select('database_description_value')
.where(F.col('info_name') == 'Properties')
.select('info_value')
.first()
)

parsed_properties = {}

if properties:
for name, value in read_db_properties_format(properties.database_description_value):
for name, value in read_db_properties_format(properties.info_value):
parsed_properties[name] = value

return parsed_properties
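Context for this hunk: Spark 3.x renamed the output columns of DESCRIBE DATABASE EXTENDED from database_description_item/database_description_value to info_name/info_value, which is all the code above tracks. A quick way to double-check the column names on whatever Spark version you run (a sketch against an existing session):

    # On Spark 3.3 this prints `info_name` and `info_value`;
    # on Spark 2.x it printed the old database_description_* names.
    spark.sql('DESCRIBE DATABASE EXTENDED default').printSchema()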
3 changes: 1 addition & 2 deletions sparkly/functions.py
@@ -17,7 +17,6 @@
 from collections import defaultdict
 from functools import reduce
 import operator
-from six import string_types
 
 from pyspark.sql import Column
 from pyspark.sql import functions as F
@@ -181,7 +180,7 @@ def argmax(field, by, condition=None):
     if not isinstance(by, list):
         by = [by]
 
-    if isinstance(field, string_types):
+    if isinstance(field, str):
         field = F.col(field)
 
     by.append(field.alias('__tmp_argmax__'))
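With Python 2 gone, a plain isinstance(field, str) covers every string argmax accepts, so six.string_types has no remaining job. Call sites are unaffected; typical usage stays along these lines (DataFrame and column names here are illustrative):

    from sparkly.functions import argmax

    # For each user, take the email from the row with the greatest updated_at.
    df.groupBy('user_id').agg(argmax('email', 'updated_at').alias('latest_email'))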
2 changes: 1 addition & 1 deletion sparkly/session.py
@@ -242,7 +242,7 @@ def _setup_udfs(self):
             if isinstance(defn, str):
                 self.sql('create temporary function {} as "{}"'.format(name, defn))
             elif isinstance(defn, tuple):
-                self.catalog.registerFunction(name, *defn)
+                self.udf.register(name, *defn)
             else:
                 raise NotImplementedError('Incorrect UDF definition: {}: {}'.format(name, defn))

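Background for this one-liner: the old catalog.registerFunction alias is gone in recent PySpark, and spark.udf.register is the supported equivalent with the same (name, f, returnType) shape, so tuple-style definitions unpack unchanged. A sketch of what _setup_udfs does for such a tuple:

    from pyspark.sql.types import StringType

    # Mirrors the tuple entry used in the test session below:
    # udfs = {'length_of_text': (lambda text: len(text), StringType())}
    spark.udf.register('length_of_text', lambda text: len(text), StringType())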
2 changes: 1 addition & 1 deletion sparkly/writer.py
@@ -50,7 +50,7 @@ class SparklyWriter(object):
"""
def __init__(self, df):
self._df = df
self._spark = df.sql_ctx.sparkSession
self._spark = df.sparkSession

def by_url(self, url):
"""Write a dataframe to a destination specified by `url`.
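Same modernization as above: DataFrame.sql_ctx is deprecated as of Spark 3.3, and DataFrame.sparkSession is the supported way to reach the owning session, e.g.:

    df = spark.range(1)
    session = df.sparkSession  # the session that created this frame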
10 changes: 5 additions & 5 deletions tests/integration/base.py
@@ -24,10 +24,10 @@

 class SparklyTestSession(SparklySession):
     packages = [
-        'com.datastax.spark:spark-cassandra-connector_2.11:2.4.0',
-        'org.elasticsearch:elasticsearch-spark-20_2.11:7.3.0',
-        'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0',
-        'mysql:mysql-connector-java:6.0.6',
+        'com.datastax.spark:spark-cassandra-connector_2.12:3.2.0',
+        'org.elasticsearch:elasticsearch-spark-30_2.12:7.17.8',
+        'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1',
+        'mysql:mysql-connector-java:8.0.31',
         'io.confluent:kafka-avro-serializer:3.0.1',
     ]

@@ -40,7 +40,7 @@ class SparklyTestSession(SparklySession):
     ]
 
     udfs = {
-        'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF',
+        'collect': 'brickhouse.udf.collect.CollectUDAF',
         'length_of_text': (lambda text: len(text), StringType())
     }

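Taken together with _setup_udfs above, both UDF styles in this test session end up callable from Spark SQL: the string entry is registered through Hive's CREATE TEMPORARY FUNCTION, the tuple entry through spark.udf.register. A sketch, assuming a registered table events with a column tags:

    spark.sql('SELECT collect(tags) FROM events')      # brickhouse Hive UDAF
    spark.sql("SELECT length_of_text('hello')")        # Python UDF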
