Switch to py 3.10 and spark 3.3.1 (#92)
- Switch our tests to Spark 3.3.1 and Python 3.10
- Drop the `six` package because we don't support Python 2.x anymore
- Replace Travis CI with GitHub Actions
- Make our test/dev env compatible with arm64
- Reduce the number of warnings
drudim committed Dec 17, 2022
1 parent 015dbc8 commit 77e4c05
Showing 21 changed files with 197 additions and 249 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,14 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - run: make test
31 changes: 0 additions & 31 deletions .travis.yml

This file was deleted.

2 changes: 1 addition & 1 deletion Dockerfile
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 
-FROM python:3.7
+FROM python:3.10
 
 LABEL maintainer="dev@tubularlabs.com"
 
4 changes: 2 additions & 2 deletions README.rst
@@ -1,7 +1,7 @@
 Sparkly
 =======
 
-|Sparkly PyPi Version| |Sparkly Build Status| |Documentation Status|
+|Sparkly PyPi Version| |Documentation Status|
 
 Helpers & syntax sugar for PySpark. There are several features to make your life easier:

@@ -35,7 +35,7 @@ and write its content to ElasticSearch index::
         'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11',
         'org.elasticsearch:elasticsearch-spark-20_2.11:6.5.4',
     ]
 
 
 if __name__ == '__main__':
     spark = MySession()
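For orientation, the README example around this hunk boils down to a minimal sketch like the one below. The session class and package coordinates come straight from the README context above; the URLs passed to sparkly's by_url helpers are illustrative assumptions, not part of the diff:

    from sparkly import SparklySession


    class MySession(SparklySession):
        # Maven packages fetched on session start-up (versions as still
        # shown in the README context above).
        packages = [
            'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11',
            'org.elasticsearch:elasticsearch-spark-20_2.11:6.5.4',
        ]


    if __name__ == '__main__':
        spark = MySession()
        # Hypothetical end-to-end flow: read a Cassandra table and write
        # it to an Elasticsearch index via sparkly's URL-based helpers.
        df = spark.read_ext.by_url('cassandra://cassandra.docker/my_keyspace/my_table')
        df.write_ext.by_url('elastic://elastic.docker/my_index')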
24 changes: 9 additions & 15 deletions docker-compose.yml
@@ -47,17 +47,12 @@ services:
         condition: service_healthy
 
   cassandra.docker:
-    image: cassandra:2.1.13
-    mem_limit: 500M
-    memswap_limit: 600M
-    environment:
-      MAX_HEAP_SIZE: 500M
-      HEAP_NEWSIZE: 200M
+    image: cassandra:4.1
     healthcheck:
-      test: ps ax | grep cassandra
+      test: ["CMD-SHELL", "[ $$(nodetool statusgossip) = running ]"]
 
   elastic.docker:
-    image: docker.elastic.co/elasticsearch/elasticsearch:7.3.0
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.8
     environment:
       - xpack.security.enabled=false
       - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
@@ -69,16 +64,15 @@
       retries: 20
 
   mysql.docker:
-    image: mysql:5.7.11
+    image: mysql:8.0
     environment:
       MYSQL_DATABASE: sparkly_test
       MYSQL_USER: root
       MYSQL_ALLOW_EMPTY_PASSWORD: "yes"
     healthcheck:
-      test: ps ax | grep mysql
+      test: ["CMD", "mysqladmin", "ping", "-h", "localhost"]
 
   kafka.docker:
-    image: confluentinc/cp-kafka:6.2.1
+    image: confluentinc/cp-kafka:7.3.0
     depends_on:
       zookeeper.docker:
         condition: service_healthy
@@ -94,11 +88,11 @@
       test: ps ax | grep kafka
 
   redis.docker:
-    image: redis:3.2.4
+    image: redis:7.0
     expose:
-      - 6379
+      - "6379"
     healthcheck:
-      test: ps ax | grep redis
+      test: ["CMD", "redis-cli", "ping"]
 
   zookeeper.docker:
     image: confluent/zookeeper
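The new healthchecks probe each service the way a real client would, instead of grepping the process table. From the test environment, the redis check is equivalent to this sketch (assuming the redis client from requirements_extras.txt and the compose service name as hostname):

    import redis

    # redis-cli ping over the client library: True once the server is up.
    assert redis.StrictRedis(host='redis.docker', port=6379).ping()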
1 change: 0 additions & 1 deletion requirements.txt
@@ -15,4 +15,3 @@
 #
 
 pylru==1.0.9
-six>=1.10.0,<2.0
2 changes: 1 addition & 1 deletion requirements_extras.txt
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 
-cassandra-driver==3.7.1
+cassandra-driver==3.25.0
 PyMySQL==0.9.3
 kafka-python==2.0.2
 redis==2.10.5
12 changes: 10 additions & 2 deletions setup.py
@@ -59,7 +59,7 @@

# See https://pypi.python.org/pypi?%3Aaction=list_classifiers
classifiers=[
'Development Status :: 3 - Alpha',
'Development Status :: 5 - Production/Stable',

# Indicate who your project is intended for
'Intended Audience :: Developers',
@@ -72,6 +72,8 @@
         # that you indicate whether you support Python 2, Python 3 or both.
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.10',
     ],

# What does your project relate to?
@@ -90,6 +92,12 @@
     install_requires=requirements,
     extras_require={
         'redis': ['redis>=2.10,<3', 'ujson>=1.33,<2'],
-        'test': ['cassandra-driver>=3.7,<3.8', 'PyMySQL>=0.7,<0.10', 'kafka-python>=2.0.2,<2.1', 'redis>=2.10,<3', 'ujson>=1.33,<2'],
+        'test': [
+            'cassandra-driver>=3.25,<3.26',
+            'PyMySQL>=0.7,<0.10',
+            'kafka-python>=2.0.2,<2.1',
+            'redis>=2.10,<3',
+            'ujson>=1.33,<2',
+        ],
     },
 )
6 changes: 3 additions & 3 deletions sparkly/catalog.py
@@ -234,15 +234,15 @@ def get_database_properties(self, db_name):
"""
properties = (
self._spark.sql('DESCRIBE DATABASE EXTENDED {}'.format(db_name))
.where(F.col('database_description_item') == 'Properties')
.select('database_description_value')
.where(F.col('info_name') == 'Properties')
.select('info_value')
.first()
)

parsed_properties = {}

if properties:
for name, value in read_db_properties_format(properties.database_description_value):
for name, value in read_db_properties_format(properties.info_value):
parsed_properties[name] = value

return parsed_properties
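Context for this hunk: Spark 3.x renamed the output columns of DESCRIBE DATABASE EXTENDED from database_description_item/database_description_value to info_name/info_value, which is all the code above tracks. A quick way to double-check the column names on whatever Spark version you run (a sketch against an existing session):

    # On Spark 3.3 this prints `info_name` and `info_value`;
    # on Spark 2.x it printed the old database_description_* names.
    spark.sql('DESCRIBE DATABASE EXTENDED default').printSchema()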
3 changes: 1 addition & 2 deletions sparkly/functions.py
@@ -17,7 +17,6 @@
 from collections import defaultdict
 from functools import reduce
 import operator
-from six import string_types
 
 from pyspark.sql import Column
 from pyspark.sql import functions as F
@@ -181,7 +180,7 @@ def argmax(field, by, condition=None):
     if not isinstance(by, list):
         by = [by]
 
-    if isinstance(field, string_types):
+    if isinstance(field, str):
         field = F.col(field)
 
     by.append(field.alias('__tmp_argmax__'))
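With Python 2 gone, a plain isinstance(field, str) covers every string argmax accepts, so six.string_types has no remaining job. Call sites are unaffected; typical usage stays along these lines (DataFrame and column names here are illustrative):

    from sparkly.functions import argmax

    # For each user, take the email from the row with the greatest updated_at.
    df.groupBy('user_id').agg(argmax('email', 'updated_at').alias('latest_email'))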
2 changes: 1 addition & 1 deletion sparkly/session.py
@@ -242,7 +242,7 @@ def _setup_udfs(self):
             if isinstance(defn, str):
                 self.sql('create temporary function {} as "{}"'.format(name, defn))
             elif isinstance(defn, tuple):
-                self.catalog.registerFunction(name, *defn)
+                self.udf.register(name, *defn)
             else:
                 raise NotImplementedError('Incorrect UDF definition: {}: {}'.format(name, defn))

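Background for this one-liner: the old catalog.registerFunction alias is gone in recent PySpark, and spark.udf.register is the supported equivalent with the same (name, f, returnType) shape, so tuple-style definitions unpack unchanged. A sketch of what _setup_udfs does for such a tuple:

    from pyspark.sql.types import StringType

    # Mirrors the tuple entry used in the test session below:
    # udfs = {'length_of_text': (lambda text: len(text), StringType())}
    spark.udf.register('length_of_text', lambda text: len(text), StringType())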
2 changes: 1 addition & 1 deletion sparkly/writer.py
@@ -50,7 +50,7 @@ class SparklyWriter(object):
"""
def __init__(self, df):
self._df = df
self._spark = df.sql_ctx.sparkSession
self._spark = df.sparkSession

def by_url(self, url):
"""Write a dataframe to a destination specified by `url`.
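Same modernization as above: DataFrame.sql_ctx is deprecated as of Spark 3.3, and DataFrame.sparkSession is the supported way to reach the owning session, e.g.:

    df = spark.range(1)
    session = df.sparkSession  # the session that created this frame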
10 changes: 5 additions & 5 deletions tests/integration/base.py
@@ -24,10 +24,10 @@

 class SparklyTestSession(SparklySession):
     packages = [
-        'com.datastax.spark:spark-cassandra-connector_2.11:2.4.0',
-        'org.elasticsearch:elasticsearch-spark-20_2.11:7.3.0',
-        'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0',
-        'mysql:mysql-connector-java:6.0.6',
+        'com.datastax.spark:spark-cassandra-connector_2.12:3.2.0',
+        'org.elasticsearch:elasticsearch-spark-30_2.12:7.17.8',
+        'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1',
+        'mysql:mysql-connector-java:8.0.31',
         'io.confluent:kafka-avro-serializer:3.0.1',
     ]

@@ -40,7 +40,7 @@ class SparklyTestSession(SparklySession):
     ]
 
     udfs = {
-        'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF',
+        'collect': 'brickhouse.udf.collect.CollectUDAF',
         'length_of_text': (lambda text: len(text), StringType())
     }

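Taken together with _setup_udfs above, both UDF styles in this test session end up callable from Spark SQL: the string entry is registered through Hive's CREATE TEMPORARY FUNCTION, the tuple entry through spark.udf.register. A sketch, assuming a registered table events with a column tags:

    spark.sql('SELECT collect(tags) FROM events')      # brickhouse Hive UDAF
    spark.sql("SELECT length_of_text('hello')")        # Python UDF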
