In [1]:
import os

import findspark
findspark.init()

from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as psf
from pyspark.sql.utils import AnalysisException

# Extra arguments for spark-submit: request the Databricks spark-xml package.
# NOTE(review): presumably this has to be in the environment before the Spark
# session is created in the next cell -- confirm.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell',
})

In [2]:
# Get the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()
# Set the SQL session time zone to UTC.
spark.conf.set('spark.sql.session.timeZone', 'UTC')
# Handle on the SparkContext behind the session.
sc = spark.sparkContext
# Legacy SQLContext wrapped around the same SparkContext; not referenced by
# any of the cells visible in this part of the notebook.
sqlContext = SQLContext(sc)

In [3]:
# Paths used to read and write data. Both end with '/' because the cells
# below build full paths by plain string concatenation.
# Root folder: the merged output is written directly under it.
parquets_dir = 'parquets/'
# Sub-folder of parquets_dir whose entries are loaded and merged.
data_dir = 'parquetsmarks/'

In [4]:
# Load every entry of the input folder as its own DataFrame; the resulting
# list is what gets unified further down.
# ".DS_Store" (the metadata file macOS Finder leaves in folders) is skipped
# because it is not parquet data.
df_list = [
    spark.read.parquet(parquets_dir + data_dir + entry)
    for entry in os.listdir(parquets_dir + data_dir)
    if entry != ".DS_Store"
]

In [5]:
def unionAll(*dfs):
    """Return the union of all the given DataFrames.

    The inputs' underlying RDDs are concatenated and a new DataFrame is built
    from the result using the first input's schema. Nothing here verifies that
    the other inputs actually share that schema.

    NOTE: the next cell of this notebook defines another ``unionAll`` that
    takes a single list argument; once that cell runs, it shadows this one.

    Args:
        *dfs: One or more DataFrames, passed as separate positional arguments.

    Returns:
        A new DataFrame created through the first input's ``sql_ctx``.

    Raises:
        IndexError: If called without any argument.
    """
    # Only the first DataFrame is needed on its own: it supplies both the SQL
    # context and the schema used to rebuild the result.
    first = dfs[0]
    return first.sql_ctx.createDataFrame(
        first.sql_ctx._sc.union([df.rdd for df in dfs]),
        first.schema
    )

In [6]:
def unionAll(df_list):
    """Union a non-empty list of DataFrames into a single DataFrame.

    The DataFrames are combined left to right, in list order, with
    ``DataFrame.union`` (``DataFrame.unionAll`` is the legacy spelling of the
    same operation).

    NOTE: this redefines the ``unionAll(*dfs)`` helper from the previous cell;
    after this cell runs, only this list-based version is callable.

    Args:
        df_list: List of DataFrames to combine; must contain at least one.

    Returns:
        The single element itself when the list has one entry, otherwise the
        DataFrame produced by chaining ``union`` over the list.

    Raises:
        IndexError: If ``df_list`` is empty.
    """
    if not df_list:
        # Same exception type the bare df_list[0] lookup used to raise, but
        # with a message that names the actual problem.
        raise IndexError("unionAll() needs at least one DataFrame, got an empty list")
    df_union = df_list[0]
    for df in df_list[1:]:
        df_union = df_union.union(df)
    return df_union

In [7]:
# Merge every DataFrame in df_list into the single DataFrame that is written out below.
df_to_write = unionAll(df_list)

In [8]:
# Sanity check: number of rows in the unified DataFrame before it is written.
df_to_write.count()

4275

In [9]:
# Persist the unified DataFrame as parquet under parquets_dir, replacing any
# output left there by a previous run.
FILENAME = 'films2'
df_to_write.write.parquet(parquets_dir + FILENAME + '.parquet', mode='overwrite')

In [10]:
# Read the parquet that was just written back into a new DataFrame to verify the write.
df_test = spark.read.parquet(parquets_dir + FILENAME + '.parquet' )

In [11]:
# Row count of the re-read data; it should equal the count of df_to_write taken before writing.
df_test.count()

4275

In [12]:
# Preview the re-read data (show() prints the first 20 rows by default and truncates long cell values).
df_test.show()

+---------+-------------+----+------+------------------+--------------------+
|   tconst|num_subtitles|year|blocks|     subtitle_mins|           subtitles|
+---------+-------------+----+------+------------------+--------------------+
|tt5275892|         6236|2016|  1829|            406.05|[[As, a, kid, gro...|
|tt2318527|         7083|2013|  2793| 32.46666666666667|[[[, BELL, TOLLIN...|
|tt2234155|         3562|2013|  3348|63.166666666666664|[[#, An, old, man...|
|tt2404463|         4078|2013|  3356| 60.11666666666667|[[Cleaned, correc...|
|tt1398426|         3729|2015|  2768| 86.06666666666666|[[[, Police, Radi...|
|tt3311384|         1993|2013|  2116|59.016666666666666|[[(, CROWD, CHEER...|
|tt2080374|         2889|2015|  2309|             62.15|[[MAN, #, 1, :], ...|
|tt4257858|         1553|2015|  2415|              58.8|[[We, will, begin...|
|tt4540710|         2018|2016|  1668| 67.23333333333333|[[Subtitle, made,...|
|tt3152624|         3926|2015|  2508| 62.78333333333333|[[Girls,