In [79]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("GoT").getOrCreate()

In [80]:
path1 = '/Users/romanov.ana/PycharmProjects/spark/resources/1_df_files/subtitles_s1.json'
path2 = '/Users/romanov.ana/PycharmProjects/spark/resources/1_df_files/subtitles_s2.json'

In [81]:
df1 = spark.read.option("inferSchema", True).csv(path1, sep='\u0001')
df1 = df1.withColumnRenamed("_c0", "data")
df2 = spark.read.option("inferSchema", True).csv(path2, sep='\u0001')
df2 = df2.withColumnRenamed("_c0", "data")

In [82]:
df_exploded_1 = df1.withColumn("data", F.split(F.col("data"), '\\W+')).select(F.explode("data").alias("element"))
df_exploded_2 = df2.withColumn("data", F.split(F.col("data"), '\\W+')).select(F.explode("data").alias("element"))

In [87]:
from pyspark.sql import DataFrame


def custom_filter(df: DataFrame, limit = 10) -> DataFrame:
    return df.groupBy(F.col("element")) \
    .count() \
    .filter((F.col("element") != '') & F.col("element").try_cast(IntegerType()).isNull()) \
    .orderBy(F.col("count").desc()).limit(limit)

In [90]:
word_cnt_df1 = custom_filter(df_exploded_1).withColumn("index", F.monotonically_increasing_id())
word_cnt_df2 = custom_filter(df_exploded_1).withColumn("index", F.monotonically_increasing_id())

In [91]:
result_df = word_cnt_df1.join(word_cnt_df2, word_cnt_df1.index == word_cnt_df2.index, how='full')
result_df.show(truncate=False)

+-------+-----+-----+-------+-----+-----+
|element|count|index|element|count|index|
+-------+-----+-----+-------+-----+-----+
|the    |1684 |0    |the    |1684 |0    |
|I      |1544 |1    |I      |1544 |1    |
|you    |1365 |2    |you    |1365 |2    |
|to     |1154 |3    |to     |1154 |3    |
|a      |892  |4    |a      |892  |4    |
|s      |758  |5    |s      |758  |5    |
|of     |731  |6    |of     |731  |6    |
|and    |624  |7    |and    |624  |7    |
|my     |507  |8    |my     |507  |8    |
|t      |488  |9    |t      |488  |9    |
+-------+-----+-----+-------+-----+-----+

