In [4]:
import os

import findspark

# findspark.init() has to run before anything from pyspark is imported: it
# locates the Spark installation and puts its Python libraries on sys.path.
# Calling it after the pyspark imports (as before) is too late to help.
findspark.init()

from pyspark import pandas
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, input_file_name, when, regexp_extract, lit

# Raise the pandas-on-Spark display limit to 10,000 rows.
pandas.set_option('display.max_rows', 10000)


In [2]:
# Create the local Spark session named 'councils', or reuse the active one.
spark = (
    SparkSession.builder
    .master('local')
    .appName(name='councils')
    .getOrCreate()
)

24/03/25 22:13:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Each council CSV is tagged with its council type at read time. (An earlier
# draft derived the type from input_file_name(); tagging each file explicitly
# is simpler and avoids a second pass over the column.)
COUNCILS_DIR = './resources/england_councils'


def read_councils(file_name, council_type):
    """Read one council CSV and label every row with its council type.

    Args:
        file_name: Name of the CSV file inside COUNCILS_DIR.
        council_type: Value stored in the added `council_type` column.

    Returns:
        A Spark DataFrame holding the file's columns (named from the header
        row) plus a constant `council_type` column.
    """
    return (
        spark.read.option('header', 'true')
        .csv(f'{COUNCILS_DIR}/{file_name}')
        .withColumn('council_type', lit(council_type))
    )


district_councils = read_councils('district_councils.csv', 'District Council')
london_boroughs = read_councils('london_boroughs.csv', 'London Borough')
metropolitan_districts = read_councils('metropolitan_districts.csv', 'Metropolitan District')
unitary_authorities = read_councils('unitary_authorities.csv', 'Unitary Authority')




In [None]:
# Inspect the columns read from the district councils CSV. The original cell
# ended in a dangling `.`, which is a SyntaxError on Run All.
district_councils.printSchema()

In [None]:
# Stack the four council frames into one and drop exact duplicate rows.
# NOTE(review): union() matches columns by position, not by name — confirm
# all four CSVs share the same column order.
councils_df = (
    district_councils
    .union(london_boroughs)
    .union(metropolitan_districts)
    .union(unitary_authorities)
    .distinct()
)

In [182]:
# Row count of the combined, de-duplicated councils frame.
councils_df.count()

316

In [183]:
# Property average prices: keep only the local authority (renamed to
# `council`, the join key used later) and the Nov 2019 average price,
# then drop exact duplicate rows.
prop_price_df = (
    spark.read
    .option('header', 'true')
    .csv('./resources/property_avg_price.csv')
    .selectExpr('local_authority as council', 'avg_price_nov_2019')
    .distinct()
)

In [184]:
# Number of distinct (council, avg_price_nov_2019) rows in the price data.
prop_price_df.count()

353

In [185]:
# Property sales volumes: keep only the local authority (renamed to
# `council`, the join key used later) and the Sep 2019 sales volume,
# then drop exact duplicate rows.
sales_vol_df = (
    spark.read
    .option('header', 'true')
    .csv('./resources/property_sales_volume.csv')
    .selectExpr('local_authority as council', 'sales_volume_sep_2019')
    .distinct()
)

In [186]:
# Number of distinct (council, sales_volume_sep_2019) rows in the sales data.
sales_vol_df.count()

353

In [None]:
# Attach price and sales-volume figures to every council. Left outer joins
# keep councils that have no match in either property dataset.
joined_df = (
    councils_df
    .join(prop_price_df, on='council', how='left_outer')
    .join(sales_vol_df, on='council', how='left_outer')
)

# Display every row: show() is given the frame's full row count.
total_rows = joined_df.count()
joined_df.show(total_rows)

In [73]:
# Two tiny frames keyed on `name`. "Ala" appears only on the left and "Oveys"
# only on the right, so each join type produces a visibly different result.
data1 = [("John", 25), ("Alice", 30), ("Bob", 35), ("Ala", 50)]
data2 = [("John", "New York"), ("Alice", "Los Angeles"), ("Bob", "Chicago"), ("Oveys", "Rasht")]
df1 = spark.createDataFrame(data1, ["name", "age"])
df2 = spark.createDataFrame(data2, ["name", "city"])

# Define every join variant first (nothing is displayed here).
inner_join_df = df1.join(df2, on="name", how="inner")
left_outer_join_df = df1.join(df2, on="name", how="left")
right_outer_join_df = df1.join(df2, on="name", how="right")
full_outer_join_df = df1.join(df2, on="name", how="full")
left_semi_join_df = df1.join(df2, on="name", how="left_semi")
left_anti_join_df = df1.join(df2, on="name", how="left_anti")

# Display the results in order; the semi and anti joins are shown without
# a printed heading.
for heading, joined in (
    ("INNER", inner_join_df),
    ("Left Outer", left_outer_join_df),
    ("Right Outer", right_outer_join_df),
    ("Full Outer", full_outer_join_df),
    (None, left_semi_join_df),
    (None, left_anti_join_df),
):
    if heading is not None:
        print(heading)
    joined.show()


INNER
+-----+---+-----------+
| name|age|       city|
+-----+---+-----------+
|Alice| 30|Los Angeles|
|  Bob| 35|    Chicago|
| John| 25|   New York|
+-----+---+-----------+

Left Outer
+-----+---+-----------+
| name|age|       city|
+-----+---+-----------+
| John| 25|   New York|
|Alice| 30|Los Angeles|
|  Bob| 35|    Chicago|
|  Ala| 50|       NULL|
+-----+---+-----------+

Right Outer


                                                                                

+-----+----+-----------+
| name| age|       city|
+-----+----+-----------+
| John|  25|   New York|
|Alice|  30|Los Angeles|
|  Bob|  35|    Chicago|
|Oveys|NULL|      Rasht|
+-----+----+-----------+

Full Outer
+-----+----+-----------+
| name| age|       city|
+-----+----+-----------+
|  Ala|  50|       NULL|
|Alice|  30|Los Angeles|
|  Bob|  35|    Chicago|
| John|  25|   New York|
|Oveys|NULL|      Rasht|
+-----+----+-----------+



                                                                                

+-----+---+
| name|age|
+-----+---+
|Alice| 30|
|  Bob| 35|
| John| 25|
+-----+---+

+----+---+
|name|age|
+----+---+
| Ala| 50|
+----+---+



                                                                                