In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("Bucket Join")
    .master("local[*]")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [2]:
# Load both sides of the join. `df2` must be defined in this cell: the
# broadcast join and the left join in the following cells both reference it,
# and with the read commented out a fresh-kernel "Run All" fails with NameError.
df1 = spark.read.csv("/opt/data/ac_subscriber.csv", header=True, inferSchema=True)
# NOTE(review): path restored from the previously commented-out line; confirm
# that ac_acct.csv is the intended right-hand dataset.
df2 = spark.read.csv("/opt/data/ac_acct.csv", header=True, inferSchema=True)



In [3]:
from pyspark.sql.functions import broadcast

# Inner join on the composite key, explicitly hinting that df2 should be
# broadcast, then print the extended (logical + physical) plans.
joined_broadcast = df1.join(
    broadcast(df2),
    on=["subscriber_id", "acct_id"],
    how="inner",
)
joined_broadcast.explain(extended=True)


== Parsed Logical Plan ==
'Join UsingJoin(Inner, [subscriber_id, acct_id])
:- Relation [company_id#17,subscriber_id#18L,acct_id#19L,acct_type#20,profile_acct#21,guarantor_id#22,profile_id#23,activation_date#24,first_inv_date#25,account_desc#26,pin#27,secret_question#28,active_services#29,num_active_srv#30,num_tempdeact_srv#31,num_deact_srv#32,created_by_session#33L,modify_by_session#34,logically_deleted#35,row_status#36,row_created_time#37,row_modify_time#38] csv
+- ResolvedHint (strategy=broadcast)
   +- Relation [company_id#78,subscriber_id#79L,acct_id#80L,acct_type#81,profile_acct#82,guarantor_id#83,profile_id#84,activation_date#85,first_inv_date#86,account_desc#87,pin#88,secret_question#89,active_services#90,num_active_srv#91,num_tempdeact_srv#92,num_deact_srv#93,created_by_session#94L,modify_by_session#95,logically_deleted#96,row_status#97,row_created_time#98,row_modify_time#99] csv

== Analyzed Logical Plan ==
subscriber_id: bigint, acct_id: bigint, company_id: int, acct_type: st

In [4]:
# Same composite-key join without a broadcast hint, as a left outer join;
# print the extended plans for comparison with the hinted version.
joined = df1.join(
    df2,
    on=["subscriber_id", "acct_id"],
    how="left",
)
joined.explain(extended=True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [subscriber_id, acct_id])
:- Relation [company_id#17,subscriber_id#18L,acct_id#19L,acct_type#20,profile_acct#21,guarantor_id#22,profile_id#23,activation_date#24,first_inv_date#25,account_desc#26,pin#27,secret_question#28,active_services#29,num_active_srv#30,num_tempdeact_srv#31,num_deact_srv#32,created_by_session#33L,modify_by_session#34,logically_deleted#35,row_status#36,row_created_time#37,row_modify_time#38] csv
+- Relation [company_id#78,subscriber_id#79L,acct_id#80L,acct_type#81,profile_acct#82,guarantor_id#83,profile_id#84,activation_date#85,first_inv_date#86,account_desc#87,pin#88,secret_question#89,active_services#90,num_active_srv#91,num_tempdeact_srv#92,num_deact_srv#93,created_by_session#94L,modify_by_session#95,logically_deleted#96,row_status#97,row_created_time#98,row_modify_time#99] csv

== Analyzed Logical Plan ==
subscriber_id: bigint, acct_id: bigint, company_id: int, acct_type: string, profile_acct: string, guaranto

In [5]:
from pyspark.sql import functions as F

def skew_ratio(df, col):
    """Summarise how unevenly the values of one column are distributed.

    Groups ``df`` by ``col``, counts the rows in each group, and compares the
    largest group with the average group size.

    Args:
        df: Spark DataFrame to profile.
        col: Name of the column to group by.

    Returns:
        A dict with the keys ``column``, ``unique_values`` (number of groups),
        ``max_count``, ``avg_count`` and ``skew_ratio`` (``max_count /
        avg_count`` rounded to 2 decimals). When there are no groups,
        ``skew_ratio`` is 0 and ``max_count`` / ``avg_count`` are returned as
        produced by the aggregation.
    """
    counts = df.groupBy(col).count()
    stats = counts.agg(
        F.max("count").alias("max_count"),
        F.avg("count").alias("avg_count"),
        F.count("count").alias("unique_values")
    ).collect()[0]

    max_count = stats["max_count"]
    avg_count = stats["avg_count"]

    # With no groups (empty DataFrame) max/avg aggregate to NULL, which arrives
    # here as None; comparing None with 0 would raise TypeError, so treat that
    # case as "no skew" instead.
    if max_count is None or avg_count is None or avg_count <= 0:
        skew = 0
    else:
        skew = max_count / avg_count

    return {
        "column": col,
        "unique_values": stats["unique_values"],
        "max_count": max_count,
        "avg_count": avg_count,
        "skew_ratio": round(skew, 2)
    }

# Profile every column of df1 (one skew summary dict per column).
results = [skew_ratio(df1, column_name) for column_name in df1.columns]

# Print one summary per line.
for summary in results:
    print(summary)

{'column': 'company_id', 'unique_values': 2, 'max_count': 125476, 'avg_count': 62791.0, 'skew_ratio': 2.0}
{'column': 'subscriber_id', 'unique_values': 101403, 'max_count': 290, 'avg_count': 1.2384446219539855, 'skew_ratio': 234.16}
{'column': 'acct_id', 'unique_values': 125582, 'max_count': 1, 'avg_count': 1.0, 'skew_ratio': 1.0}
{'column': 'acct_type', 'unique_values': 2, 'max_count': 106912, 'avg_count': 62791.0, 'skew_ratio': 1.7}
{'column': 'profile_acct', 'unique_values': 2, 'max_count': 101048, 'avg_count': 62791.0, 'skew_ratio': 1.61}
{'column': 'guarantor_id', 'unique_values': 100768, 'max_count': 763, 'avg_count': 1.2462488091457606, 'skew_ratio': 612.24}
{'column': 'profile_id', 'unique_values': 101049, 'max_count': 763, 'avg_count': 1.242783204188067, 'skew_ratio': 613.94}
{'column': 'activation_date', 'unique_values': 6270, 'max_count': 275, 'avg_count': 20.02902711323764, 'skew_ratio': 13.73}
{'column': 'first_inv_date', 'unique_values': 3175, 'max_count': 3388, 'avg_coun

In [4]:
from pyspark.sql import functions as F

# Count rows per distinct `pin` value and preview the result.
grouped = (
    df1
    .groupBy("pin")
    .agg(F.count("*").alias("cnt"))
)
grouped.show()

+----+---+
| pin|cnt|
+----+---+
|5925|  2|
|1436|  2|
| 467|  1|
|9586|  1|
|9009|  1|
|2069|  1|
|2088|  4|
|7252|  1|
| 829|  2|
|7743|  2|
|2464|  2|
|3517|  2|
|5067|  1|
|3858|  2|
|2110|  3|
|7655|  2|
|9975|  1|
|8306|  1|
|1746|  2|
| 919|  1|
+----+---+
only showing top 20 rows

