<a href="https://colab.research.google.com/github/RotemBorenstein/Pyspark-Databricks-Project/blob/main/project1_part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import split, array_contains, col, expr, to_date, dayofweek, when, countDistinct, desc, count, avg, lower, explode, max, collect_list, lit
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
import re
import pyspark.sql.functions as F


# Get (or create) the Spark session for this notebook and expose its SparkContext.
spark = (
    SparkSession.builder
    .appName("my_project_part_2")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# Read a CSV into a dataframe
# There is a smarter version, that will first check if there is a Parquet file and use it
def load_csv_file(filename, schema):
    """Read one of the known header-less CSV datasets into a DataFrame.

    Args:
        filename: Logical dataset name; must be a key of the lookup table below.
        schema: Schema passed to the Spark CSV reader.

    Returns:
        The loaded DataFrame, or None (after printing a message) when
        ``filename`` is not a known dataset.
    """
    # logical name -> (path component under the data mount, field delimiter)
    allowed_files = {'Daily program data': ('Daily program data', "|"),
                     'demographic': ('demographic', "|")}

    entry = allowed_files.get(filename)
    if entry is None:
        print(f'You were trying to access unknown file \"{filename}\". Only valid options are {allowed_files.keys()}')
        return None

    filepath, delimiter = entry
    dataPath = f"dbfs:/mnt/coursedata2024/fwm-stb-data/{filepath}"

    # The files carry no header row, so column names and types come from `schema`.
    reader = spark.read.format("csv")
    reader = reader.option("header", "false")
    reader = reader.option("delimiter", delimiter)
    reader = reader.schema(schema)
    return reader.load(dataPath)

# This dict holds the correct schemata for easily loading the CSVs
def _make_schema(columns):
    """Build a StructType from (column name, Spark type class) pairs, in order."""
    return StructType([StructField(name, dtype()) for name, dtype in columns])


schemas_dict = {
    'Daily program data': _make_schema([
        ('prog_code', StringType),
        ('title', StringType),
        ('genre', StringType),
        ('air_date', StringType),
        ('air_time', StringType),
        ('Duration', FloatType),
    ]),
    'viewing': _make_schema([
        ('device_id', StringType),
        ('event_date', StringType),
        ('event_time', IntegerType),
        ('mso_code', StringType),
        ('prog_code', StringType),
        ('station_num', StringType),
    ]),
    'viewing_full': _make_schema([
        ('mso_code', StringType),
        ('device_id', StringType),
        ('event_date', IntegerType),
        ('event_time', IntegerType),
        ('station_num', StringType),
        ('prog_code', StringType),
    ]),
    'demographic': _make_schema([
        ('household_id', StringType),
        ('household_size', IntegerType),
        ('num_adults', IntegerType),
        ('num_generations', IntegerType),
        ('adult_range', StringType),
        ('marital_status', StringType),
        ('race_code', StringType),
        ('presence_children', StringType),
        ('num_children', IntegerType),
        ('age_children', StringType),  # format like range - 'bitwise'
        ('age_range_children', StringType),
        ('dwelling_type', StringType),
        ('home_owner_status', StringType),
        ('length_residence', IntegerType),
        ('home_market_value', StringType),
        ('num_vehicles', IntegerType),
        ('vehicle_make', StringType),
        ('vehicle_model', StringType),
        ('vehicle_year', IntegerType),
        ('net_worth', IntegerType),
        ('income', StringType),
        ('gender_individual', StringType),
        ('age_individual', IntegerType),
        ('education_highest', StringType),
        ('occupation_highest', StringType),
        ('education_1', StringType),
        ('occupation_1', StringType),
        ('age_2', IntegerType),
        ('education_2', StringType),
        ('occupation_2', StringType),
        ('age_3', IntegerType),
        ('education_3', StringType),
        ('occupation_3', StringType),
        ('age_4', IntegerType),
        ('education_4', StringType),
        ('occupation_4', StringType),
        ('age_5', IntegerType),
        ('education_5', StringType),
        ('occupation_5', StringType),
        ('polit_party_regist', StringType),
        ('polit_party_input', StringType),
        ('household_clusters', StringType),
        ('insurance_groups', StringType),
        ('financial_groups', StringType),
        ('green_living', StringType),
    ]),
}

In [None]:
# load Demographic Data
demo_df = load_csv_file('demographic', schemas_dict['demographic'])

# income values "A".."D" are replaced by the numbers below; any other value is kept
mapping = {'A' : 10, 'B': 11, 'C': 12, 'D': 13}

# build the WHEN ... WHEN ... OTHERWISE expression by walking the mapping in order
income_expr = None
for income_letter, income_code in mapping.items():
    is_letter = col("income") == income_letter
    if income_expr is None:
        income_expr = when(is_letter, income_code)
    else:
        income_expr = income_expr.when(is_letter, income_code)
income_expr = income_expr.otherwise(col("income"))

demo_df = demo_df.withColumn("income", income_expr)

# the column was read as a string; turn it into an integer column
demo_df = demo_df.withColumn("income", col("income").cast("int"))


In [None]:
# load Daily program data
daily_prog_df = load_csv_file('Daily program data', schemas_dict['Daily program data'])

# convert genre to array of strings by splitting on a comma plus optional whitespace.
# The pattern is a raw string so that `\s` reaches the regex engine as written
# instead of being an invalid escape sequence in a normal Python string.
daily_prog_df = daily_prog_df.withColumn("genre", split(col("genre"), r",\s*"))


In [None]:
# load program viewing data (comma-separated, first row is a header)
dataPath = "dbfs:/viewing_10M"
viewing10m_df = (
    spark.read.format("csv")
    .options(header="true", delimiter=",")
    .schema(schemas_dict['viewing_full'])
    .load(dataPath)
)


In [None]:
# load reference data (stored as Parquet)
ref_df = spark.read.format("parquet").load('dbfs:/refxml_new_parquet')

### part B.1

In [None]:
# clean data: keep only the needed columns, drop rows with a null in any of
# them, then remove duplicate rows
daily_prog_df_cleaned_B1 = (
    daily_prog_df
    .select("genre", "prog_code")
    .dropna(subset=["genre", "prog_code"])
    .dropDuplicates(["genre", "prog_code"])
)
ref_df_cleaned_B1 = (
    ref_df
    .select("device_id", "dma")
    .dropna(subset=["device_id", "dma"])
    .dropDuplicates(["device_id", "dma"])
)
viewing10m_df_cleaned_B1 = (
    viewing10m_df
    .select("device_id", "prog_code")
    .dropna(subset=["device_id", "prog_code"])
    .dropDuplicates(["device_id", "prog_code"])
)

# explode the 'genre' column: one row per (genre, prog_code)
exploded_daily_prog_df = daily_prog_df_cleaned_B1.withColumn("genre", explode("genre"))

# join and select necessary columns (devices whose DMA is "Unknown" are left out)
ref_df_cleaned_B1_filtered = ref_df_cleaned_B1.where(col("dma") != "Unknown")
ref_and_viewing = (
    ref_df_cleaned_B1_filtered
    .join(viewing10m_df_cleaned_B1, "device_id")
    .select("device_id", "prog_code", "dma")
)
ref_viewing_daily = (
    ref_and_viewing
    .join(exploded_daily_prog_df, "prog_code")
    .select("device_id", "prog_code", "dma", "genre")
)

# mark ref_viewing_daily for caching so later actions can reuse it
ref_viewing_daily.cache()

# get the top 10 DMAs by device amount
largest_dma = (
    ref_df_cleaned_B1_filtered
    .groupBy("dma")
    .agg(count("device_id").alias("device_amount"))
    .orderBy(desc("device_amount"))
    .limit(10)
    .select("dma")
)

# filter ref_viewing_daily to only include records with DMAs in largest_dma
ref_viewing_daily_largest_dma = ref_viewing_daily.join(largest_dma, "dma")

# calculate the viewing entries per genre for each DMA, most viewed first
dma_populatity = (
    ref_viewing_daily_largest_dma
    .groupBy("dma", "genre")
    .agg(count("prog_code").alias("viewing_entries"))
    .orderBy(desc("viewing_entries"))
)


In [None]:
# get the top DMAs as a Python list, largest first.
# largest_dma comes from a groupBy on "dma", so every DMA already appears once,
# and it is sorted by device count before the limit. Collect it directly:
# calling .distinct() here would shuffle the rows and lose that ordering, which
# the later dma_list[0] / [4] / [8] lookups (1st, 5th, 9th largest) rely on.
distinct_dmas = largest_dma.collect()
dma_list = [row["dma"] for row in distinct_dmas]

def clean_dma_name(dma):
    """Turn a DMA name into a token usable in a file name.

    Each whitespace character becomes an underscore; every remaining character
    other than ASCII letters, digits and underscores is removed.
    """
    with_underscores = re.sub(r"\s", "_", dma)
    return re.sub(r'[^A-Za-z0-9_]', '', with_underscores)

# write one CSV (with header, overwriting earlier output) per DMA in dma_list,
# holding that DMA's rows of dma_populatity
for dma in dma_list:
    file_name = "project1_part21_" + clean_dma_name(dma) + "_314689498_211620570.csv"
    rows_for_dma = dma_populatity.where(col("dma") == dma)
    rows_for_dma.write.mode("overwrite").option("header", True).csv(file_name)
    print(f"Saved DataFrame for DMA: {dma} to {file_name}")

# release the cached ref_viewing_daily
ref_viewing_daily.unpersist()

Saved DataFrame for DMA: Charleston-Huntington to project1_part21_CharlestonHuntington_314689498_211620570.csv
Saved DataFrame for DMA: Wilkes Barre-Scranton-Hztn to project1_part21_Wilkes_BarreScrantonHztn_314689498_211620570.csv
Saved DataFrame for DMA: Seattle-Tacoma to project1_part21_SeattleTacoma_314689498_211620570.csv
Saved DataFrame for DMA: Toledo to project1_part21_Toledo_314689498_211620570.csv
Saved DataFrame for DMA: Little Rock-Pine Bluff to project1_part21_Little_RockPine_Bluff_314689498_211620570.csv
Saved DataFrame for DMA: Amarillo to project1_part21_Amarillo_314689498_211620570.csv
Saved DataFrame for DMA: Bend, OR to project1_part21_Bend_OR_314689498_211620570.csv
Saved DataFrame for DMA: Greenville-N.Bern-Washngtn to project1_part21_GreenvilleNBernWashngtn_314689498_211620570.csv
Saved DataFrame for DMA: Washington, DC (Hagrstwn) to project1_part21_Washington_DC_Hagrstwn_314689498_211620570.csv
Saved DataFrame for DMA: Houston to project1_part21_Houston_314689498_

DataFrame[device_id: string, prog_code: string, dma: string, genre: string]

In [None]:
# mark dma_populatity for caching; the display calls below each query it
dma_populatity.cache()

def display_top_genres_for_dma(dma, dma_populatity, title):
    """Print a heading, then show the 10 genres with the most viewing entries for one DMA.

    Args:
        dma: DMA name to filter on.
        dma_populatity: DataFrame with "dma", "genre" and "viewing_entries" columns.
        title: Text printed in the heading before the DMA name.
    """
    print(f"# {title} - {dma}")
    rows_for_dma = dma_populatity.where(col("dma") == dma)
    ranked = rows_for_dma.orderBy(desc("viewing_entries"))
    ranked.limit(10).select("genre").show(truncate=False)

In [None]:
# Display top 10 genres for the 1st, 5th, and 9th DMAs by size
# (this cell: the first entry of dma_list)
display_top_genres_for_dma(
    dma=dma_list[0],
    dma_populatity=dma_populatity,
    title="Top 10 Genres for the 1st Largest DMA",
)

# Top 10 Genres for the 1st Largest DMA - Charleston-Huntington
+-----------+
|genre      |
+-----------+
|Reality    |
|News       |
|Sitcom     |
|Drama      |
|Talk       |
|Documentary|
|Children   |
|Adventure  |
|Comedy     |
|Animated   |
+-----------+



In [None]:
# fifth entry of dma_list
display_top_genres_for_dma(
    dma=dma_list[4],
    dma_populatity=dma_populatity,
    title="Top 10 Genres for the 5th Largest DMA",
)

# Top 10 Genres for the 5th Largest DMA - Little Rock-Pine Bluff
+-----------+
|genre      |
+-----------+
|Reality    |
|News       |
|Sitcom     |
|Talk       |
|Drama      |
|Comedy     |
|Documentary|
|Crime drama|
|Adventure  |
|Children   |
+-----------+



In [None]:
# ninth entry of dma_list
display_top_genres_for_dma(
    dma=dma_list[8],
    dma_populatity=dma_populatity,
    title="Top 10 Genres for the 9th Largest DMA",
)

# Top 10 Genres for the 9th Largest DMA - Washington, DC (Hagrstwn)
+-----------+
|genre      |
+-----------+
|Reality    |
|News       |
|Sitcom     |
|Comedy     |
|Children   |
|Drama      |
|Talk       |
|Animated   |
|Crime drama|
|Adventure  |
+-----------+



In [None]:
# part B.1 is done with dma_populatity: drop it from the cache
dma_populatity.unpersist()

DataFrame[dma: string, genre: string, viewing_entries: bigint]

### part B.2

In [None]:
# clean data: drop rows with nulls in the columns used below, then de-duplicate
demo_df_cleaned_B2 = (
    demo_df
    .dropna(subset=["income", "net_worth", "household_id"])
    .dropDuplicates(["income", "net_worth", "household_id"])
)

daily_prog_df_cleaned_B2 = (
    daily_prog_df
    .select("genre", "prog_code")
    .dropna(subset=["genre", "prog_code"])
    .dropDuplicates(["genre", "prog_code"])
)

# note: duplicates are judged on (device_id, dma) only, not on household_id
ref_df_cleaned_B2 = (
    ref_df
    .select("device_id", "household_id", "dma")
    .dropna(subset=["device_id", "household_id", "dma"])
    .dropDuplicates(["device_id", "dma"])
)

viewing10m_df_cleaned_B2 = (
    viewing10m_df
    .select("device_id", "prog_code")
    .dropna(subset=["device_id", "prog_code"])
    .dropDuplicates(["device_id", "prog_code"])
)

# explode the 'genre' column: one row per (genre, prog_code)
daily_prog_df_cleaned_B2 = daily_prog_df_cleaned_B2.select(
    explode("genre").alias("genre"),
    "prog_code",
)

In [None]:
# calculate wealth score for each DMA
# Both normalisation constants come from a single aggregation, so the cleaned
# demographic data is evaluated once instead of once per constant.
# (`max` here is the Spark aggregate imported from pyspark.sql.functions.)
max_income, max_net_worth = demo_df_cleaned_B2.agg(
    max("income").alias("max_income"),
    max("net_worth").alias("max_net_worth"),
).collect()[0]

# wealth_score = avg net worth / max net worth + avg income / max income, per DMA.
# NOTE(review): the join yields one row per (device_id, dma) row of
# ref_df_cleaned_B2, so a household with several devices appears to be counted
# several times in the averages - confirm this weighting is intended.
dma_wealth_score = (
    demo_df_cleaned_B2
    .join(ref_df_cleaned_B2, on="household_id")
    .groupBy("dma")
    .agg(
        avg("net_worth").alias("avg_net_worth"),
        avg("income").alias("avg_income")
    )
    .withColumn("wealth_score", (col("avg_net_worth") / max_net_worth) + (col("avg_income") / max_income))
)

In [None]:
# get the 10 wealthiest DMAs: highest wealth_score first, keep ten rows
top_wealthy_dmas = dma_wealth_score.sort(col("wealth_score").desc()).limit(10)

In [None]:
# join the viewing data with the program data to get the genres
viewing_with_genres = viewing10m_df_cleaned_B2.join(daily_prog_df_cleaned_B2, "prog_code")

# attach each device's DMA, then count the viewing rows per (DMA, genre),
# most viewed first
viewing_with_dma = viewing_with_genres.join(ref_df_cleaned_B2, "device_id")
genre_popularity = (
    viewing_with_dma
    .groupBy("dma", "genre")
    .agg(count("prog_code").alias("viewing_count"))
    .orderBy(col("viewing_count").desc())
)

# list to hold the final results
results = []

# genres already assigned to an earlier DMA
used_genres = set()

# function to clean DMA name for file naming
def clean_dma_name(dma_name):
    """Return dma_name with spaces replaced by underscores and every other
    character outside A-Z, a-z, 0-9 and '_' removed."""
    underscored = "_".join(dma_name.split(" "))
    return re.sub(r'[^A-Za-z0-9_]', '', underscored)

# cache genre_popularity because every loop iteration below filters it again
genre_popularity.cache()

# loop index -> ordinal label for the DMAs whose genre list is printed
# (the 1st, 5th and 9th wealthiest)
printed_ranks = {0: "1st", 4: "5th", 8: "9th"}

# process each DMA in the order of wealth
for idx, dma_row in enumerate(top_wealthy_dmas.collect()):
    dma = dma_row["dma"]
    wealth_score = dma_row["wealth_score"]

    # get the top genres for the current DMA excluding already used genres
    top_genres = (
        genre_popularity
        .filter(col("dma") == dma)
        .filter(~col("genre").isin(list(used_genres)))
        .orderBy(desc("viewing_count"))
        .select("genre", "dma", "viewing_count")
        .limit(11)
    )

    if idx in printed_ranks:
        print(f"### Top 11 Genres for the {printed_ranks[idx]} Wealthiest DMA: {dma} (Wealth Score: {round(wealth_score, 3)}) ###")
        top_genres.select("genre").show(truncate=False)

    top_genres_list = [row["genre"] for row in top_genres.collect()]

    # add the genres to the used set so later (less wealthy) DMAs skip them
    used_genres.update(top_genres_list)

    # DMA names contain spaces, commas and parentheses; sanitise before using
    # them in a path
    file_name = f"project1_part22_{clean_dma_name(dma)}_314689498_211620570.csv"

    # Attach the already-collected wealth score as a literal column instead of
    # joining back to top_wealthy_dmas: the value is the same for every row, and
    # skipping the join keeps the rows in their sorted order.
    # DataFrames are immutable, so the renames must be chained and assigned;
    # calling withColumnRenamed without using its result changes nothing.
    single_dma_df = (
        top_genres
        .select("dma", "genre")
        .withColumn("wealth_score", lit(wealth_score).cast("double"))
        .withColumnRenamed("dma", "DMA NAME")
        .withColumnRenamed("wealth_score", "WEALTH SCORE")
        .withColumnRenamed("genre", "ORDERED LIST OF GENRES")
    )
    single_dma_df.write.csv(file_name, header=True, mode="overwrite")

genre_popularity.unpersist()

### Top 11 Genres for the 1th Wealthiest DMA: San Antonio (Wealth Score: 1.624) ###
+------------+
|genre       |
+------------+
|News        |
|Weather     |
|Sitcom      |
|Talk        |
|Drama       |
|Newsmagazine|
|Western     |
|Comedy      |
|Cooking     |
|Reality     |
|Auto        |
+------------+

### Top 11 Genres for the 5th Wealthiest DMA: Bend, OR (Wealth Score: 1.457) ###
+--------------+
|genre         |
+--------------+
|Outdoors      |
|Bus./financial|
|History       |
|Science       |
|How-to        |
|Animals       |
|Nature        |
|Medical       |
|Golf          |
|Playoff sports|
|Paranormal    |
+--------------+

### Top 11 Genres for the 9th Wealthiest DMA: Seattle-Tacoma (Wealth Score: 1.416) ###
+----------------+
|genre           |
+----------------+
|Pro wrestling   |
|Martial arts    |
|Action sports   |
|Parenting       |
|Poker           |
|Card games      |
|Aviation        |
|Military        |
|Self improvement|
|Anime           |
|Softball        |


DataFrame[dma: string, genre: string, viewing_count: bigint]