In [1]:
### Start (or reuse) the Spark session used by this notebook

from pyspark.sql import SparkSession

# Configure the builder first, then obtain the session from it.
session_builder = SparkSession.builder.appName(
    "EDA of commercial ratio of each channel present in our commericals dataset (structured data)"
)
spark = session_builder.getOrCreate()

## Only log WARN and above instead of INFO
spark.sparkContext.setLogLevel("WARN")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/09 20:36:07 WARN Utils: Your hostname, OnePiece, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/01/09 20:36:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/09 20:36:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/09 20:36:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [10]:
import os

file_path = "../../F2_Tabular_Data/Data/BroadcastLogs_2018_Q3_M8_sample.CSV"

# Reader settings: pipe-delimited file, first row holds the column names,
# column types are inferred from the data.
csv_options = {
    "sep": "|",
    "header": True,
    "inferSchema": True,
    "timestampFormat": "yyyy-MM-dd",
}

logs = spark.read.csv(file_path, **csv_options)


    

                                                                                

In [11]:
# Print each column's name, inferred type and nullability as a tree.
logs.printSchema()

root
 |-- BroadcastLogID: integer (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: date (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: date (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string (nullable 

In [13]:
### Previewing a few columns of interest

preview_columns = ["BroadcastLogID", "LogServiceID", "LogDate"]

# Show the first 5 rows without truncating cell values.
logs.select(*preview_columns).show(n=5, truncate=False)

+--------------+------------+----------+
|BroadcastLogID|LogServiceID|LogDate   |
+--------------+------------+----------+
|1196192316    |3157        |2018-08-01|
|1196192317    |3157        |2018-08-01|
|1196192318    |3157        |2018-08-01|
|1196192319    |3157        |2018-08-01|
|1196192320    |3157        |2018-08-01|
+--------------+------------+----------+
only showing top 5 rows


In [19]:
### Grouping the column names into chunks of about 3 for easier peeking

import numpy as np

# One chunk per full group of three column names.
n_chunks = len(logs.columns) // 3
column_split = np.array_split(np.array(logs.columns), n_chunks)

print(column_split)

[array(['BroadcastLogID', 'LogServiceID', 'LogDate'], dtype='<U22'), array(['SequenceNO', 'AudienceTargetAgeID', 'AudienceTargetEthnicID'],
      dtype='<U22'), array(['CategoryID', 'ClosedCaptionID', 'CountryOfOriginID'], dtype='<U22'), array(['DubDramaCreditID', 'EthnicProgramID', 'ProductionSourceID'],
      dtype='<U22'), array(['ProgramClassID', 'FilmClassificationID', 'ExhibitionID'],
      dtype='<U22'), array(['Duration', 'EndTime', 'LogEntryDate'], dtype='<U22'), array(['ProductionNO', 'ProgramTitle', 'StartTime'], dtype='<U22'), array(['Subtitle', 'NetworkAffiliationID', 'SpecialAttentionID'],
      dtype='<U22'), array(['BroadcastOriginPointID', 'CompositionID', 'Producer1'],
      dtype='<U22'), array(['Producer2', 'Language1', 'Language2'], dtype='<U22')]


In [20]:
### Dropping the BroadcastLogID and SequenceNO columns

# Use the exact casing from the schema ("SequenceNO"). Spark resolves column
# names case-insensitively by default, so drop() tolerated "SequenceNo", but
# the Python `in` test below is case-sensitive: with a mis-cased name it
# prints False whether or not the column was actually dropped.
logs = logs.drop("BroadcastLogID", "SequenceNO")

# Both checks should print False once the columns are gone.
print("BroadcastLogID" in logs.columns)
print("SequenceNO" in logs.columns)

False
False


In [21]:
### Alternative: keep every column except the unwanted ones

excluded = ["BroadcastLogID", "SequenceNO"]
logs = logs.select([name for name in logs.columns if name not in excluded])

# Star-unpacking passes each column name as its own argument to drop().
# The result is not assigned, so `logs` is unchanged and the cell simply
# displays the resulting empty DataFrame.
logs.drop(*logs.columns)

DataFrame[]

In [25]:
### Inspecting the Duration column before deriving new columns from it
from pyspark.sql.functions import col

duration_only = logs.select(col("Duration"))

# Show a few raw values, then the column's (name, type) pair.
duration_only.show(5)
print(duration_only.dtypes)

+----------------+
|        Duration|
+----------------+
|02:00:00.0000000|
|00:00:30.0000000|
|00:00:15.0000000|
|00:00:15.0000000|
|00:00:15.0000000|
+----------------+
only showing top 5 rows
[('Duration', 'string')]


In [None]:
### Extracting hours, minutes and seconds from the Duration col
# Fixed: `from pyspark.sql.functions as F` is a SyntaxError; a module alias
# needs the `import ... as ...` form.
import pyspark.sql.functions as F

# Duration is a string such as "02:00:00.0000000" (HH:MM:SS.fffffff).
# substr(start_pos, length) counts positions from 1, so the hour, minute and
# second fields start at positions 1, 4 and 7 and are two characters each.
logs.select(
    F.col("Duration"),
    F.col("Duration").substr(1, 2).cast("int").alias("dur_hours"),
    F.col("Duration").substr(4, 2).cast("int").alias("dur_minutes"),
    F.col("Duration").substr(7, 2).cast("int").alias("dur_seconds"),
).distinct().show(5)  # de-duplicate, then display the first 5 rows