## Schema Manipulation, Data Treatment, and Handling Missing Data


In [5]:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import Column
from pyspark.sql.functions import lower 

# Explicit schema for the events CSV: two integer ids followed by four
# string columns. event_time is read as a plain string (no timestamp parsing).
schema = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("user_id", IntegerType),
        ("device_id", IntegerType),
        ("referrer", StringType),
        ("host", StringType),
        ("url", StringType),
        ("event_time", StringType),
    )
])

# Load the comma-separated file, treating the first line as a header and
# applying the schema above instead of inferring one.
event_data_frame = (
    spark.read
    .schema(schema)
    .options(sep=',', header=True)
    .csv("file:///data/db1/events.csv")
)

In [None]:
import pyspark.sql.functions as F

# Add "host2": an upper-cased copy of the "host" column.
event_data_frame = event_data_frame.withColumn(
    "host2",
    F.upper(event_data_frame["host"]),
)

# Print up to 100 distinct user ids, without truncating the cell contents.
(
    event_data_frame
    .select("user_id")
    .distinct()
    .show(100, truncate=False)
)





+-----------+
|user_id    |
+-----------+
|199518013  |
|1139779140 |
|609206481  |
|2043294632 |
|1006832795 |
|-1328864031|
|1596711851 |
|948080397  |
|253520547  |
|-108163679 |
|1996955565 |
|-1747569163|
|-1182240256|
|-331912870 |
|-1750424248|
|1554102586 |
|1772238911 |
|1364390164 |
|526294848  |
|1134392022 |
|-383067206 |
|71328380   |
|337048478  |
|-381592034 |
|1228068943 |
|-1811488834|
|-1086902042|
|1453056494 |
|1534350843 |
|-1212633653|
|2028095249 |
|-379744838 |
|-766167741 |
|910064595  |
|783824117  |
|-577266490 |
|1088638720 |
|-880864061 |
|-1565042122|
|1591461470 |
|-1606188214|
|2036973230 |
|-747914771 |
|1851532831 |
|-990676981 |
|-1770988438|
|-152931845 |
|-1142111672|
|1981387696 |
|-59188779  |
|-740313814 |
|1813730319 |
|855850004  |
|1810623733 |
|1824397585 |
|-1141822467|
|-311530106 |
|789787836  |
|-2098603255|
|-1214517882|
|-1748210008|
|1918650158 |
|625282531  |
|1428028120 |
|-1064558092|
|1106603326 |
|598673123  |
|-280330196 |
|14895

In [None]:
from pyspark.sql.functions import udf

#event_data_frame.select("*").filter("user_id >= 199518013 and user_id <= 1139779140").show()

# Label every row with a user_id_category.
#
# when() branches are evaluated in order and the first match wins, so the
# exact-match branch must come before the range branch: 1037710827 is itself
# below 1095049125 and would otherwise always be labelled "LOW_USER_ID",
# making "NA_MOSCA" unreachable.
#
# NOTE: a NULL user_id satisfies neither condition and therefore falls through
# to otherwise() -> "HIGH_USER_ID".
event_data_frame.select(
    "user_id",
    F.when(event_data_frame["user_id"] == 1037710827, "NA_MOSCA")
    .when(event_data_frame["user_id"] < 1095049125, "LOW_USER_ID")
    .otherwise("HIGH_USER_ID")
    .alias("user_id_category"),
).show(100, truncate=False)






In [None]:
# REVERSE STRING
from typing import Optional


def reverseString(my_string: Optional[str]) -> Optional[str]:
    """Return ``my_string`` with its characters in reverse order.

    Args:
        my_string: The text to reverse, or ``None`` for a missing value.

    Returns:
        The reversed text, or ``None`` when ``my_string`` is ``None``.
    """
    # Missing values arrive as None when this function is used as a Spark UDF;
    # slicing None would raise TypeError, so pass the missing value through.
    if my_string is None:
        return None
    return my_string[::-1]

# Reload the events CSV with the explicit schema (header row, comma separator).
event_data_frame = (
    spark.read
    .schema(schema)
    .options(sep=',', header=True)
    .csv("file:///data/db1/events.csv")
)

# Wrap the Python function as a Spark UDF that returns a string column.
udfReverseString = udf(reverseString, StringType())

# Add "reversed_host", computed by applying the UDF to "host".
df_reverse = event_data_frame.withColumn(
    "reversed_host",
    udfReverseString(event_data_frame.host),
)

# Show the first rows with every column.
df_reverse.show()

## Monotonically increasing IDs

In [31]:
from pyspark.sql import functions as F

# Keep every existing column and append "monotonic_id", generated by
# monotonically_increasing_id().
event_data_frame_mono = df_reverse.select(
    "*",
    F.monotonically_increasing_id().alias("monotonic_id"),
)

# Print up to 100 rows without truncating the cell contents.
event_data_frame_mono.show(100, truncate=False)


+-----------+-----------+-----------------------------------+---------------------+-------------------------------------+--------------------------+---------------------+------------+
|user_id    |device_id  |referrer                           |host                 |url                                  |event_time                |reversed_host        |monotonic_id|
+-----------+-----------+-----------------------------------+---------------------+-------------------------------------+--------------------------+---------------------+------------+
|1037710827 |532630305  |NULL                               |www.zachwilson.tech  |/                                    |2021-03-08 17:27:24.241000|hcet.nosliwhcaz.www  |0           |
|925588856  |532630305  |NULL                               |www.eczachly.com     |/                                    |2021-05-10 11:26:21.247000|moc.ylhcazce.www     |1           |
|-1180485268|532630305  |NULL                               |admin.zachwilson.te