# PySpark DataFrame Part 2

In [11]:
from pyspark.sql import SparkSession

In [12]:
# Obtain the Spark session for this notebook (getOrCreate returns an
# already-running session when there is one instead of starting another).
spark = (
    SparkSession.builder
    .appName('Dataframes Part 2')
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's last expression so the notebook
# displays its summary.
spark

In [13]:
# Load the product list: the first line supplies the column names, column
# types are inferred from the data, and records that cannot be parsed are
# dropped (DROPMALFORMED).
csv_reader = spark.read.options(header=True, inferSchema=True, mode="DROPMALFORMED")
df = csv_reader.csv('products.csv')
df.show()

+------------------+------+
|             Items|Prices|
+------------------+------+
|            iPhone|   999|
|       Macbook Air|   999|
|       Macbook Pro|  1299|
|              Ipad|   499|
|           Airpods|   299|
|Backcover - iPhone|  NULL|
|  Backcover - iPad|     5|
|   Cover - Airpods|     7|
|              NULL|  1499|
|              NULL|  NULL|
|      MDR Terminal|  2999|
+------------------+------+



In [14]:
# With no arguments, na.drop() removes every row that has a NULL in any column.
rows_without_nulls = df.na.drop()
rows_without_nulls.show()

+----------------+------+
|           Items|Prices|
+----------------+------+
|          iPhone|   999|
|     Macbook Air|   999|
|     Macbook Pro|  1299|
|            Ipad|   499|
|         Airpods|   299|
|Backcover - iPad|     5|
| Cover - Airpods|     7|
|    MDR Terminal|  2999|
+----------------+------+



In [15]:
### na.drop parameters:
###   how    - "any" drops a row that has at least one NULL,
###            "all" drops a row only when every value is NULL
###   thresh - minimum number of non-NULL values a row must have to be KEPT;
###            when it is given it overrides `how`
###   subset - restrict the NULL check to the listed columns

# `how` is intentionally not passed: it is ignored whenever `thresh` is set.
# With two columns, thresh=2 keeps only the rows where both values are present.
df.na.drop(thresh=2).show()

+----------------+------+
|           Items|Prices|
+----------------+------+
|          iPhone|   999|
|     Macbook Air|   999|
|     Macbook Pro|  1299|
|            Ipad|   499|
|         Airpods|   299|
|Backcover - iPad|     5|
| Cover - Airpods|     7|
|    MDR Terminal|  2999|
+----------------+------+



In [16]:
# Only a NULL in the Items column disqualifies a row here; a row with a
# missing price but a valid item name is kept.
rows_with_item = df.na.drop(how="any", subset=['Items'])
rows_with_item.show()

+------------------+------+
|             Items|Prices|
+------------------+------+
|            iPhone|   999|
|       Macbook Air|   999|
|       Macbook Pro|  1299|
|              Ipad|   499|
|           Airpods|   299|
|Backcover - iPhone|  NULL|
|  Backcover - iPad|     5|
|   Cover - Airpods|     7|
|      MDR Terminal|  2999|
+------------------+------+



In [17]:
# Substitute a per-column placeholder for each NULL: a marker string for
# Items and 0 for Prices.
null_replacements = {'Items': 'Product Not Exist', 'Prices': 0}
df.na.fill(null_replacements).show()

+------------------+------+
|             Items|Prices|
+------------------+------+
|            iPhone|   999|
|       Macbook Air|   999|
|       Macbook Pro|  1299|
|              Ipad|   499|
|           Airpods|   299|
|Backcover - iPhone|     0|
|  Backcover - iPad|     5|
|   Cover - Airpods|     7|
| Product Not Exist|  1499|
| Product Not Exist|     0|
|      MDR Terminal|  2999|
+------------------+------+



In [2]:
# Install NumPy into the kernel's environment.
# NOTE(review): presumably needed by the pyspark.ml import that follows — confirm.
# NOTE(review): this cell's execution count (In [2]) is lower than the cells
# around it, so it was run out of order; on a fresh kernel run it first.
%pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [18]:
from pyspark.ml.feature import Imputer

In [22]:
# Fill missing prices using the "median" strategy. The original column is
# left untouched; the result is written to a sibling "<column>_imputed" column.
price_cols = ['Prices']
imputer = Imputer(
    strategy="median",
    inputCols=price_cols,
    outputCols=[f"{name}_imputed" for name in price_cols],
)
imputer.fit(df).transform(df).show()

+------------------+------+--------------+
|             Items|Prices|Prices_imputed|
+------------------+------+--------------+
|            iPhone|   999|           999|
|       Macbook Air|   999|           999|
|       Macbook Pro|  1299|          1299|
|              Ipad|   499|           499|
|           Airpods|   299|           299|
|Backcover - iPhone|  NULL|           999|
|  Backcover - iPad|     5|             5|
|   Cover - Airpods|     7|             7|
|              NULL|  1499|          1499|
|              NULL|  NULL|           999|
|      MDR Terminal|  2999|          2999|
+------------------+------+--------------+

