In [26]:
import findspark
import pyspark
from pyspark.sql import *
from pyspark.ml.feature import Imputer
from pyspark.sql.functions import struct
from pyspark.sql import SparkSession, functions as F, types

In [27]:
# Initialise findspark, then create (or reuse, if one is already running)
# a SparkSession with a local master and the application name 'Spark_Imputer'.
findspark.init()
spark = (
    SparkSession.builder
    .master("local")
    .appName('Spark_Imputer')
    .getOrCreate()
)

In [28]:
# Read the mock data set as CSV: the first line is treated as the header and
# column types are inferred from the data rather than defaulting to strings.
df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("MOCK_DATA.csv")
)

# Preview the first 40 rows.
df.show(40)

+----+----------+-------------+--------------------+------+---------------+----+----+
|  id|first_name|    last_name|               email|gender|     ip_address| age|age1|
+----+----------+-------------+--------------------+------+---------------+----+----+
|   1|  Christal|        Deval|   cdeval0@google.es|Female|   78.41.59.167|null|null|
|   2|   Ginevra|      Bosanko|gbosanko1@marketw...|Female|   13.49.23.173|null|null|
|   3|     Milka|       Govini|    mgovini2@mtv.com|Female|220.205.250.120|   3|   3|
|   4|       Ali|    Flaunders|aflaunders3@cisco...|  Male|  108.75.25.221|  85|  85|
|   5|    Silvia|      Mitcham|   smitcham4@mtv.com|Female|  83.90.147.253|null|null|
|   6|    Elicia|      Mattson|emattson5@typepad...|Female| 63.223.250.214|null|null|
|   7|    Barton|    Duesberry| bduesberry6@soup.io|  Male|   97.254.22.99|null|null|
|   8| Selestina|     Gartrell|sgartrell7@engadg...|Female|184.108.174.192|null|null|
|   9|    Saudra|         Cadd|scadd8@friendfeed...|Fe

In [29]:
# Impute the nulls of a single column with pyspark.ml's Imputer.
# The imputed values are written to a new column named
# "<col>_imputed_<strategy>"; the original column is left untouched.
col = "age"

# Imputation strategy: try "mean" or "median".
# NOTE: this variable used to be called `type`, which shadowed the builtin
# type() for every cell executed afterwards in the same kernel.
strategy = "median"

imputer = Imputer(
    inputCols=[col],
    outputCols=[col + "_imputed_" + strategy],
).setStrategy(strategy)

# fit() computes the replacement value from the non-null rows of `col`;
# transform() appends the imputed column.
model = imputer.fit(df)

model.transform(df).show()

+---+----------+----------+--------------------+------+---------------+----+----+------------------+
| id|first_name| last_name|               email|gender|     ip_address| age|age1|age_imputed_median|
+---+----------+----------+--------------------+------+---------------+----+----+------------------+
|  1|  Christal|     Deval|   cdeval0@google.es|Female|   78.41.59.167|null|null|                53|
|  2|   Ginevra|   Bosanko|gbosanko1@marketw...|Female|   13.49.23.173|null|null|                53|
|  3|     Milka|    Govini|    mgovini2@mtv.com|Female|220.205.250.120|   3|   3|                 3|
|  4|       Ali| Flaunders|aflaunders3@cisco...|  Male|  108.75.25.221|  85|  85|                85|
|  5|    Silvia|   Mitcham|   smitcham4@mtv.com|Female|  83.90.147.253|null|null|                53|
|  6|    Elicia|   Mattson|emattson5@typepad...|Female| 63.223.250.214|null|null|                53|
|  7|    Barton| Duesberry| bduesberry6@soup.io|  Male|   97.254.22.99|null|null|          

In [None]:

# df = df.dropna()
# freq = df.stat.freqItems(df.columns,0.4)
# col = "age"+"_freqItems"

In [None]:
# mode = freq.collect()[0][6]
# mode = mode[0]
# print(mode)

In [None]:
# freq.show()

In [30]:
#Mode
# Replace the nulls of each listed column with that column's own most
# frequent non-null value.
df = spark.read.load("MOCK_DATA.csv",format="csv", inferSchema="true", header="true")
#Specify the list of input columns you want to impute
cols = ["age","age1"]

for col in cols:
    # Frequency table of the non-null values of this column, most frequent
    # first. Ties are broken on the value itself so the chosen mode is
    # deterministic from run to run.
    cnts = (
        df.where(df[col].isNotNull())
        .groupBy(col)
        .count()
        .orderBy(["count", col], ascending=[False, True])
    )

    top = cnts.take(1)
    if not top:
        # Column holds only nulls: there is no mode to impute with.
        print("No non-null values in", col, "- nothing to impute")
        continue

    mode = top[0][0]
    print("Mode= ", mode)

    # Restrict the fill to the current column. Without `subset`, na.fill()
    # writes this column's mode into every type-compatible column that has
    # nulls (including the other columns in `cols` and `id`).
    df = df.na.fill(mode, subset=[col])
df.show(40)

Mode=  86
Mode=  86
+---+----------+-------------+--------------------+------+---------------+---+----+
| id|first_name|    last_name|               email|gender|     ip_address|age|age1|
+---+----------+-------------+--------------------+------+---------------+---+----+
|  1|  Christal|        Deval|   cdeval0@google.es|Female|   78.41.59.167| 86|  86|
|  2|   Ginevra|      Bosanko|gbosanko1@marketw...|Female|   13.49.23.173| 86|  86|
|  3|     Milka|       Govini|    mgovini2@mtv.com|Female|220.205.250.120|  3|   3|
|  4|       Ali|    Flaunders|aflaunders3@cisco...|  Male|  108.75.25.221| 85|  85|
|  5|    Silvia|      Mitcham|   smitcham4@mtv.com|Female|  83.90.147.253| 86|  86|
|  6|    Elicia|      Mattson|emattson5@typepad...|Female| 63.223.250.214| 86|  86|
|  7|    Barton|    Duesberry| bduesberry6@soup.io|  Male|   97.254.22.99| 86|  86|
|  8| Selestina|     Gartrell|sgartrell7@engadg...|Female|184.108.174.192| 86|  86|
|  9|    Saudra|         Cadd|scadd8@friendfeed...|Femal

In [31]:
#Forward Fill
# For each listed column, add "f_fill_<col>" holding the most recent non-null
# value at or before the current row (rows before the first non-null value
# stay null).
from pyspark.sql import Window
from pyspark.sql.functions import last, first
from pyspark.sql import functions as F

df = spark.read.load("MOCK_DATA.csv",format="csv", inferSchema="true", header="true")

# A forward fill is only meaningful over a defined row order. Capture the
# order in which rows were read before the window shuffles them.
# NOTE(review): assumes the ids generated here follow the file's row order,
# which holds for a plain file scan - confirm if the source changes.
df = df.withColumn('_row_order', F.monotonically_increasing_id())
# Constant key so that the whole frame forms a single window partition.
df = df.withColumn('dummy', F.lit(1))


#Set the window dimension: every row from the start of the frame up to and
#including the current row, in load order.
window_f = (
    Window.partitionBy('dummy')
    .orderBy('_row_order')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

#Specify the list of input columns you want to impute
cols = ["age","age1"]

for col in cols:
    print(col)
    # last non-null value seen so far in the window = forward fill
    df = df.withColumn('f_fill_'+col, last(df[col], ignorenulls=True).over(window_f))


#drop the helper columns
df = df.drop('dummy', '_row_order')

# show the imputed df

df.show(40)

age
age1
+----+----------+-------------+--------------------+------+---------------+----+----+----------+-----------+
|  id|first_name|    last_name|               email|gender|     ip_address| age|age1|f_fill_age|f_fill_age1|
+----+----------+-------------+--------------------+------+---------------+----+----+----------+-----------+
|   1|  Christal|        Deval|   cdeval0@google.es|Female|   78.41.59.167|null|null|      null|       null|
|   2|   Ginevra|      Bosanko|gbosanko1@marketw...|Female|   13.49.23.173|null|null|      null|       null|
|   3|     Milka|       Govini|    mgovini2@mtv.com|Female|220.205.250.120|   3|   3|         3|          3|
|   4|       Ali|    Flaunders|aflaunders3@cisco...|  Male|  108.75.25.221|  85|  85|        85|         85|
|   5|    Silvia|      Mitcham|   smitcham4@mtv.com|Female|  83.90.147.253|null|null|        85|         85|
|   6|    Elicia|      Mattson|emattson5@typepad...|Female| 63.223.250.214|null|null|        85|         85|
|   7|    

In [32]:
#Backward Fill
# For each listed column, add "b_fill_<col>" holding the next non-null value
# at or after the current row (rows after the last non-null value stay null).
from pyspark.sql import Window
from pyspark.sql.functions import last, first
from pyspark.sql import functions as F

df = spark.read.load("MOCK_DATA.csv",format="csv", inferSchema="true", header="true")

# A backward fill is only meaningful over a defined row order. Capture the
# order in which rows were read before the window shuffles them.
# NOTE(review): assumes the ids generated here follow the file's row order,
# which holds for a plain file scan - confirm if the source changes.
df = df.withColumn('_row_order', F.monotonically_increasing_id())
# Constant key so that the whole frame forms a single window partition.
df = df.withColumn('dummy', F.lit(1))

#Set the window dimension: the current row and every row after it, in load
#order.
window_b = (
    Window.partitionBy('dummy')
    .orderBy('_row_order')
    .rowsBetween(Window.currentRow, Window.unboundedFollowing)
)

#Specify the list of input columns you want to impute
cols = ["age","age1"]

for col in cols:
    # first non-null value from here onwards in the window = backward fill
    df = df.withColumn('b_fill_'+col, first(df[col], ignorenulls=True).over(window_b))

#drop the helper columns
df = df.drop('dummy', '_row_order')

# show the imputed df

df.show(40)

+----+----------+-------------+--------------------+------+---------------+----+----+----------+-----------+
|  id|first_name|    last_name|               email|gender|     ip_address| age|age1|b_fill_age|b_fill_age1|
+----+----------+-------------+--------------------+------+---------------+----+----+----------+-----------+
|   1|  Christal|        Deval|   cdeval0@google.es|Female|   78.41.59.167|null|null|         3|          3|
|   2|   Ginevra|      Bosanko|gbosanko1@marketw...|Female|   13.49.23.173|null|null|         3|          3|
|   3|     Milka|       Govini|    mgovini2@mtv.com|Female|220.205.250.120|   3|   3|         3|          3|
|   4|       Ali|    Flaunders|aflaunders3@cisco...|  Male|  108.75.25.221|  85|  85|        85|         85|
|   5|    Silvia|      Mitcham|   smitcham4@mtv.com|Female|  83.90.147.253|null|null|        28|         28|
|   6|    Elicia|      Mattson|emattson5@typepad...|Female| 63.223.250.214|null|null|        28|         28|
|   7|    Barton|  