# Create a Spark session

In [10]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
import pyspark.sql.functions as F
import pyspark.sql.types as T
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler

# Change the account name to your email account
account = 'sli'

# Root path of the course data (DataAnalysisWithPythonAndPySpark data sets).
# The trailing slash matters: file names are appended to it by plain
# string concatenation.
data_path = '/net/clusterhn/home/' + account + '/isa460/data/'

# Append the directory that holds helper_functions to the module search path
sys.path.append('/net/clusterhn/home/sli/isa460_sli')

import helper_functions as H

# If a Spark session is left over from an earlier run of this cell, stop it
# before building a new one.
try:
    if spark:
        spark.stop()
except NameError:
    # Fresh kernel: `spark` has not been defined yet, so there is nothing to stop.
    pass
except Exception as exc:
    # Stopping the old session is best effort; report the problem instead of
    # hiding it, then carry on and build a new session.
    print(f"Warning: could not stop the previous Spark session: {exc}", file=sys.stderr)

spark = (SparkSession.builder.appName("Preparing Featurers for Machine Learning")
        .config("spark.port.maxRetries", "100")
        # LAST_WIN keeps the last value when a map is built with duplicate keys
        .config("spark.sql.mapKeyDedupPolicy", "LAST_WIN")
        .config("spark.driver.memory", "8g")
        .getOrCreate())

# Configure the log level (the default is WARN)
spark.sparkContext.setLogLevel('ERROR')

# Pre-process, clean, and transform the data using the steps in Chapter 12

In [16]:
# Load the raw recipes CSV: the first row holds the column names and the
# column types are inferred from the data
recipes_csv = data_path + 'recipes/epi_r.csv'
food = spark.read.csv(recipes_csv, header=True, inferSchema=True)

# Standardize column names using toDF
def sanitize_column_name(name):
    """Return a lower-case, identifier-friendly version of a column name.

    Spaces, dashes and slashes each become an underscore and "&" becomes
    "and". The text is lower-cased, and every remaining character that is
    not a letter, a digit or an underscore is removed.
    """
    substitutions = {" ": "_", "-": "_", "/": "_", "&": "and"}
    cleaned = name
    for old, new in substitutions.items():
        # Substitute first, then lower-case the intermediate result
        cleaned = cleaned.replace(old, new)
        cleaned = cleaned.lower()
    kept_characters = (
        ch for ch in cleaned if ch == "_" or ch.isdigit() or ch.isalpha()
    )
    return "".join(kept_characters)
# Run the sanitizer over every column name; toDF() renames all columns at once
food = food.toDF(*map(sanitize_column_name, food.columns))

# For cakeweek and wasteless, keep only rows whose value is 0, 1 or null.
# Drop the rest.
cakeweek_value = F.col("cakeweek")
wasteless_value = F.col("wasteless")

cakeweek_is_valid = cakeweek_value.isin([0.0, 1.0]) | cakeweek_value.isNull()
wasteless_is_valid = wasteless_value.isin([0.0, 1.0]) | wasteless_value.isNull()

food = food.filter(cakeweek_is_valid & wasteless_is_valid)

# rating and calories should be double: convert both columns to double
for numeric_name in ('rating', 'calories'):
    food = food.withColumn(numeric_name, F.col(numeric_name).cast('Double'))

# Create four top-level variables that group the columns by role
IDENTIFIERS = ["title"]

CONTINUOUS_COLUMNS = ["rating", "calories", "protein", "fat", "sodium"]

TARGET_COLUMN = ["dessert"]

# Every column that is not an identifier, a continuous feature or the target
# is treated as a binary feature (data frame column order is kept)
_non_binary_names = {*CONTINUOUS_COLUMNS, *TARGET_COLUMN, *IDENTIFIERS}
BINARY_COLUMNS = [name for name in food.columns if name not in _non_binary_names]

# Drop records in which every non-identifier column is null
non_identifier_columns = [name for name in food.columns if name not in IDENTIFIERS]
food = food.na.drop(how="all", subset=non_identifier_columns)

# Deal with missing values (weeding out useless records and imputing binary features)
# Drop records where the target column is null
food = food.na.drop(subset=TARGET_COLUMN)

# Impute binary columns: fill null values with 0
food = food.na.fill(0.0, subset=BINARY_COLUMNS)

# Take care of extreme values/outliers

# Features whose values are capped at their (approximate) 99th percentile
selected_columns = ["calories", "protein", "fat", "sodium"]

# Compute every percentile in one aggregation so the data is scanned by a
# single Spark job instead of one job per column. The result maps each
# column name to its 99th-percentile value.
maximum = (
    food.select(
        *[F.percentile_approx(c, 0.99).alias(c) for c in selected_columns]
    )
    .head()
    .asDict()
)

# For the features above, replace any value over the 99th percentile with the
# value at the 99th percentile.
# F.least skips nulls, so without the explicit null check a missing value
# would be replaced by the cap; the check keeps missing values missing.
for k, v in maximum.items():
    food = food.withColumn(
        k,
        F.when(F.isnull(F.col(k)), F.col(k)).otherwise(
            F.least(F.col(k), F.lit(v))
        ),
    )

### Weeding out the rare binary occurrence columns
# For binary variables, remove features with fewer than 10 ones or fewer
# than 10 zeros.

# One sum per binary column; for a 0/1 column the sum is the number of ones
inst_sum_of_binary_columns = [
    F.sum(F.col(x)).alias(x) for x in BINARY_COLUMNS
]

# A single aggregation row, converted to {column name: sum}
sum_of_binary_columns = (
    food.select(*inst_sum_of_binary_columns).head().asDict()
)

num_rows = food.count()

# A sum is null when a column has no non-null numeric value at all; such a
# column carries no information either, so it is dropped instead of failing
# on the comparison.
too_rare_features = [
    k
    for k, v in sum_of_binary_columns.items()
    if v is None or v < 10 or v > (num_rows - 10)
]

# Filter in place of a set difference: a set difference returns the names in
# an arbitrary order that can change between runs, which would make the
# column order of everything built from BINARY_COLUMNS non-reproducible.
_too_rare_lookup = set(too_rare_features)
BINARY_COLUMNS = [x for x in BINARY_COLUMNS if x not in _too_rare_lookup]

# Feature creation: protein_ratio and fat_ratio
# Each ratio is <source column> * <multiplier> / calories.
# NOTE(review): 4 and 9 are presumably kcal per gram of protein and fat - confirm.
ratio_specs = (
    ("protein_ratio", "protein", 4),
    ("fat_ratio", "fat", 9),
)
for ratio_name, source_name, multiplier in ratio_specs:
    food = food.withColumn(
        ratio_name, F.col(source_name) * multiplier / F.col("calories")
    )

# Replace null ratios with 0
food = food.na.fill(0.0, subset=["protein_ratio", "fat_ratio"])

# The two new ratios are continuous features as well
CONTINUOUS_COLUMNS.extend(["protein_ratio", "fat_ratio"])

# Keep only the identifier, continuous, target and surviving binary columns,
# in that order
final_columns = [
    *IDENTIFIERS,
    *CONTINUOUS_COLUMNS,
    *TARGET_COLUMN,
    *BINARY_COLUMNS,
]

food = food.select(*final_columns)


                                                                                

In [18]:
# Number of columns kept after cleaning (identifier, continuous, target and binary)
len(food.columns)

515

In [20]:
# Write the processed data to a directory in Parquet format, replacing any
# output that is already there
cleaned_output_path = data_path + 'recipes/recipes_cleaned'
food.write.parquet(cleaned_output_path, mode='overwrite')

                                                                                