In [31]:
from NaiveBayes import spark, process_dataframe, sc
from math import log

## Training

In [32]:
df = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("../../DataFiles/airline-train.csv")

df, string_indexers, bucketizers = process_dataframe(df)

                                                                                

In [33]:
LABEL_COL = "satisfaction_index"
FEATURES_COL = list(set(df.columns) - set([LABEL_COL]))
print("Label used: ", LABEL_COL)
print("Features used: ", FEATURES_COL)

Label used:  satisfaction_index
Features used:  ['Leg room service', 'Checkin service', 'Seat comfort', 'Inflight entertainment', 'Flight Distance_bucket', 'Type of Travel_index', 'Departure/Arrival time convenient', 'Age_bucket', 'Online boarding', 'Gate location', 'Cleanliness', 'Ease of Online booking', 'Class_index', 'Inflight service', 'Inflight wifi service', 'Arrival Delay in Minutes_bucket', 'Baggage handling', 'Departure Delay in Minutes_bucket', 'Customer Type_index', 'On-board service', 'Gender_index', 'Food and drink']


In [34]:
# Prior probability
def mapper(rows):
    result = []
    for row in rows:
        result.append((row[LABEL_COL], 1))
    return result

def reducer(row):
    key, iterable = row
    count = 0
    for val in iterable:
        count += val
    return [(key, count)]

prior = df.rdd.mapPartitions(mapper).groupByKey().flatMap(reducer).collect()
total = sum([x[1] for x in prior])
prior_dict = dict([(x[0], log(x[1]/total)) for x in prior])
print("Prior probability: ", prior_dict)



Prior probability:  {0: -0.5680907948876823, 1: -0.8361084357191146}


                                                                                

In [35]:
# Posterior probability
def mapper(split):
    result = []
    dict = {}

    for row in split:
        for feature in FEATURES_COL:
            if row[LABEL_COL] not in dict:
                dict[row[LABEL_COL]] = {}
            if feature not in dict[row[LABEL_COL]]:
                dict[row[LABEL_COL]][feature] = {}
            if row[feature] not in dict[row[LABEL_COL]][feature]:
                dict[row[LABEL_COL]][feature][row[feature]] = 0
            dict[row[LABEL_COL]][feature][row[feature]] += 1

    for label in dict:
        for feature in dict[label]:
            for value in dict[label][feature]:
                result.append((label, (feature, value, dict[label][feature][value])))
    return result

def reducer(sorted_pairs):
    key, iterable = sorted_pairs
    dict = {}
    iterable = list(iterable)
    total_count = 0
    result = []

    for i in range(len(iterable)):
        feature = iterable[i][0]
        value = iterable[i][1]
        value_count = iterable[i][2]
        total_count += value_count
        if feature not in dict:
            dict[feature] = {}
        if value not in dict[feature]:
            dict[feature][value] = 0
        dict[feature][value] += value_count

    for feature in dict:
        for value in dict[feature]:
            result.append((key, feature, value, dict[feature][value] / total_count * len(FEATURES_COL)))
    return result

posterior = df.rdd.mapPartitions(mapper).groupByKey().flatMap(reducer).collect()
posterior_dict={}
for i in range(len(posterior)):
    label = posterior[i][0]
    feature = posterior[i][1]
    value = posterior[i][2]
    p_bin = posterior[i][3]
    if label not in posterior_dict:
        posterior_dict[label] = {}
    if feature not in posterior_dict[label]:
        posterior_dict[label][feature] = {}
    posterior_dict[label][feature][value] = log(p_bin)

print("Posterior probability: ", posterior_dict)



Posterior probability:  {0: {'Leg room service': {3: -1.3927378916221316, 5: -1.822360775947274, 4: -1.5891500255153403, 2: -1.4245131401365037, 1: -1.965696762032647, 0: -5.263116195778559}, 'Checkin service': {4: -1.3218538054942783, 1: -1.7924582157734041, 3: -1.322620823748565, 5: -1.9899515675430093, 2: -1.810000714960011, 0: -10.98014389718478}, 'Seat comfort': {5: -1.8493880045514404, 1: -1.8379753053119319, 2: -1.6286512454527142, 3: -1.3847452268780962, 4: -1.4399963077061697, 0: -10.98014389718478}, 'Inflight entertainment': {5: -1.8933280114940956, 1: -1.7020514231577792, 2: -1.4456207817201223, 4: -1.640443319902891, 3: -1.4380567215947164, 0: -8.341086567569523}, 'Flight Distance_bucket': {2: -2.1355192137994794, 0: -2.1936872188407803, 3: -2.117235602066153, 6: -2.203976797453506, 5: -2.120922503576649, 1: -2.106955993134675, 4: -2.1360959982422902, 8: -2.701969606241043, 9: -3.203189493862339, 7: -2.627825348958777}, 'Type of Travel_index': {1: -0.7097241862572721, 0: -0

                                                                                

In [36]:
def predict(row):
    dict = {}# Dictionary of label and probability
    for label in prior_dict:
        prob = prior_dict[label]
        for feature in FEATURES_COL:
            if row[feature] in posterior_dict[label][feature]:
                prob += posterior_dict[label][feature][row[feature]]
            else:
                prob += log(0.000001)# Smoothing
        dict[label] = prob
    return max(dict, key=dict.get)

In [37]:
# Trainig accuracy
def mapper(rows):
    result = []
    for row in rows:
        if predict(row) == row[LABEL_COL]:
            result.append((0,1))
        else:
            result.append((0,0))
    return result
  
def reducer(row):
    count = 0
    for val in row[1]:
        count += val
    return [(0, count)]

correct = df.rdd.mapPartitions(mapper).groupByKey().flatMap(reducer).collect()[0][1]
total = df.count()
print("Training accuracy: ", correct / total)

                                                                                

Training accuracy:  0.894134795451474


## Validation

In [38]:
df_val = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .csv("../../DataFiles/airline-val.csv")
df_val,_,_ = process_dataframe(df_val, string_indexers, bucketizers)

In [39]:
# validation accuracy
correct = df_val.rdd.mapPartitions(mapper).groupByKey().flatMap(reducer).collect()[0][1]
total = df_val.count()
print("Validation accuracy: ", correct / total)

[Stage 69:>                                                         (0 + 1) / 1]

Validation accuracy:  0.8908585331942996


                                                                                

## Out of context analysis

In [51]:
import numpy as np

p_bin_class1 = np.array(list(posterior_dict[0]['Arrival Delay in Minutes_bucket'].values()))
p_bin_class1 = np.exp(p_bin_class1)
print('p_bin_class1',p_bin_class1)
p_bin_class2 = np.array(list(posterior_dict[1]['Arrival Delay in Minutes_bucket'].values()))
p_bin_class2 = np.exp(p_bin_class2)
print('p_bin_class2',p_bin_class2)
p_class1 = np.exp(prior_dict[0])
p_class2 = np.exp(prior_dict[1])
p_bin = p_class1 * p_bin_class1+ p_class2 * p_bin_class2
p_class1_bin = p_class1 * p_bin_class1 / p_bin
p_class2_bin = p_class2 * p_bin_class2 / p_bin
print('p_class1_bin',p_class1_bin)
print('p_class2_bin',p_class2_bin)

p_bin_class1 [0.10821677 0.11653066 0.54214014 0.11741656 0.11569586]
p_bin_class2 [0.6358554  0.08388088 0.07960443 0.1165334  0.08412589]
p_class1_bin [0.18200573 0.64491797 0.89902814 0.56845925 0.6426003 ]
p_class2_bin [0.81799427 0.35508203 0.10097186 0.43154075 0.3573997 ]


Bad pipe message: %s [b'\xb1d\x07\ty\x92`\xee\xb7\xbc\x13\xfa\x16\xa2}\x089d F\xb5\x92QTu\xe5-\xaeq\x86ryC\xa8=\x18-\xe3\xdd\x0f\xd4i\x01\x81\xcd\xe9\x91\xd1\xda)\x02\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t', b'7.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00']
Bad pipe message: %s [b'\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01']
Bad pipe message: %s [b"\xe3(\xb4\x85z\xfa\x87\ta\xd9\x08\xc8\x04\xa3\x90S\xbb+\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a