# Initialize Spark

In [0]:
import pyspark

# Load Data into Dataframe

In [0]:
df = sqlContext.sql("SELECT * FROM  airline")

In [0]:
df.count()

Out[75]: 129880

In [0]:
df = df.na.drop()

In [0]:
df.describe().show()

+-------+------------+------+-----------------+-----------------+---------------+--------+------------------+------------------+---------------------------------+-----------------+------------------+---------------------+----------------------+------------------+----------------------+------------------+------------------+------------------+------------------+------------------+-----------------+--------------------------+------------------------+
|summary|satisfaction|Gender|    Customer Type|              Age| Type of Travel|   Class|   Flight Distance|      Seat comfort|Departure/Arrival time convenient|   Food and drink|     Gate location|Inflight wifi service|Inflight entertainment|    Online support|Ease of Online booking|  On-board service|  Leg room service|  Baggage handling|   Checkin service|       Cleanliness|  Online boarding|Departure Delay in Minutes|Arrival Delay in Minutes|
+-------+------------+------+-----------------+-----------------+---------------+--------+------

# Modelling

In [0]:
## Extract String and columns

In [0]:
from pyspark.sql.types import StringType
type(df.schema[0].dataType)

Out[78]: pyspark.sql.types.StringType

In [0]:
isinstance(df.schema[0].dataType, StringType)

Out[79]: True

In [0]:
str_cols = []
nstr_cols = []
for i in df.schema:
    if(isinstance(i.dataType, StringType)):
        str_cols.append(i.name)
    else:
        nstr_cols.append(i.name)

In [0]:
df.columns

Out[81]: ['satisfaction',
 'Gender',
 'Customer Type',
 'Age',
 'Type of Travel',
 'Class',
 'Flight Distance',
 'Seat comfort',
 'Departure/Arrival time convenient',
 'Food and drink',
 'Gate location',
 'Inflight wifi service',
 'Inflight entertainment',
 'Online support',
 'Ease of Online booking',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Cleanliness',
 'Online boarding',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes']

## String Indexing and One Hot Encoder

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [0]:
str_cols

Out[83]: ['satisfaction', 'Gender', 'Customer Type', 'Type of Travel', 'Class']

### Index Label Columns

In [0]:
indexer = StringIndexer(inputCol='satisfaction', outputCol='label')
index = indexer.fit(df)
df = index.transform(df)

## Convert Features Columns into Vector OHE

In [0]:
ohe = []
indexer = []
encoder = []
for col in str_cols[1:]:
    output_ind = col + "Ind"
    output_vect = col + "Vec"
    indexer.append(StringIndexer(inputCol=col, outputCol=output_ind))
    encoder.append(OneHotEncoder(inputCol=output_ind, outputCol=output_vect))
    ohe.append(output_vect)


In [0]:
nstr_cols

Out[101]: ['Age',
 'Flight Distance',
 'Seat comfort',
 'Departure/Arrival time convenient',
 'Food and drink',
 'Gate location',
 'Inflight wifi service',
 'Inflight entertainment',
 'Online support',
 'Ease of Online booking',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Cleanliness',
 'Online boarding',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes']

### assemble all Feature Columns into one

In [0]:
input_cols = nstr_cols + ohe
assembler = VectorAssembler(inputCols=input_cols,outputCol="features")

## Logistic Regression

In [0]:
from pyspark.ml.classification import LogisticRegression

In [0]:
lr = LogisticRegression(featuresCol='features', labelCol="label", predictionCol='prediction')

## pipeline

In [0]:
from pyspark.ml import Pipeline

In [0]:
stages = indexer + encoder + [assembler, lr]
pipeline = Pipeline(stages=stages)

### Split the data into Train and Test in 70-30

In [0]:
train, test = df.randomSplit([0.7, 0.3])

In [0]:
fit_model = pipeline.fit(train)

In [0]:
result = fit_model.transform(test)

In [0]:
result.select('satisfaction', 'label', 'prediction', 'probability').show()

+------------+-----+----------+--------------------+
|satisfaction|label|prediction|         probability|
+------------+-----+----------+--------------------+
|dissatisfied|  1.0|       0.0|[0.87206612086272...|
|dissatisfied|  1.0|       0.0|[0.82074678114694...|
|dissatisfied|  1.0|       0.0|[0.71203485561072...|
|dissatisfied|  1.0|       0.0|[0.93924251699435...|
|dissatisfied|  1.0|       1.0|[0.18712622051982...|
|dissatisfied|  1.0|       1.0|[0.09396864514124...|
|dissatisfied|  1.0|       0.0|[0.63890634197709...|
|dissatisfied|  1.0|       1.0|[0.21412476994584...|
|dissatisfied|  1.0|       1.0|[0.42018694199280...|
|dissatisfied|  1.0|       0.0|[0.69379414473621...|
|dissatisfied|  1.0|       0.0|[0.76622069595541...|
|dissatisfied|  1.0|       0.0|[0.57204467623110...|
|dissatisfied|  1.0|       1.0|[0.01157427483969...|
|dissatisfied|  1.0|       0.0|[0.80933020399834...|
|dissatisfied|  1.0|       0.0|[0.77909043113628...|
|dissatisfied|  1.0|       0.0|[0.900011983903

# Evaluator

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
evalator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label")

In [0]:
AUC = evalator.evaluate(result)

In [0]:
AUC

Out[121]: 0.8357227813922075

In [0]:
result.columns

Out[123]: ['satisfaction',
 'Gender',
 'Customer Type',
 'Age',
 'Type of Travel',
 'Class',
 'Flight Distance',
 'Seat comfort',
 'Departure/Arrival time convenient',
 'Food and drink',
 'Gate location',
 'Inflight wifi service',
 'Inflight entertainment',
 'Online support',
 'Ease of Online booking',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Cleanliness',
 'Online boarding',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes',
 'label',
 'GenderInd',
 'Customer TypeInd',
 'Type of TravelInd',
 'ClassInd',
 'GenderVec',
 'Customer TypeVec',
 'Type of TravelVec',
 'ClassVec',
 'features',
 'rawPrediction',
 'probability',
 'prediction']