In [1]:
%run Users/quentin.picard@gmail.com/Shared_Team/Helpers

# Load Data

In [3]:
# Read the impressions dataset from Parquet into a Spark DataFrame.
# `spark` is not created in this notebook; presumably it is the session provided by the runtime.
impressions = spark.read.parquet('/mnt/nycdsa/yongguang/impressions_mini_final/')

In [4]:
from pyspark.sql.functions import *
# impressions.groupBy('landingPage').count().count()
# Approximate number of distinct values in the `zip` column; collect() returns the
# single aggregated row to the driver.
impressions.agg(approx_count_distinct(impressions.zip)).collect()

# Feature Engineering

In [7]:
# Keep only the columns used downstream: candidate features, the `clicked` label and
# the `TrainTestFlag` column that is later used to split the data.
allColumns = ['campaign', 'adSize', 'adType', 'deviceType', 'gender', 'os', 'landingPage', \
              'region', 'country', 'venueType', 'timestamp', 'iabCategories', 'age', 'clicked', 'TrainTestFlag']
impressions = impressions.select(*allColumns)
# The transformer and the cleanup_* / clean_* / format_* / reduce_* helpers are not defined
# in this notebook; presumably they come from the %run'd Helpers notebook — confirm their
# exact behaviour there.
# NOTE(review): later cells use IAB1..IAB26, ageGroup, timestamp_weekday and timestamp_hour,
# none of which are in allColumns — presumably these helpers create them.
catTransformer = iabCategoriesTransformer(inputCol="iabCategories")
impressions = catTransformer.transform(impressions)
impressions = cleanup_age_category(impressions)
impressions = cleanup_gender(impressions)
impressions = cleanup_os(impressions)
impressions = clean_landingPage(impressions)
impressions = cleanup_country(impressions)
impressions = format_region(impressions)
# NOTE(review): 55 is presumably the cap on distinct `region` levels — confirm in Helpers.
impressions = reduce_cardinality(impressions, 'region', 55)
impressions = cleanup_timestamp(impressions)

In [8]:
# Render the engineered DataFrame for visual inspection.
display(impressions)

# Modeling

In [10]:
# Split on the precomputed flag: TrainTestFlag == 0 rows are the training set, == 1 the test set.
train, test = impressions.filter(impressions.TrainTestFlag==0), impressions.filter(impressions.TrainTestFlag==1)

# Mark both splits for caching; they are reused by several later cells.
train.cache()
test.cache()

In [11]:
# Stratified sample on `clicked`: the same 1% fraction for class 0 and class 1, with a
# fixed seed. This small subset is what the pipeline is fitted on later.
train_tiny = train.sampleBy('clicked', fractions={0: 0.01, 1: 0.01}, seed=0)

### Class Weighting Begin (Optional)

In [13]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
def balanceRatio(df):
  """Return 1 minus the share of rows in `df` where `clicked == 0`.

  Runs two count jobs on `df`: one on the filtered rows and one on all rows.
  """
  zero_rows = df.where(df.clicked == 0).count()
  return 1 - float(zero_rows) / df.count()

# Row-level weight: rows with label 0.0 get the balance ratio `br`, all other rows get
# its complement, so the two classes are weighted inversely to their share.
calculateWeights = udf(lambda d, br: br if d == 0.0 else (1.0 - br), DoubleType())

In [14]:
# Add a per-row `classWeight` column to the training set. The balance ratio is computed
# once on the driver and handed to the UDF as a literal column.
train =\
train.withColumn('classWeight', calculateWeights(train.clicked, lit(balanceRatio(train))))

### / Class Weighting End

In [16]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder 

# Columns treated as categorical: each one is string-indexed and then one-hot encoded by
# the stages built below.
categoricalColumns = [ "region", "adSize", "adType", "deviceType", "gender", "landingPage",\
                      "os", "venueType", "ageGroup", "timestamp_weekday", "timestamp_hour"]

# One StringIndexer per categorical column (<col> -> <col>Index). handleInvalid="skip"
# drops rows whose value was not seen when the indexer was fitted.
indexStages = [
  StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index", handleInvalid="skip")
  for categoricalCol in categoricalColumns
]

# One OneHotEncoder per indexed column (<col>Index -> <col>ClassVec).
encodeStages = [
  OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"ClassVec")
  for categoricalCol in categoricalColumns
]

In [17]:
from pyspark.ml.feature import VectorAssembler

# Column names IAB1 .. IAB26, fed to the assembler as-is.
# NOTE(review): presumably produced by iabCategoriesTransformer — confirm.
numericalColumns = ["IAB"+str(i) for i in range(1, 27)]

# Concatenate the IAB columns and the one-hot vectors into a single `features` vector column.
Assembler = VectorAssembler(inputCols= numericalColumns + \
                                      [categoricalCol + "ClassVec" for categoricalCol in categoricalColumns], 
                            outputCol="features")

In [18]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Logistic regression on the assembled `features` vector with `clicked` as the label.
lr = LogisticRegression(featuresCol="features", 
                        labelCol="clicked")
#                         weightCol="classWeight") # apply weights to observations
# NOTE(review): the weightCol option above is disabled. `train_tiny` was sampled before
# `classWeight` was added to `train`, so it has no such column to use.

# Stage order: all indexers, then all encoders, then the assembler, then the classifier.
lrPipeline = Pipeline(stages = indexStages + encodeStages +[Assembler, lr])

# Fit on the 1% sample (`train_tiny`), not on the full training set.
lrModel = lrPipeline.fit(train_tiny)

In [19]:
# Score the test set with the fitted pipeline, then print a contingency table of the
# predicted label against the actual `clicked` label.
lrPredict = lrModel.transform(test)
lrPredict.crosstab('prediction', 'clicked').show()

In [20]:
def lr_model_summary(model, test_data):
  '''
  Return the training summary and a test-set evaluation for a fitted pipeline.

  model:     fitted pipeline whose last stage is the classifier; that stage must
             expose `summary` and `evaluate`.
  test_data: raw (un-transformed) data to evaluate on.

  Returns a tuple (training summary of the last stage, result of evaluating the
  last stage on `test_data` after it has passed through every earlier stage).
  '''
  # Work on a copy so the caller's pipeline keeps all of its stages.
  feature_pipeline = model.copy()
  feature_pipeline.stages = feature_pipeline.stages[:-1]

  classifier = model.stages[-1]
  training_summary = classifier.summary
  test_summary = classifier.evaluate(feature_pipeline.transform(test_data))
  return training_summary, test_summary

In [21]:
# Unpack the training summary and the test-set evaluation for the fitted pipeline.
train_summary, test_summary = lr_model_summary(lrModel,test)

In [22]:
import matplotlib.pyplot as plt
# Objective value recorded at each training iteration.
train_summary.objectiveHistory

In [23]:
# Show the documentation, default and current value of the regularization parameter.
# The parameter is passed by name: a bare `regParam` identifier is not defined in this
# notebook and raised a NameError.
lr.explainParam("regParam")
# lr.getMaxIter()

In [24]:
# Number of iterations the optimizer ran during training.
train_summary.totalIterations

In [25]:
# NOTE(review): `model_summary` is not defined in this notebook (the function defined here is
# `lr_model_summary`, which returns a tuple). Presumably it comes from the %run'd Helpers
# notebook and returns a single summary object for `test` — confirm.
lrModelSummary = model_summary(lrModel, test)
lrModelSummary.areaUnderROC

In [26]:
# Area under precision-recall curve
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the precision-recall curve, computed on the scored test set from the
# model's raw prediction column.
evaluator_PR = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                         labelCol="clicked",
                                         metricName="areaUnderPR")
lr_AUPR = evaluator_PR.evaluate(lrPredict)
# Function-call form of print: same output for a single value, matches the other print
# calls in this notebook, and is also valid on Python 3.
print(lr_AUPR)

In [27]:
#plot_evaluation_curve([lrModel, lr_constant_class_weight_downsample], ['lr_balance_class_weight_downsample', 'lr_constant_class_weight_downsample'], test, curve_type='pr')

In [28]:
from sklearn.metrics import log_loss

# Build the log_loss inputs from the scored test set; y_true / y_score were previously
# referenced without being defined anywhere in the notebook.
# NOTE(review): collect() brings every scored test row to the driver — confirm the test
# set is small enough, or compute the loss inside Spark instead.
scored_rows = lrPredict.select('clicked', 'probability').collect()
y_true = [int(row['clicked']) for row in scored_rows]
# Element 1 of the probability vector is the predicted probability of label 1.
y_score = [float(row['probability'][1]) for row in scored_rows]

log_loss(y_true, y_score, eps=1e-15, normalize=True)

In [29]:
# Plot an evaluation curve for the fitted pipeline on the test set.
# `plot_evaluation_curve` is not defined in this notebook; presumably it comes from the
# %run'd Helpers notebook, which also decides the default curve type — confirm.
plot_evaluation_curve([lrModel], ['lrModel'], test)

In [30]:
# Keep a copy of the current fitted pipeline under a descriptive name.
lr_constant_class_weight_downsample = lrModel.copy()

# Save Model

In [32]:
# Persist the fitted pipeline, replacing any model already stored at this path.
lrModel.write().overwrite().save('/mnt/nycdsa/yongguang/models/lr_constant_class_weight')

In [33]:
# Keep another copy of the current fitted pipeline.
# NOTE(review): this copies the same `lrModel` as `lr_constant_class_weight_downsample`, and
# that model was fitted with `weightCol` commented out — confirm the name reflects the
# model actually held here.
lr_balanced_class_weight = lrModel.copy()

Daniel: looking at coefficients

In [35]:
# Print the column names and types of the scored test DataFrame.
lrPredict.printSchema()

In [36]:
# Show the raw categorical columns followed by their one-hot encoded `<col>ClassVec` vectors.
display(lrPredict.select(categoricalColumns + [x + 'ClassVec' for x in categoricalColumns]))

In [37]:
# Inspect the model's output columns next to the assembled feature vector.
display(lrPredict.select('rawPrediction', 'features', 'probability','prediction'))

In [38]:
# The last stage of the fitted pipeline is the fitted logistic regression model.
log_reg_model = lrModel.stages[-1]

In [39]:
# Debug check: because of the str(...) wrapper the result is always `str`; this only
# confirms that the summary's featuresCol value can be turned into a string.
type(str(log_reg_model.summary.featuresCol))

In [40]:
from itertools import chain

# Read the ML attribute metadata stored on the features column of the scored DataFrame
# and flatten its attribute groups into (index, name) pairs sorted by index.
features_col_name = str(log_reg_model.summary.featuresCol)
ml_attr_groups = lrPredict.schema[features_col_name].metadata["ml_attr"]["attrs"]

attrs = sorted(
    (attr["idx"], attr["name"])
    for attr in chain.from_iterable(ml_attr_groups.values()))

In [41]:
# Pair each feature name with the model coefficient at that feature's index.
features_coef = [(name, log_reg_model.coefficients[idx]) for idx, name in attrs]

In [42]:
# Show the (feature name, coefficient) pairs.
features_coef

In [43]:
# Horizontal bar chart of the logistic regression coefficients, one bar per feature.
# Matplotlib settings are reset to their defaults first.
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
 
# Feature names (tick labels), bar positions and bar lengths, all in `features_coef` order.
objects = [x[0] for x in features_coef]
y_pos = np.arange(len(features_coef))
coefficients = [x[1] for x in features_coef]

# Clear the current figure, then open a tall one so every feature label gets its own row.
plt.clf()
plt.figure(figsize = (15,35))
plt.barh(y_pos, coefficients, align='center', alpha=0.5)
plt.yticks(y_pos, objects)
# NOTE(review): the label says "Normalized", but this cell plots the coefficients exactly
# as read from the model — confirm the wording.
plt.xlabel('Normalized Feature Weights')
plt.title('LR Model Coefficients')
 
display(plt.show())

In [44]:
# Training summary attached to the fitted logistic regression model.
trainingSummary = log_reg_model.summary

In [45]:
# List the attributes available on the training summary. The cell previously held an
# unfinished attribute access (`trainingSummary.`), which is a syntax error.
dir(trainingSummary)

In [46]:
# Print the coefficients and intercept of the fitted logistic regression model
print("Coefficients: %s" % str(log_reg_model.coefficients))
print("Intercept: %s" % str(log_reg_model.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = log_reg_model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))


In [48]:
# Area under the ROC curve from the training summary, i.e. on the data the model was fitted on.
print("areaUnderROC: %f" % trainingSummary.areaUnderROC)

In [49]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve, computed on the scored test set from the model's raw
# prediction column.
evaluator_ROC = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                         labelCol="clicked",
                                         metricName="areaUnderROC")
lr_AUROC = evaluator_ROC.evaluate(lrPredict)
# Function-call form of print: same output for a single value, matches the other print
# calls in this notebook, and is also valid on Python 3.
print(lr_AUROC)

In [50]:
# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
# Both come from the training summary, so they describe the data the model was fitted on.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
# Find the largest F-Measure in the per-threshold table, then read back the threshold
# of a row that attains it.
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
# NOTE(review): this sets the threshold on the estimator `lr`, not on the already fitted
# `lrModel` / `log_reg_model`; presumably it only affects models fitted after this point
# and leaves `lrPredict` unchanged — confirm that is intended.
lr.setThreshold(bestThreshold)

In [51]:
# Show the precision-recall data held by the summary object built from `lrModel` and `test`.
display(lrModelSummary.pr) # this ignores class weights, which might give invalid results

# Model Selection via Cross-Validation

In [56]:
# List the contents of the models directory on the mounted storage.
dbutils.fs.ls('/mnt/nycdsa/yongguang/models/')

In [57]:
from pyspark.ml import PipelineModel

# Reload the pipeline that this notebook saves under the `models/` directory. The path
# previously omitted `models/`, so it did not match the save location.
temp = PipelineModel.load('dbfs:/mnt/nycdsa/yongguang/models/lr_constant_class_weight/')

In [58]:
# hi dimitri :)