In [24]:
from pyspark.context import SparkContext 
from pyspark.sql import SQLContext
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import Row
from pyspark.ml.linalg import DenseVector
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.classification import NaiveBayes
%pylab inline
sc = SparkContext.getOrCreate()

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [15]:
# Load data.
root = "./track2/"
title = ["AdID", "QueryID", "Depth", "Position", "UserID", "Impression", "Click"]
trainRDD = sc.textFile("./small.csv").map(lambda x: x.split(","))
trainDF = trainRDD.toDF(title)
trainDF.show(10)

+-----------+---------+-----+--------+--------+----------+-----+
|       AdID|  QueryID|Depth|Position|  UserID|Impression|Click|
+-----------+---------+-----+--------+--------+----------+-----+
|  3053284.0| 438228.0|  2.0|     1.0| 6232386|       1.0|  0.0|
|2.0043867E7|   5423.0|  2.0|     1.0|11181639|       1.0|  0.0|
|2.1331238E7|   2399.0|  2.0|     2.0|  170443|       1.0|  0.0|
|2.0036558E7|  31802.0|  2.0|     2.0|11183057|       1.0|  0.0|
|  9027213.0|      5.0|  2.0|     1.0|11183065|       1.0|  1.0|
|2.0915048E7|    177.0|  2.0|     2.0|11183646|       1.0|  0.0|
|  4370406.0|   1693.0|  3.0|     3.0| 3515210|       2.0|  0.0|
|2.0644045E7|     32.0|  1.0|     1.0|  678057|       1.0|  0.0|
|  2089356.0|1475900.0|  2.0|     2.0|11185489|       2.0|  0.0|
|  6454129.0|   1500.0|  2.0|     2.0| 3515981|       1.0|  0.0|
+-----------+---------+-----+--------+--------+----------+-----+
only showing top 10 rows



In [32]:
# Divide labels into 100 categortes.
trainDF = (trainDF.withColumn("Click",trainDF["Click"].cast("double"))
            .withColumn("Impression",trainDF["Impression"].cast("double"))
            .withColumn("AdID",trainDF["AdID"].cast("double"))
            .withColumn("QueryID",trainDF["QueryID"].cast("double"))
            .withColumn("Depth",trainDF["Depth"].cast("double"))
            .withColumn("Position",trainDF["Position"].cast("double"))
            .withColumn("UserID",trainDF["UserID"].cast("double")))
trainDF = trainDF.withColumn("CTR", trainDF["Click"] / trainDF["Impression"])
trainDF = trainDF.withColumn("100ctr", 100 * trainDF["CTR"]-(100 * trainDF["CTR"] % 1))

In [17]:
# Part 1: Group Duplicates To Reduce Data Size
def scoreClickAUC(numClicks, numImpressions, predictedCtr):
    """
    Calculates the area under the ROC curve (AUC) for click rates
    
    Parameters
    ----------
    numClicks : a list containing the number of clicks

    numImpressions : a list containing the number of impressions

    predictedCtr : a list containing the predicted click-through rates

    Returns
    -------
    auc : the area under the ROC curve (AUC) for click rates
    """
    iSorted = sorted(range(len(predictedCtr)), key=lambda i: predictedCtr[i], reverse=True)
    aucTemp = 0.0
    clickSum = 0.0
    oldClickSum = 0.0
    noClick = 0.0
    noClickSum = 0.0
    print(iSorted[:10])
    lastCtr = predictedCtr[iSorted[0]] + 1.0

    for i in range(len(predictedCtr)):
        if lastCtr != predictedCtr[iSorted[i]]: 
            aucTemp += (clickSum + oldClickSum) * noClick / 2.0        
            oldClickSum = clickSum
            noClick = 0.0
            lastCtr = predictedCtr[iSorted[i]]
        noClick += numImpressions[iSorted[i]] - numClicks[iSorted[i]]
        noClickSum += numImpressions[iSorted[i]] - numClicks[iSorted[i]]
        clickSum += numClicks[iSorted[i]]
    aucTemp += (clickSum + oldClickSum) * noClick / 2.0
    auc = aucTemp / (clickSum * noClickSum)
    return auc

In [10]:
# Gradient-boosted Tree
row = Row("label", "features", "impression", "click")
train = trainDF["100ctr", "AdID", "Depth", "Position", "QueryID", "UserID", "Impression", "Click"]
lf = train.rdd.map(lambda r: (row(r[0], DenseVector(r[1:-2]), r[-2], r[-1]))).toDF()
trainTemp, validateTemp = lf.randomSplit([0.7, 0.3], seed=153)
t = trainTemp.groupBy("label").count()
trainTemp = trainTemp.join(t, ["label"], "outer")
gbt = GBTRegressor(featuresCol="features", maxIter=50, labelCol="label").fit(trainTemp.filter(trainTemp["count"] > 15))
predictions = gbt.transform(validateTemp)
[pCTR,click,impression] = [predictions.select("prediction").rdd.flatMap(lambda r:r).collect(),
                           predictions.select("click").rdd.flatMap(lambda r:r).collect(),
                           predictions.select("impression").rdd.flatMap(lambda r:r).collect()]
res = scoreClickAUC(click,impression,pCTR)
print(res)
res = predictions.drop("label")
res = res.drop("features")
res.write.csv("testPrediction.csv")

[3864, 3865, 4290, 4291, 4294, 4510, 4512, 107459, 107916, 107917]
0.701323839912


In [18]:
testPath = root + "test.txt"
testRDD = sc.textFile(testPath,2)
test = testRDD.map(lambda x : x.split("\t"))
titleTest = ["Click", "Impression", "DisplayURl", "AdID", "AdvertiseId", "Depth", "Position", "QueryID", "KeywordID","TitleID","DescriptionID", "UserID"]
testDF = test.toDF(titleTest[2:])
testDF = testDF.drop("DisplayURl")
testDF = testDF.drop("AdvertiseId")
testDF = testDF.drop("KeywordID")
testDF = testDF.drop("TitleID")
testDF = testDF.drop("DescriptionID")
testDF = (testDF.withColumn("AdID",testDF["AdID"].cast("double"))
           .withColumn("QueryID",testDF["QueryID"].cast("double"))
           .withColumn("Depth",testDF["Depth"].cast("double"))
           .withColumn("Position",testDF["Position"].cast("double"))
           .withColumn("UserID",testDF["UserID"].cast("double")))
testDF = testDF.withColumn("label",testDF["AdID"] * 0)
testDF = testDF.withColumn("click",testDF["AdID"] * 0)
testDF = testDF.withColumn("impression",testDF["AdID"] * 0)
testDF = testDF[["label", "AdID", "Depth", "Position", "QueryID", "UserID", "impression", "click"]]
rowT = Row("label", "features")
testLF = testDF.rdd.map(lambda r: (row(r[0], DenseVector(r[1:-2]), r[-2],r[-1]))).toDF()
testLF.show(5)

+-----+--------------------+----------+-----+
|label|            features|impression|click|
+-----+--------------------+----------+-----+
|  0.0|[2.2166806E7,3.0,...|       0.0|  0.0|
|  0.0|[3111380.0,3.0,1....|       0.0|  0.0|
|  0.0|[2.2111129E7,3.0,...|       0.0|  0.0|
|  0.0|[2.2166807E7,2.0,...|       0.0|  0.0|
|  0.0|[2.0180629E7,2.0,...|       0.0|  0.0|
+-----+--------------------+----------+-----+
only showing top 5 rows



In [20]:
validateTemp.show(5)
preT = gbt.transform(testLF)
preTe = preT.drop("features")
preTe.write.csv("testPrediction.csv")

+-----+--------------------+----------+-----+
|label|            features|impression|click|
+-----+--------------------+----------+-----+
|  0.0|[1000467.0,2.0,2....|       1.0|  0.0|
|  0.0|[1000515.0,2.0,1....|       1.0|  0.0|
|  0.0|[1000515.0,2.0,1....|       1.0|  0.0|
|  0.0|[1000515.0,2.0,2....|       1.0|  0.0|
|  0.0|[1000515.0,3.0,2....|       4.0|  0.0|
+-----+--------------------+----------+-----+
only showing top 5 rows



In [21]:
# Random Forest
row = Row("label", "features", "click", "impression")
lf = train.rdd.map(lambda r: (row(r[0], DenseVector(r[1:5]), r[6],r[7]))).toDF()
trainTemp, validateTemp =lf.randomSplit([0.7, 0.3], seed=121)
randomForest = RandomForestRegressor(numTrees=100, labelCol="label",featuresCol="features").fit(trainTemp)
predictions = randomForest.transform(validateTemp)
predictions.select("prediction", "click", "impression").show(5)

+------------------+-----+----------+
|        prediction|click|impression|
+------------------+-----+----------+
| 6.853852520724911|  1.0|       0.0|
|2.8864922254844765|  1.0|       0.0|
| 7.616224061531064|  1.0|       0.0|
| 6.791668370371862|  2.0|       0.0|
|    8.426942936449|  5.0|       0.0|
+------------------+-----+----------+
only showing top 5 rows



In [22]:
tep = predictions["prediction","click","impression"]
tep = tep.groupBy("prediction").agg({"click":"sum", "impression":"sum"}).sort("prediction", ascending = False)
[pCTR,click,impression] = [tep.select("prediction").rdd.flatMap(lambda r:r).collect(),
                           tep.select("sum(click)").rdd.flatMap(lambda r:r).collect(),
                           tep.select("sum(impression)").rdd.flatMap(lambda r:r).collect()]
print(scoreClickAUC(click,impression,pCTR))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
0.504435915172
