# Feature engineering

In [1]:
!pip install -q pyspark
!pip install -q ipython-autotime

In [2]:
%load_ext autotime

time: 134 µs (started: 2022-07-08 19:30:48 +00:00)


In [3]:
def download_file_google_drive(id: str, f_name: str, f_save_path: str=""):
    """
    Download file from Google Drive

    Args:
        id (str): Google Drive File ID
        f_name (str): File name
        f_save_path (str): File save path. 
            default: Current working directory
    """
    import os, requests

    def get_confirm_token(response):
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        CHUNK_SIZE = 32768
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)

    URL = "https://docs.google.com/uc?export=download"

    f_save_path = os.getcwd() if not f_save_path else f_save_path.rstrip("/")
    destination = f"{f_save_path}/{f_name}"

    session = requests.Session()
    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)

    if token:
        params = { 'id' : id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)
    save_response_content(response, destination)    
   

                
file_id = "1dacdNyWhG2Kqbhkk_SdRqKMzjUGDC_BO"
file_name = 'Model_Improvement_Dataset.csv'
download_file_google_drive(file_id, file_name)

time: 2.3 s (started: 2022-07-08 19:30:48 +00:00)


In [4]:
# start a spark application
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

time: 7.17 s (started: 2022-07-08 19:30:51 +00:00)


In [5]:
# read the data 
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)

df = sqlContext.read.csv("/content/Model_Improvement_Dataset.csv", header=True, inferSchema=True)



time: 15.1 s (started: 2022-07-08 19:30:58 +00:00)


In [6]:
from pyspark.sql.types import IntegerType
from math import floor
from pyspark.sql.functions import rand
from pyspark.sql.functions import col

def stratifiedSample(df, N, labelCol="y"):
    ctx = df.groupby(labelCol).count()
    ctx = ctx.withColumn('frac', col("count") / df.count())
    frac = ctx.select("y", "frac").rdd.collectAsMap()
    pos = int(floor(frac[1] * N))
    neg = int(floor(frac[0] * N))
    posDF = df.filter(col(labelCol) == 1).orderBy(rand()).limit(pos)
    negDF = df.filter(col(labelCol) == 0).orderBy(rand()).limit(neg)
    return posDF.unionAll(negDF).orderBy(rand())

df = df.withColumn("y", df["click"].cast(IntegerType()))
xdf = stratifiedSample(df, 1_000_000)

xdf.printSchema()

root
 |-- click: boolean (nullable = true)
 |-- C1: integer (nullable = true)
 |-- banner_pos: integer (nullable = true)
 |-- site_id: string (nullable = true)
 |-- site_domain: string (nullable = true)
 |-- site_category: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_domain: string (nullable = true)
 |-- app_category: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_ip: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- device_type: integer (nullable = true)
 |-- device_conn_type: integer (nullable = true)
 |-- C14: integer (nullable = true)
 |-- C15: integer (nullable = true)
 |-- C16: integer (nullable = true)
 |-- C17: integer (nullable = true)
 |-- C18: integer (nullable = true)
 |-- C19: integer (nullable = true)
 |-- C20: integer (nullable = true)
 |-- C21: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- 

In [7]:
xdf.printSchema()

root
 |-- click: boolean (nullable = true)
 |-- C1: integer (nullable = true)
 |-- banner_pos: integer (nullable = true)
 |-- site_id: string (nullable = true)
 |-- site_domain: string (nullable = true)
 |-- site_category: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_domain: string (nullable = true)
 |-- app_category: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_ip: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- device_type: integer (nullable = true)
 |-- device_conn_type: integer (nullable = true)
 |-- C14: integer (nullable = true)
 |-- C15: integer (nullable = true)
 |-- C16: integer (nullable = true)
 |-- C17: integer (nullable = true)
 |-- C18: integer (nullable = true)
 |-- C19: integer (nullable = true)
 |-- C20: integer (nullable = true)
 |-- C21: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- 

In [8]:
# select the columns 'site_id','site_domain','site_category','app_id','app_domain','app_category' and display them, 
# so that you can analyse the values as they were shown in the segment 

cols = ["site_id", "site_domain", "site_category", "app_id", "app_domain", "app_category"]
xdf.select(*cols).show()


+--------+-----------+-------------+--------+----------+------------+
| site_id|site_domain|site_category|  app_id|app_domain|app_category|
+--------+-----------+-------------+--------+----------+------------+
|5bcf81a2|   9d54950b|     f028772b|ecad2386|  7801e8d9|    07d7df22|
|85f751fd|   c4e18dd6|     50e219e0|7e7baafa|  2347f47a|    0f2161f8|
|a7853007|   7e091613|     f028772b|ecad2386|  7801e8d9|    07d7df22|
|1fbe01fe|   f3845767|     28905ebd|ecad2386|  7801e8d9|    07d7df22|
|85f751fd|   c4e18dd6|     50e219e0|576ea112|  2347f47a|    0f2161f8|
|85f751fd|   c4e18dd6|     50e219e0|e2a1ca37|  2347f47a|    8ded1f7a|
|e151e245|   7e091613|     f028772b|ecad2386|  7801e8d9|    07d7df22|
|6c5b482c|   7687a86e|     3e814130|ecad2386|  7801e8d9|    07d7df22|
|1fbe01fe|   f3845767|     28905ebd|ecad2386|  7801e8d9|    07d7df22|
|85f751fd|   c4e18dd6|     50e219e0|febd1138|  82e27996|    0f2161f8|
|85f751fd|   c4e18dd6|     50e219e0|febd1138|  82e27996|    0f2161f8|
|85f751fd|   c4e18dd

In [9]:
# create 3 new columns in the datafeame named 'pub_id', 'pub_domain', 'pub_category' 
# these columns will have the data from 'site_id','site_domain','site_category'
# Use the withColumn method 

xdf = xdf.withColumn("pub_id", xdf["site_id"])
xdf = xdf.withColumn("pub_domain", xdf["site_domain"])
xdf = xdf.withColumn("pub_category", xdf["site_category"])

time: 96 ms (started: 2022-07-08 19:31:20 +00:00)


In [10]:
# replace the values in the columns 'pub_id', 'pub_domain', 'pub_category' with the values 'app_id','app_domain','app_category'
# the replcement should only happen if the value in site_id is '85f751fd'
# you will need withColumn and when functions

from pyspark.sql import functions as F

xdf = xdf.withColumn("pub_id", F.when(F.col("site_id") == "85f751fd", F.col("app_id")).otherwise(F.col("site_id")))
xdf = xdf.withColumn("pub_domain", F.when(F.col("site_id") == "85f751fd", F.col("app_domain")).otherwise(F.col("site_domain")))
xdf = xdf.withColumn("pub_category", F.when(F.col("site_id") == "85f751fd", F.col("app_category")).otherwise(F.col("site_category")))


time: 202 ms (started: 2022-07-08 19:31:20 +00:00)


In [11]:
# drop all the columns which are now redundunt 
xdf = xdf.drop(*tuple(cols))

time: 54.6 ms (started: 2022-07-08 19:31:21 +00:00)


In [12]:
# check the frequency of each category in device_id. did you get a similar distribution to the one shown in the segment? 
# use groupBy and count
from pyspark.sql.functions import desc

xdf.groupby("device_id").count().sort(desc("count")).show()

+---------+-----+
|device_id|count|
+---------+-----+
| a99f214a|82579|
| c357dbff|   62|
| 0f7c61dc|   51|
| afeffc18|   34|
| 936e92fb|   27|
| cef4c8cc|   12|
| 28dc8687|   11|
| 9af87478|    9|
| d857ffbb|    9|
| 02da5312|    8|
| b09da1c4|    7|
| 987552d1|    7|
| 3cdb4052|    6|
| 03559b29|    6|
| 1168ce02|    5|
| abab24a7|    5|
| 69401309|    4|
| 787d2bb0|    4|
| 2c112522|    4|
| 243bd3ab|    4|
+---------+-----+
only showing top 20 rows

time: 2.81 s (started: 2022-07-08 19:31:21 +00:00)


In [13]:
# combine device_ip and device_type to form a new column which can identify users. 
# name the new column 'user' with data from device_id
xdf = xdf.withColumn("user", xdf["device_id"])

time: 57.5 ms (started: 2022-07-08 19:31:24 +00:00)


In [14]:
# In that column whereever the device_id is missing use a combination of device_ip and device_type
# You will have to use the withcolumn along with a when condition. 
# You will the combination of device id and device ip can be done usin the following code. 
# from pyspark.sql import functions as sf
# sf.concat(sf.col('device_ip'),sf.lit('-'), sf.col('device_model'))

# use this code inside a when condition

xdf = xdf.withColumn("user", F.when(F.col("device_id") == "a99f214a", 
                                    F.concat(F.col("device_ip"), 
                                             F.lit("-"), 
                                             F.col("device_model"))).otherwise(F.col("device_id")))

time: 95.2 ms (started: 2022-07-08 19:31:24 +00:00)


In [15]:
# find the frequency of each category in the user column, save this frequency in a new coulumn in the data frame
# join this dataframe with the earlier df, make sure that when you join all the rows 
# form the original table need to be present in the final table as well.
# use the join table command
xdf.groupby("user").count().sort(desc("count")).show()


+-----------------+-----+
|             user|count|
+-----------------+-----+
|af62faf4-3bb1ddd7|   96|
|285aa37d-3bb1ddd7|   92|
|2f323f36-3bb1ddd7|   92|
|6394f6f6-3bb1ddd7|   91|
|d90a7774-3bb1ddd7|   88|
|af9205f9-3bb1ddd7|   88|
|6b9769f2-8a4875bd|   87|
|009a7861-3bb1ddd7|   86|
|930ec31d-3bb1ddd7|   84|
|431b3174-8a4875bd|   69|
|         c357dbff|   62|
|         0f7c61dc|   51|
|6394f6f6-03683bd4|   38|
|af9205f9-cad4c01d|   35|
|         afeffc18|   34|
|6b9769f2-1f0bc64f|   33|
|af62faf4-cad4c01d|   33|
|af62faf4-03683bd4|   32|
|930ec31d-cad4c01d|   30|
|009a7861-03683bd4|   30|
+-----------------+-----+
only showing top 20 rows

time: 3.73 s (started: 2022-07-08 19:31:24 +00:00)


In [16]:
# combine user and hour column to make a new columns user-hour. 
# again add a columns with the frequency of occurance each user-hour combiation

xdf = xdf.withColumn("user_hour", F.concat(F.col("user"), 
                                           F.lit("-"), 
                                           F.col("hour")))

time: 171 ms (started: 2022-07-08 19:31:27 +00:00)


In [17]:
# drop all redundent columns

redundent_cols = ["device_id", "device_ip", "device_model", "user", "month", "hour"]

xdf = xdf.drop(*tuple(redundent_cols))


time: 43 ms (started: 2022-07-08 19:31:28 +00:00)


In [18]:
# find the number of catagories in each columns 
# use the countdistinct method
# create a list of all columns and then loop through each column counting the distinct values in it. 

from pyspark.sql.functions import col, countDistinct

xdf.agg(*(countDistinct(col(c)).alias(c) for c in xdf.columns)).show()

# here * instructs the add function to accept whatever number of columns are inside the brakets. 


+-----+---+----------+-----------+----------------+----+---+---+---+---+---+---+---+---------+---+---+------+----------+------------+---------+
|click| C1|banner_pos|device_type|device_conn_type| C14|C15|C16|C17|C18|C19|C20|C21|dayofweek|day|  y|pub_id|pub_domain|pub_category|user_hour|
+-----+---+----------+-----------+----------------+----+---+---+---+---+---+---+---+---------+---+---+------+----------+------------+---------+
|    2|  7|         7|          4|               4|1722|  8|  9|399|  4| 64|154| 60|        7| 10|  2|  2837|      1427|          40|    97154|
+-----+---+----------+-----------+----------------+----+---+---+---+---+---+---+---+---------+---+---+------+----------+------------+---------+

time: 6.8 s (started: 2022-07-08 19:31:28 +00:00)


In [19]:
# drop colums which have lot of dictinct catagories. 


time: 1.17 ms (started: 2022-07-08 19:31:35 +00:00)


# Algorithm Building

Earlier in the module, Jaidev has performed model building exercise. You can refer to his notebooks for help..

In [20]:
# import all the necessary libraries 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

time: 310 ms (started: 2022-07-08 19:31:35 +00:00)


In [21]:
# create a list of catagorical columns wich are represented by integers. these will not be sent through the string indexer

categorical_cols = ["C1", "banner_pos", "device_type", "device_conn_type"]
xdf.select(*categorical_cols).show()

+----+----------+-----------+----------------+
|  C1|banner_pos|device_type|device_conn_type|
+----+----------+-----------+----------------+
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1005|         1|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               2|
|1005|         0|          1|               2|
|1005|         1|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1005|         1|          1|               0|
|1005|         0|          1|               0|
|1005|         0|          1|               0|
|1002|       

In [22]:
# if you remember form the earlier model building exercise the column C20 has negative values. 
# replace the negative values with 0. you can use the same UDF made by Jaidev. 

posMapper = udf(lambda x: 0 if x < 0 else x)
xdf = xdf.withColumn("C20", posMapper(xdf["C20"]))
xdf = xdf.withColumn("C20", xdf["C20"].cast(IntegerType()))

time: 97.5 ms (started: 2022-07-08 19:31:37 +00:00)


In [23]:
# run the string indexer on the columns which are categorical but are represented by strings.

stringCols = ["pub_id", "pub_domain", "pub_category", "user_hour"]
for c in stringCols:
    outCol = c + "_ix"
    si = StringIndexer(inputCol=c, outputCol=outCol)
    xdf = si.fit(xdf).transform(xdf)

time: 18.1 s (started: 2022-07-08 19:31:37 +00:00)


In [24]:
# add the resultant columns from string indexer into the catagorical columns list. 
# make sure that you have the right column name for C20 as it has beed processed separately. 

categorical_cols.extend([f"{c}_ix" for c in ["pub_id", "pub_domain", "pub_category", "user_hour"]])
categorical_cols.extend([f"C{i}" for i in range(14, 22)])

time: 2.82 ms (started: 2022-07-08 19:31:55 +00:00)


In [25]:
# run all the categorical columns through the one hot encoder 
ohe = OneHotEncoder(inputCols=categorical_cols, outputCols=[f"{c}_enc" for c in categorical_cols])
xdf = ohe.fit(xdf).transform(xdf)

time: 9.89 s (started: 2022-07-08 19:31:55 +00:00)


In [26]:
# use the vector assember to combine all the encoded columns and the continuous variable columns 

train_cols = [f"{c}_enc" for c in categorical_cols]
train_cols.extend(["dayofweek", "day"])

assembler = VectorAssembler(inputCols=train_cols, outputCol='features')
xdf = assembler.transform(xdf)

time: 2.09 s (started: 2022-07-08 19:32:05 +00:00)


In [27]:
# split the data into training and testing data set

train, test = xdf.randomSplit([0.7, 0.3], seed=42)

time: 40.4 ms (started: 2022-07-08 19:32:07 +00:00)


In [28]:
# train model in the training dataset 

lr = LogisticRegression(featuresCol="features", labelCol="y")
model = lr.fit(train)

time: 6min 51s (started: 2022-07-08 19:32:07 +00:00)


In [29]:
# use the model to predict the class labels of test data. 

test_results = model.evaluate(test)

time: 11.5 s (started: 2022-07-08 19:38:58 +00:00)


In [30]:
# analyise the predictions 

preds_df = test_results.predictions
preds_df.show()

+-----+----+----------+-----------+----------------+-----+---+---+----+---+----+------+---+---------+---+---+--------+----------+------------+--------------------+---------+-------------+---------------+------------+-------------------+--------------+---------------+--------------------+-------------------+------------------+-------------------+--------------------+--------------------+------------------+------------------+-------------------+-------------+-------------------+--------------------+-----------------+--------------------+--------------------+--------------------+----------+
|click|  C1|banner_pos|device_type|device_conn_type|  C14|C15|C16| C17|C18| C19|   C20|C21|dayofweek|day|  y|  pub_id|pub_domain|pub_category|           user_hour|pub_id_ix|pub_domain_ix|pub_category_ix|user_hour_ix|             C1_enc|banner_pos_enc|device_type_enc|device_conn_type_enc|      pub_id_ix_enc| pub_domain_ix_enc|pub_category_ix_enc|    user_hour_ix_enc|             C14_enc|           C15_e

In [31]:
preds_df.describe()

DataFrame[summary: string, C1: string, banner_pos: string, device_type: string, device_conn_type: string, C14: string, C15: string, C16: string, C17: string, C18: string, C19: string, C20: string, C21: string, dayofweek: string, day: string, y: string, pub_id: string, pub_domain: string, pub_category: string, user_hour: string, pub_id_ix: string, pub_domain_ix: string, pub_category_ix: string, user_hour_ix: string, prediction: string]

time: 31.3 s (started: 2022-07-08 19:39:24 +00:00)


In [32]:
# Write the prediction data frame so that when you need it you can directly call that dataframe. No need to train again
# use you AWS acccout to create an S3 bucket 

# preds_df.toPandas().to_csv("test_results.csv")

time: 1.23 ms (started: 2022-07-08 19:39:55 +00:00)


# Finding the right threshold



In [33]:
# Read the data from the result data frame you created. 

preds_df.show()

+-----+----+----------+-----------+----------------+-----+---+---+----+---+----+------+---+---------+---+---+--------+----------+------------+--------------------+---------+-------------+---------------+------------+-------------------+--------------+---------------+--------------------+-------------------+------------------+-------------------+--------------------+--------------------+------------------+------------------+-------------------+-------------+-------------------+--------------------+-----------------+--------------------+--------------------+--------------------+----------+
|click|  C1|banner_pos|device_type|device_conn_type|  C14|C15|C16| C17|C18| C19|   C20|C21|dayofweek|day|  y|  pub_id|pub_domain|pub_category|           user_hour|pub_id_ix|pub_domain_ix|pub_category_ix|user_hour_ix|             C1_enc|banner_pos_enc|device_type_enc|device_conn_type_enc|      pub_id_ix_enc| pub_domain_ix_enc|pub_category_ix_enc|    user_hour_ix_enc|             C14_enc|           C15_e

In [34]:
# Create a new data frame with only the columns y and probabilities 
final_df = preds_df["y", "probability"]


time: 43.9 ms (started: 2022-07-08 19:40:11 +00:00)


In [35]:
final_df.show()

+---+--------------------+
|  y|         probability|
+---+--------------------+
|  0|[0.99999999994861...|
|  0|           [1.0,0.0]|
|  0|[0.99999999999986...|
|  0|[0.99999999999992...|
|  0|[0.99999999966008...|
|  0|[0.99999999999997...|
|  0|[0.99999999999997...|
|  0|[0.99998693153855...|
|  0|[0.99999999998575...|
|  0|[7.84554577197792...|
|  0|[0.99999999998868...|
|  0|[0.99999811044209...|
|  0|[2.61060966587335...|
|  0|[3.15017547741224...|
|  0|[0.99999999999999...|
|  0|[0.99999999999721...|
|  0|[0.99999999999721...|
|  0|[0.99999999999998...|
|  0|[0.99999999999970...|
|  0|[0.99999999999999...|
+---+--------------------+
only showing top 20 rows

time: 14.5 s (started: 2022-07-08 19:40:11 +00:00)


In [36]:
# the probability is a dense vector column. 
# You will haave to find a way to access the value of the probability refering to the positive class. 
# create a function to access the probability of the positive class form the 'probability' column

from pyspark.sql.functions import udf, when
from pyspark.sql.types import FloatType

element_extrac = udf(lambda v:float(v[1]),FloatType())

time: 2.65 ms (started: 2022-07-08 19:40:25 +00:00)


In [37]:
# Now you have the probabilities, your task is to add a class column. 
# To create this column, compare the probabilities with the threshold and decide the class. 
# You can make a loop which will repeat the same action for different thresholds
# Use the filter and count functions to find the true positives, false positives, false negatives. (inside the loop)
# Using these values, find the precision and recall for diffrent thresholds. (inside the loop)


time: 1.28 ms (started: 2022-07-08 19:40:26 +00:00)


In [38]:
# Use the precision and recall to make a choice of threshold. 



time: 779 µs (started: 2022-07-08 19:40:26 +00:00)


## Practice Questions

In [39]:
file_id = "1QgfDsksChGJ5-XmDk7-sn9le2up7ThnF"
file_name = 'segment1.csv'
download_file_google_drive(file_id, file_name)

time: 1.01 s (started: 2022-07-08 19:40:26 +00:00)


In [40]:
df = sqlContext.read.csv("/content/segment1.csv", header=True, inferSchema=True)

time: 705 ms (started: 2022-07-08 19:40:27 +00:00)


In [41]:
df.show()

+-----+----+----------+--------+-----------+-------------+--------+----------+------------+---------+---------+------------+-----------+----------------+-----+---+---+----+---+----+------+---+-----+---------+---+----+
|click|  C1|banner_pos| site_id|site_domain|site_category|  app_id|app_domain|add_category|device_id|device_ip|device_model|device_type|device_conn_type|  C14|C15|C16| C17|C18| C19|   C20|C21|month|dayofweek|day|hour|
+-----+----+----------+--------+-----------+-------------+--------+----------+------------+---------+---------+------------+-----------+----------------+-----+---+---+----+---+----+------+---+-----+---------+---+----+
|false|1005|         1|5ee41ff2|   17d996e6|     f028772b|ecad2386|  7801e8d9|    07d7df22| a99f214a| bd03f834|    daa861e9|          1|               0|20251|320| 50|2323|  0| 687|100081| 48|   10|        2| 22|   8|
|false|1005|         0|dcc019de|   250469f8|     3e814130|ecad2386|  7801e8d9|    07d7df22| a99f214a| a84d1037|    ecb851b2|    

In [42]:
df = sqlContext.read.csv("/content/segment1.csv", header=True, inferSchema=True)

si  = StringIndexer(inputCol='C14', outputCol='C14_si')
df = si.fit(df).transform(df)
ohe = OneHotEncoder(inputCols=['C14_si'], outputCols=['C14_enc'])
df= ohe.fit(df).transform(df)
df.select('C14_enc').show()

+-----------------+
|          C14_enc|
+-----------------+
| (964,[34],[1.0])|
|  (964,[7],[1.0])|
|  (964,[3],[1.0])|
|(964,[107],[1.0])|
|  (964,[6],[1.0])|
| (964,[31],[1.0])|
|(964,[153],[1.0])|
| (964,[10],[1.0])|
|  (964,[3],[1.0])|
|(964,[139],[1.0])|
|  (964,[8],[1.0])|
| (964,[57],[1.0])|
|(964,[205],[1.0])|
| (964,[11],[1.0])|
|  (964,[2],[1.0])|
| (964,[15],[1.0])|
| (964,[14],[1.0])|
|(964,[367],[1.0])|
|  (964,[6],[1.0])|
|  (964,[2],[1.0])|
+-----------------+
only showing top 20 rows

time: 1.31 s (started: 2022-07-08 19:40:28 +00:00)


In [43]:
df = sqlContext.read.csv("/content/segment1.csv", header=True, inferSchema=True)

df = df.withColumn("y", df["click"].cast(IntegerType()))

posMapper = udf(lambda x: 0 if x < 0 else x)
df = df.withColumn('C20_1', posMapper(df['C20']))
df = df.withColumn("C20_1int", df['C20_1'].cast(IntegerType()))


col = ['C1','C14','C15','C16','C17','C18','C19','C20_1int','C21']
ohe = OneHotEncoder(inputCols=col, outputCols=[c +'enc' for c in col])
df= ohe.fit(df).transform(df)

trainCols = [c+'enc' for c in col]

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=trainCols, outputCol='features')
df = assembler.transform(df)

train, test = df.randomSplit([0.8, 0.2], seed=12345)

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='y')
model = lr.fit(train)

result_test = model.evaluate(test)

result_test.areaUnderROC

0.6589750007219658

time: 3min 48s (started: 2022-07-08 19:40:29 +00:00)


In [44]:
df = sqlContext.read.csv("/content/segment1.csv", header=True, inferSchema=True)

df = df.withColumn("y", df["click"].cast(IntegerType()))

posMapper = udf(lambda x: 0 if x < 0 else x)
df = df.withColumn('C20_1', posMapper(df['C20']))
df = df.withColumn("C20_1int", df['C20_1'].cast(IntegerType()))


col = ['C1','C14','C15','C16','C17','C18','C19','C20_1int','C21']
ohe = OneHotEncoder(inputCols=col, outputCols=[c +'enc' for c in col])
df= ohe.fit(df).transform(df)

trainCols = [c+'enc' for c in col]

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=trainCols, outputCol='features')
df = assembler.transform(df)

train, test = df.randomSplit([0.8, 0.2], seed=12345)

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='y')
model = lr.fit(train)

result_test = model.evaluate(test)

result_test.recallByLabel

[0.9770992366412213, 0.06885245901639345]

time: 3min 42s (started: 2022-07-08 19:44:18 +00:00)


In [45]:
df = sqlContext.read.csv("/content/segment1.csv", header=True, inferSchema=True)

df = df.withColumn("y", df["click"].cast(IntegerType()))

posMapper = udf(lambda x: 0 if x < 0 else x)
df = df.withColumn('C20_1', posMapper(df['C20']))
df = df.withColumn("C20_1int", df['C20_1'].cast(IntegerType()))

col = ['C1','C14','C15','C16','C17','C18','C19','C20_1int','C21']
ohe = OneHotEncoder(inputCols=col, outputCols=[c +'enc' for c in col])
df= ohe.fit(df).transform(df)

trainCols = [c+'enc' for c in col]

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=trainCols, outputCol='features')
df = assembler.transform(df)

train, test = df.randomSplit([0.8, 0.2], seed=12345)

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='y')
model = lr.fit(train)

result_test = model.evaluate(test)

result_test.areaUnderROC

0.6589750007219658

time: 3min 39s (started: 2022-07-08 19:48:00 +00:00)


In [46]:
%unload_ext autotime