# Starter code for the datathon

In [1]:
# -*- coding: utf-8 -*-

import gc
import numpy as np
import pandas as pd
import platform
import random
import socket
import sys
import os
import re

#### Define the paths for the input and output files.

In [2]:
# Here we define the path to the data (input and output). Change it at will,
# or override it without editing the notebook by setting the
# DATATHON_DATA_PATH and/or DATATHON_OUTPUT_PATH environment variables.
if platform.system() == 'Windows':
    _DEFAULT_DATA_PATH = 'C:\\Users\\andreu.sancho\\Documents\\datathon\\data\\'
else:
    _DEFAULT_DATA_PATH = "/home/datascience/datathon/data/"

# File names are appended to these paths with plain string concatenation, so
# they must end with a path separator; os.path.join(path, "") adds one only
# when it is missing. An unset or empty environment variable falls back to
# the default, and OUTPUT_PATH falls back to DATA_PATH.
DATA_PATH = os.path.join(os.environ.get("DATATHON_DATA_PATH") or _DEFAULT_DATA_PATH, "")
OUTPUT_PATH = os.path.join(os.environ.get("DATATHON_OUTPUT_PATH") or DATA_PATH, "")

#### Define the file names.

In [3]:
# Names of the input files to read.
TRAINING_SET_FILE = 'training_set.csv'
EXTRA_TRAINING_SET_FILE = 'extra_training_set.csv'
TEST_SET_FILE = 'test_set.csv'
# Name of the output (prediction) file. Change it at will, but keep the
# '.csv.gzip' extension: the server expects it.
PREDICTION_FILE = 'baseline_prediction.csv.gzip'

#### Define the evaluation connection to the server. Don't worry too much about this code snippet.

In [19]:
def evaluate_prediction(client_hash: str, pred_file: str, host: str = "127.0.0.1", port: int = 7777, max_buffer_size: int = 4096) -> str:
    """Send a gzip-compressed prediction file to the evaluation server and return its reply.

    The exchange over the TCP connection is: the lower-cased client hash, the
    file size as 4 little-endian bytes, then the file contents in chunks of at
    most `max_buffer_size` bytes (one server reply is read after every chunk),
    and finally the server's answer, which is printed and returned.

    Args:
        client_hash: Team identifier of the form ``chXXX-190409`` where each
            ``X`` is ``0`` or ``1`` (case-insensitive).
        pred_file: Path to a gzip-compressed CSV whose header contains the
            columns ``case_id`` and ``prediction``.
        host: Address of the evaluation server.
        port: TCP port of the evaluation server.
        max_buffer_size: Chunk size used for both sending and receiving.

    Returns:
        The text received from the server, or ``'Not a valid submission!'``
        when the file lacks the required columns (no connection is opened in
        that case).

    Raises:
        SystemExit: If `client_hash` is malformed or the connection fails.
    """
    def convert_to_bytes(no: int) -> bytes:
        """Encode the low 32 bits of `no` as 4 little-endian bytes."""
        return (no & 0xFFFFFFFF).to_bytes(4, "little")

    def expect_ack(conn: socket.socket) -> None:
        """Read one server reply and warn when it is not the literal 'ack'."""
        if conn.recv(max_buffer_size).decode("utf8") != 'ack':
            print('> Server did not return an ACK!')

    # Check that the `client_hash` is valid. fullmatch() with a raw pattern
    # rejects trailing newlines, and [01] no longer lets a literal '|' through.
    client_hash = client_hash.lower()
    if not re.fullmatch(r'ch[01]{3}-190409', client_hash):
        print("> Error: unknown client hash!")
        sys.exit()

    score = 'Not a valid submission!'

    # Check that the header is correct *before* touching the network, so a
    # malformed file never opens (and leaks) a connection.
    header_df = pd.read_csv(pred_file, nrows=1, compression='gzip')
    if not {'case_id', 'prediction'}.issubset(header_df.columns):
        print("> File ", pred_file, " has missing fields! Ensure that 'case_id' and 'prediction' are in.")
        return score

    # The context manager closes the socket on every exit path, including
    # sys.exit() and unexpected exceptions.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as soc:
        try:
            soc.connect((host, port))
        except (OSError, OverflowError):
            # OSError covers refused/unreachable/timeout/DNS failures;
            # OverflowError is raised for an out-of-range port number.
            print("> Connection error!")
            sys.exit()

        print("> Sending the predictions to the Evaluation Server. It may take several minutes... Good luck!")
        # Send the client hash to the server.
        soc.sendall(client_hash.encode('utf8'))
        expect_ack(soc)
        # Send the size of the file in bytes (has to be exactly 4 bytes).
        soc.sendall(convert_to_bytes(os.path.getsize(pred_file)))
        expect_ack(soc)
        # Send the actual binary file, chunk by chunk.
        with open(pred_file, 'rb') as infile:
            for chunk in iter(lambda: infile.read(max_buffer_size), b''):
                # sendall() retries until the whole chunk is written; plain
                # send() may transmit only part of it.
                soc.sendall(chunk)
                # The server replies after every chunk; the reply is consumed
                # but not inspected.
                soc.recv(max_buffer_size)
        # Receive the response from the server.
        score = soc.recv(max_buffer_size).decode("utf8")
        print("> Evaluation Server says that the F1 score is", score)
    return score

#### Read training and test sets. 

In [5]:
# Load the three CSV files; each path is the data directory followed by the file name.
training_data = pd.read_csv(f"{DATA_PATH}{TRAINING_SET_FILE}")
extra_training_data = pd.read_csv(f"{DATA_PATH}{EXTRA_TRAINING_SET_FILE}")
test_data = pd.read_csv(f"{DATA_PATH}{TEST_SET_FILE}")

The training data set has several columns in it -- more precisely, it has 245 columns in total. Most of these columns are either binary or integer. Notice that some may be useful and some may not. Let's see an example:

In [6]:
# Preview the first five rows of the training set.
training_data.head(n=5)

Unnamed: 0,date,cookie_id,visit_number,user_id,device_Mobile_Phone,device_Television,channel_Display,channel_Email,channel_Paid_Search,channel_Push_Notifications,...,page_views_search_filter_fuel,page_views_search_filter_region_level2,page_views_search_filter_transmission,page_views_search_filter_user_role_id,page_views_search_filter_version,page_views_send_buyer_phone_to_seller,page_views_send_buyer_phone_to_seller_later,page_views_terms_and_conditions,label,confidence
0,2019-02-03,14890,42,8498,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0.958484
1,2019-02-03,187498,2,2,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,2,0.99677
2,2019-01-31,54206,143,1617,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0.946318
3,2019-01-16,54381,70,1715,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0.963148
4,2019-01-19,14891,15,8499,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0.997309


It is important to highlight the two following columns:
* **label**, which is the output variable to predict (the customer type).
* **confidence**, which is the level of trust in that label.

Note that the test data set does not contain such features. The aim, thus, is to **predict the 'label' feature**.

In [7]:
# Count how many training rows carry each label.
training_data.loc[:, 'label'].value_counts()

10    250
9     250
7     250
6     250
4     250
3     250
2     250
1     250
Name: label, dtype: int64

An extra bunch of training data has been provided. In there we may find clues to obtain nice accuracies, but this extra data set is not as clean as the original training set and may include inconsistencies.  

In [8]:
# Preview the first five rows of the extra training set.
extra_training_data.head(n=5)

Unnamed: 0,date,cookie_id,visit_number,user_id,device_Mobile_Phone,device_Television,channel_Display,channel_Email,channel_Paid_Search,channel_Push_Notifications,...,page_views_search_filter_fuel,page_views_search_filter_region_level2,page_views_search_filter_transmission,page_views_search_filter_user_role_id,page_views_search_filter_version,page_views_send_buyer_phone_to_seller,page_views_send_buyer_phone_to_seller_later,page_views_terms_and_conditions,label,confidence
0,2019-02-13,364213,1,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,10,0.02636
1,2019-01-24,46825,207,26577,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-999,0.0
2,2019-01-20,103899,220,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-999,0.0
3,2019-01-21,28617,47,16285,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-999,0.0
4,2019-02-02,54379,62,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,0.626416


The test data set also contains several variables -- it has 240 in total. Note that **'label' is not part of the test data set** as it is the feature to be predicted! Also, a crucial column is **case_id**, as it is the identifier used by the validation server. We will see this later.

In [9]:
# Preview the first five rows of the test set.
test_data.head(n=5)

Unnamed: 0,case_id,device_Mobile_Phone,device_Television,channel_Display,channel_Email,channel_Paid_Search,channel_Push_Notifications,entry_page_account_dashboard,entry_page_ad_deletion_confirmation,entry_page_ad_deletion_form,...,page_views_search_filter_car_places,page_views_search_filter_color,page_views_search_filter_fuel,page_views_search_filter_region_level2,page_views_search_filter_transmission,page_views_search_filter_user_role_id,page_views_search_filter_version,page_views_send_buyer_phone_to_seller,page_views_send_buyer_phone_to_seller_later,page_views_terms_and_conditions
0,0,1,0,0,0,0,0,0,0,0,...,0,0,2,0,4,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Before anything else, it is recommended to perform some exploratory data analysis before starting the modeling stage. For instance, check whether anything in the extra training data set is useful.

### Perform a simple prediction and submit the file to the server.

Now we will perform a simple random prediction and send the data to the validation server. Recall that **the output file structure must be identical to this one; otherwise the server will not return anything**. The correct format is the following:
* **case_id** an integer feature taken from *test_data['case_id']*, and
* **prediction** an integer with the predictions.

In [10]:
# Baseline: draw one label per test row, sampling with replacement from the
# distinct labels present in the training set. The fixed seed makes the draw repeatable.
random.seed(41)
predictions = random.choices(
    population=training_data['label'].unique(),
    k=test_data.shape[0],
)

The code below is **critical**: the prediction data frame **must include the exact fields** *case_id* and *prediction*.

In [11]:
# Pair every test-set case identifier with its generated prediction.
prediction_df = pd.DataFrame(
    dict(
        case_id=test_data['case_id'].values,
        prediction=predictions,
    )
)

In [12]:
# Count how many test rows were assigned each predicted label.
prediction_df.loc[:, 'prediction'].value_counts()

3     31517
2     31505
1     31329
4     31300
10    31215
9     31196
6     30993
7     30945
Name: prediction, dtype: int64

In [13]:
# Write the predictions to disk as a gzip-compressed CSV without the index column.
# The file on disk is what gets submitted to the evaluation server.
prediction_df.to_csv(
    OUTPUT_PATH + PREDICTION_FILE,
    sep=",",
    index=False,
    compression='gzip',
)

In [21]:
# Submit the stored prediction file; the returned server reply is shown as the cell output.
evaluate_prediction(
    client_hash='ch000-190409',
    pred_file=OUTPUT_PATH + PREDICTION_FILE,
    host="127.0.0.1",
    port=8787,
)

> Sending the predictions to the Evaluation Server. It may take several minutes... Good luck!
> Evaluation Server says that the F1 score is 0.08047


'0.08047'