In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
import os
from problem import get_train_data, get_test_data
from joblib import Memory

In [4]:
# Load the training and test data (features and labels) with the helpers
# imported from the challenge's `problem` module.
X_train, y_train = get_train_data()
X_test, y_test = get_test_data()

# The model to submit
The submission consists of two files: `feature_extractor.py`, which defines a `FeatureExtractor` class, and `classifier.py`, which defines a `Classifier` class.

The two classes below are only a simple example to help you understand how the challenge runs. You need to improve them by selecting the most effective features and models, with properly tuned parameters.

We use `FeatureExtractor` to transform and select the data so that it can be fed directly to the model. Because some features are categorical, they must be encoded first. To keep the running time short, the `FeatureExtractor` class below keeps only the single-valued (one-dimensional) features and encodes them with `OneHotEncoder`, which creates a binary column for each category and returns a sparse matrix or a dense array. (For multi-valued features, you can call the provided `Vectorize` function directly to get the encoded array.)

Then we define the `Classifier` class, which creates the model to train. We provide a simple `LogisticRegression` model; you can change the model in this class to get better results.

In [8]:
class FeatureExtractor(object):
    """Feature-extraction step of the submission.

    The extractor keeps no state: fitting learns nothing, and the encoding
    itself is delegated to the module-level ``_transform`` function.
    """

    def __init__(self):
        """Nothing to configure."""

    def fit(self, X_df, y):
        """Return the extractor unchanged; ``X_df`` and ``y`` are not used."""
        return self

    def transform(self, X_df):
        """Return the encoded feature matrix computed by ``_transform(X_df)``."""
        encoded = _transform(X_df)
        return encoded

def _transform(X_df):
    """Encode the single-valued categorical columns of ``X_df``.

    The train and test sets are reloaded and concatenated on every call, and
    the encoders are fitted on that combined frame so that every category
    present in either split gets a column.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Rows to encode. The frame is copied; the caller's object is not
        modified.

    Returns
    -------
    Sparse matrix in CSR format holding the ``creativeSize`` column followed
    by the one-hot columns of each feature in ``one_hot_feature``.
    """
    # Columns that hold a single scalar per row.
    one_hot_feature = [
        'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
        'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
        'creativeId', 'adCategoryId', 'productId', 'productType',
    ]

    # Multi-valued columns; only used by the optional Vectorize step below.
    vector_feature = [
        'appIdAction', 'appIdInstall', 'interest1', 'interest2', 'interest3',
        'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2',
        'topic3',
    ]

    # Reference frame (train + test) the encoders are fitted on.
    train, _ = get_train_data()
    test, _ = get_test_data()
    reference = pd.concat([train, test]).fillna('-1')  # replace missing values NaN

    rows = X_df.copy().fillna('-1')

    # Order matters: encode the rows against the raw reference values first,
    # because the second call label-encodes the reference frame in place.
    rows = labelEncoder(reference, rows, one_hot_feature)
    reference = labelEncoder(reference, reference, one_hot_feature)

    encoded = OneHot(reference, rows, one_hot_feature)
    # Optional: also encode the multi-valued features (left disabled here).
    #encoded = Vectorize(reference, rows, vector_feature, encoded)

    return encoded.tocsr()


def labelEncoder(data, X_df, one_hot_feature):
    """Label-encode the given columns of ``X_df`` in place.

    For each feature a ``LabelEncoder`` is fitted on ``data[feature]`` and
    used to replace ``X_df[feature]`` with integer codes. Values are first
    converted with ``int``; if that conversion or the encoding fails, the
    raw values are encoded instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Reference frame the encoder is fitted on.
    X_df : pandas.DataFrame
        Frame whose columns are overwritten with the encoded values. It may
        be the same object as ``data``.
    one_hot_feature : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``X_df`` itself, after modification.
    """
    for feature in one_hot_feature:
        le = LabelEncoder()
        try:
            le.fit(data[feature].apply(int))
            X_df[feature] = le.transform(X_df[feature].apply(int))
        except (ValueError, TypeError, OverflowError):
            # Only fall back for conversion/encoding failures. A bare
            # ``except`` would also swallow KeyboardInterrupt and hide
            # unrelated errors.
            le.fit(data[feature])
            X_df[feature] = le.transform(X_df[feature])

    return X_df


def OneHot(data, X_df, one_hot_feature):
    """One-hot encode columns of ``X_df`` and stack them horizontally.

    The result starts with the ``creativeSize`` column of ``X_df`` as is.
    For each name in ``one_hot_feature`` the encoder is fitted on the
    matching column of ``data`` and the encoded column of ``X_df`` is
    appended on the right.

    Parameters
    ----------
    data : pandas.DataFrame
        Reference frame the encoder is fitted on.
    X_df : pandas.DataFrame
        Frame whose columns are encoded.
    one_hot_feature : iterable of str
        Names of the columns to encode.

    Returns
    -------
    The horizontally stacked result of ``scipy.sparse.hstack``.
    """
    encoder = OneHotEncoder(categories='auto')
    stacked = X_df[['creativeSize']]
    for name in one_hot_feature:
        # The encoder expects 2-D input, hence the single-column reshape.
        reference_col = data[name].values.reshape(-1, 1)
        target_col = X_df[name].values.reshape(-1, 1)
        block = encoder.fit(reference_col).transform(target_col)
        stacked = sparse.hstack((stacked, block))
    print('one hot finished')
    return stacked


def Vectorize(data, X_df, vector_feature, X_sparse):
    """Append token-count encodings of multi-valued columns to ``X_sparse``.

    For each name in ``vector_feature`` a ``CountVectorizer`` is fitted on
    the matching column of ``data``; the transformed column of ``X_df`` is
    then stacked to the right of the matrix built so far.

    NOTE(review): ``CountVectorizer`` is used with its default settings,
    whose token pattern only keeps tokens of two or more characters.
    Confirm this is intended if the columns contain single-character ids.

    Parameters
    ----------
    data : pandas.DataFrame
        Reference frame the vectorizer is fitted on.
    X_df : pandas.DataFrame
        Frame whose columns are vectorized.
    vector_feature : iterable of str
        Names of the columns to vectorize.
    X_sparse :
        Matrix to extend, e.g. the output of ``OneHot``.

    Returns
    -------
    The horizontally stacked result of ``scipy.sparse.hstack``.
    """
    vectorizer = CountVectorizer()
    stacked = X_sparse
    for name in vector_feature:
        counts = vectorizer.fit(data[name]).transform(X_df[name])
        stacked = sparse.hstack((stacked, counts))
    print('cv finished')
    return stacked


In [26]:

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


class Classifier(BaseEstimator):
    """Baseline classifier: feature scaling followed by logistic regression."""

    def __init__(self):
        # with_mean=False skips centering, which keeps the scaler usable on
        # the sparse matrix produced by the feature extractor.
        self.model = make_pipeline(StandardScaler(with_mean=False), LogisticRegression())

    def fit(self, X, y):
        """Fit the underlying pipeline on ``X`` and ``y``.

        Returns
        -------
        Classifier
            ``self``, as the scikit-learn estimator API expects, so that
            calls can be chained.
        """
        self.model.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the class probabilities predicted by the pipeline for ``X``."""
        return self.model.predict_proba(X)

In [28]:
# Chain the two submission classes: the extracted features feed the classifier.
model = make_pipeline(FeatureExtractor(), Classifier())

In [None]:
# Fit the whole pipeline (feature extraction + classifier) on the training data.
model.fit(X_train, y_train)

In [None]:
# Display the predicted class probabilities for the test set
# (the result is shown as the cell output; it is not stored in a variable).
model.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_auc_score

# Compute the predictions here so this cell also runs on a fresh kernel;
# `y_pred` was previously never assigned, which raised a NameError.
y_pred = model.predict_proba(X_test)

# roc_auc_score expects the score of the class with the greater label.
# predict_proba orders its columns by sorted class label, so that is column 1;
# column 0 would score the other class and invert the AUC.
roc_auc_score(y_test, y_pred[:, 1])

# Submit on RAMP

Before submitting the challenge, you can test your model locally using the test data we've provided.

Run the following cell to install the ramp-workflow package.

In [None]:
! pip install ramp-workflow

Modify the two files `classifier.py` and `feature_extractor.py` in the folder `submissions/starting_kit`, then run `ramp_test_submission`:

In [None]:
# Run the local RAMP test on the `starting_kit` submission.
# NOTE(review): newer ramp-workflow releases appear to expose this command as
# `ramp-test` - confirm against the installed version.
!ramp_test_submission --submission starting_kit

Once the score is good enough, you can submit your code on the RAMP website. Go to your sandbox and copy-paste the two files, or upload them from your local files.

# Hints

For the `FeatureExtractor`, after encoding all features with the functions `OneHot()` and `Vectorize()`, you get a sparse matrix containing the complete information for all features. Its dimension can be large because of the multi-valued features, so selecting the important features of the sparse matrix is needed in this case. RandomForest or LightGBM are good choices for this selection.

The original features, once encoded, are not enough to train a great model. You can also add statistical features, such as how often a given uid appears in the training data. Such statistical features may be strongly related to the labels.

Choosing a good model is also important. You can try more powerful models such as FFM, LightGBM, or XGBoost, or even deep learning models.