In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
import os
from problem import get_train_data, get_test_data
from joblib import Memory

In [2]:
# Load the train and test splits through the project-local `problem` helpers.
# Each helper returns a 2-tuple, unpacked here as (X, y).
X_train, y_train = get_train_data()
X_test, y_test = get_test_data()

In [3]:
class FeatureExtractor(object):
    """Stateless feature-extraction step exposing ``fit`` / ``transform``.

    Nothing is learned or stored on the instance: ``fit`` is a no-op and
    ``transform`` hands the frame straight to the module-level ``_transform``.
    """

    def __init__(self):
        """Nothing to configure."""

    def fit(self, X_df, y):
        """Ignore both arguments and return ``self``."""
        return self

    def transform(self, X_df):
        """Return the result of ``_transform(X_df)``."""
        features = _transform(X_df)
        return features

def _transform(X_df):
    """Build the sparse feature matrix for ``X_df``.

    Every encoder is fitted on the concatenation of the train and test frames,
    which are re-loaded through ``get_train_data`` / ``get_test_data`` on each
    call, and then applied to a NaN-filled copy of ``X_df``. The frame passed
    in is not modified.

    Parameters
    ----------
    X_df : DataFrame containing the columns listed below plus ``creativeSize``
        (read by ``OneHot``).

    Returns
    -------
    The stacked output of ``OneHot`` and ``Vectorize`` converted with
    ``.tocsr()``.
    """
    # Columns holding a single scalar per row; label-encoded, then one-hot encoded.
    one_hot_feature = [
        'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
        'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
        'creativeId', 'adCategoryId', 'productId', 'productType',
    ]
    # Vector-valued columns; passed to ``Vectorize``.
    vector_feature = [
        'appIdAction', 'appIdInstall', 'interest1', 'interest2', 'interest3',
        'interest4', 'interest5', 'kw1', 'kw2', 'kw3', 'topic1', 'topic2',
        'topic3',
    ]

    # Work on a copy so the caller's frame is left untouched.
    encoded = X_df.copy()

    # Reference frame (train + test) on which the encoders are fitted.
    train_df, _ = get_train_data()
    test_df, _ = get_test_data()
    reference = pd.concat([train_df, test_df])

    # Missing values (NaN) become the string '-1' in both frames.
    encoded = encoded.fillna('-1')
    reference = reference.fillna('-1')

    # Encode the input first, while `reference` still holds the raw values;
    # the second call then re-encodes `reference` against itself.
    encoded = labelEncoder(reference, encoded, one_hot_feature)
    reference = labelEncoder(reference, reference, one_hot_feature)

    stacked = OneHot(reference, encoded, one_hot_feature)
    stacked = Vectorize(reference, encoded, vector_feature, stacked)
    return stacked.tocsr()


def labelEncoder(data, X_df, one_hot_feature):  # normalize features
    """Replace each listed column of ``X_df`` with integer label codes.

    For every feature a ``LabelEncoder`` is fitted on ``data[feature]`` and
    applied to ``X_df[feature]``. Values are first cast with ``int``; if the
    cast or the integer encoding fails, the column is encoded from its raw
    values instead.

    Parameters
    ----------
    data : DataFrame supplying the values the encoder is fitted on.
    X_df : DataFrame to encode. It is modified in place and also returned;
        it may be the same object as ``data``.
    one_hot_feature : iterable of column names to encode.

    Returns
    -------
    ``X_df`` with the listed columns overwritten by their codes.
    """
    le = LabelEncoder()
    for feature in one_hot_feature:
        try:
            le.fit(data[feature].apply(int))
            X_df[feature] = le.transform(X_df[feature].apply(int))
        except (TypeError, ValueError, OverflowError):
            # Only conversion / encoding failures fall back to the raw values.
            # A bare ``except`` here would also swallow KeyboardInterrupt and
            # SystemExit and hide unrelated errors behind a silent retry.
            le.fit(data[feature])
            X_df[feature] = le.transform(X_df[feature])

    return X_df


def OneHot(data, X_df, one_hot_feature):
    """One-hot encode the listed columns and stack them after ``creativeSize``.

    For each feature a ``OneHotEncoder`` is fitted on ``data[feature]`` and
    applied to ``X_df[feature]``. The resulting blocks are placed, in the
    order of ``one_hot_feature``, to the right of the ``creativeSize`` column
    of ``X_df``.

    Parameters
    ----------
    data : DataFrame the encoder is fitted on.
    X_df : DataFrame to encode; must contain ``creativeSize`` and every
        listed feature.
    one_hot_feature : iterable of column names to encode. May be empty.

    Returns
    -------
    A scipy sparse matrix. When ``one_hot_feature`` is empty it holds only the
    ``creativeSize`` column, so the return type does not depend on the input.
    """
    enc = OneHotEncoder(categories='auto')
    # Store the dense column sparsely up front so every block handed to
    # ``hstack`` is a sparse matrix.
    blocks = [sparse.csr_matrix(X_df[['creativeSize']].values)]
    for feature in one_hot_feature:
        enc.fit(data[feature].values.reshape(-1, 1))
        blocks.append(enc.transform(X_df[feature].values.reshape(-1, 1)))
    # Stack once at the end instead of re-copying a growing matrix per feature.
    X_sparse = sparse.hstack(blocks)
    print('one hot finished')
    return X_sparse


def Vectorize(data, X_df, vector_feature, X_sparse):
    """Append count-vectorised columns to ``X_sparse``.

    For each feature a ``CountVectorizer`` is fitted on ``data[feature]`` and
    applied to ``X_df[feature]``; the resulting blocks are stacked to the
    right of ``X_sparse`` in the order of ``vector_feature``.

    Parameters
    ----------
    data : DataFrame the vectoriser is fitted on.
    X_df : DataFrame to vectorise.
    vector_feature : iterable of column names holding text-like values.
    X_sparse : matrix to extend; returned unchanged when ``vector_feature``
        is empty.

    Returns
    -------
    The horizontally stacked matrix.
    """
    # NOTE(review): CountVectorizer is used with its defaults, whose token
    # pattern only keeps tokens of two or more alphanumeric characters --
    # confirm that dropping single-character tokens is intended.
    vectorizer = CountVectorizer()
    count_blocks = []
    for column in vector_feature:
        vectorizer.fit(data[column])
        count_blocks.append(vectorizer.transform(X_df[column]))
    if count_blocks:
        X_sparse = sparse.hstack([X_sparse] + count_blocks)
    print('cv finished')
    return X_sparse


In [4]:
from __future__ import division

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import lightgbm as lgb


class Classifier(BaseEstimator):
    """LightGBM binary classifier behind a minimal scikit-learn style API.

    The wrapped ``lgb.LGBMClassifier`` is created in ``__init__`` with fixed
    hyper-parameters and is available as ``self.model``.
    """

    def __init__(self):
        # All hyper-parameters are hard-coded; the constructor takes no
        # arguments. ``random_state`` is pinned so runs are repeatable.
        self.model = lgb.LGBMClassifier(
            boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
            max_depth=10, n_estimators=15, objective='binary',
            subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
            learning_rate=0.05, min_child_weight=50, random_state=2018,
            n_jobs=-1, class_weight='balanced')

    def fit(self, X, y):
        """Train the wrapped model on ``X`` / ``y``.

        Returns ``self`` so the call can be chained, as the scikit-learn
        estimator convention expects.
        """
        self.model.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)`` output."""
        return self.model.predict_proba(X)

In [5]:
# Featurise both splits directly. FeatureExtractor.fit is a no-op, so calling
# transform on its own is sufficient here.
fe = FeatureExtractor()
X_train_sparse = fe.transform(X_train)
X_test_sparse = fe.transform(X_test)

________________________________________________________________________________
[Memory] Calling __main__--home-wangsun-Documents-4A_DS-Big Data Camp-group project-Social_Ads-__ipython-input__._transform...
_transform(        advertiserId  campaignId  creativeId  creativeSize  adCategoryId  \
0                915         994       27461            60            51   
1                915         994       27461            60            51   
2                915         994       27461            60            51   
3                915         994       27461            60            51   
4                915         994       27461            60            51   
5                915         994       27461            60            51   
6                915         994       27461            60            51   
7                915         994       27461            60            51   
8               ...)
one hot finished
cv finished


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  # This is added back by InteractiveShellApp.init_path()


______________________________________________________transform - 273.4s, 4.6min
________________________________________________________________________________
[Memory] Calling __main__--home-wangsun-Documents-4A_DS-Big Data Camp-group project-Social_Ads-__ipython-input__._transform...
_transform(        advertiserId  campaignId  creativeId  creativeSize  adCategoryId  \
0             133292      464828     1334609            22            74   
1             133292      464828     1334609            22            74   
2             133292      464828     1334609            22            74   
3             133292      464828     1334609            22            74   
4             133292      464828     1334609            22            74   
5             133292      464828     1334609            22            74   
6             133292      464828     1334609            22            74   
7             133292      464828     1334609            22            74   
8             13

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  # This is added back by InteractiveShellApp.init_path()


In [17]:
# Train on the pre-computed sparse training features, then score both splits.
# NOTE(review): this cell's execution count (17) is out of sequence with the
# surrounding cells -- re-run the notebook top to bottom before relying on it.
clf = Classifier()
clf.fit(X_train_sparse, y_train)
y_train_pred = clf.predict_proba(X_train_sparse)
y_test_pred = clf.predict_proba(X_test_sparse)

In [20]:
# Count how often each column index is the argmax of the training predictions.
pd.Series(np.argmax(y_train_pred, axis=1)).value_counts()

1    642581
0    281294
dtype: int64

In [21]:
# Count how often each column index is the argmax of the test predictions.
pd.Series(np.argmax(y_test_pred, axis=1)).value_counts()

1    275676
0    120271
dtype: int64

In [5]:
# Chain the feature extractor and the classifier into a single pipeline.
# (make_pipeline is also imported in the classifier cell; this re-import is redundant.)
from sklearn.pipeline import make_pipeline
model = make_pipeline(FeatureExtractor(), Classifier())

In [6]:
# Fit the pipeline on the raw training frame, then score both splits.
# Each of the three calls runs the feature extraction once, which is why the
# 'one hot finished' / 'cv finished' pair appears three times in the output.
model.fit(X_train, y_train)
y_train_pred = model.predict_proba(X_train)
y_test_pred = model.predict_proba(X_test)

one hot finished
cv finished
one hot finished
cv finished
one hot finished
cv finished


In [7]:
# Count how often each column index is the argmax of the pipeline's training predictions.
pd.Series(np.argmax(y_train_pred, axis=1)).value_counts()

1    642581
0    281294
dtype: int64

In [8]:
# Count how often each column index is the argmax of the pipeline's test predictions.
pd.Series(np.argmax(y_test_pred, axis=1)).value_counts()

1    275676
0    120271
dtype: int64