In [2]:
import pandas as pd
#import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
import os

In [44]:
# Load the four input tables from the local data/ directory.
# Later cells join them on 'aid' (ad features) and 'uid' (user features).
ad_feature=pd.read_csv('data/adFeature.csv')
train=pd.read_csv('data/train.csv')
predict=pd.read_csv('data/test.csv')
user_feature=pd.read_csv('data/userFeature.csv')


In [45]:
# Recode label -1 as 0 in the training rows, which frees -1 to act as a sentinel below.
train.loc[train['label']==-1,'label']=0
# Tag every row of the prediction set with the sentinel label -1.
predict['label']=-1
# Stack labeled and unlabeled rows, then attach ad features (by aid) and user features (by uid).
data=pd.concat([train,predict])
data=pd.merge(data,ad_feature,on='aid',how='left')
data=pd.merge(data,user_feature,on='uid',how='left')
# Missing values are replaced by the *string* '-1' (not the number -1).
data=data.fillna('-1')

In [46]:
# Labeled rows are the ones whose label is not the -1 sentinel; the label
# column is removed from the frame and kept separately as the target.
train = data.loc[data['label'].ne(-1)]
train_y = train.pop('label')

In [47]:
# Rows carrying the -1 sentinel label form the prediction set.
test = data.loc[data['label'].eq(-1)]
# Keep the (aid, uid) pairs of those rows before the label column is dropped.
res = test[['aid', 'uid']]
test = test.drop(columns='label')

In [79]:
class FeatureExtractor(object):
    """Builds one sparse feature matrix from a pair of DataFrames.

    ``transform`` expects a two-element sequence: element 0 holds the rows to
    encode, element 1 holds the rows the encoders are fitted on.
    """

    def __init__(self):
        pass

    def fit(self, X_df, y):
        """No-op; nothing is learned here. Returns this extractor."""
        return self

    def transform(self, X_df):
        """Encode ``X_df[0]`` using encoders fitted on ``X_df[1]``.

        Both frames are copied, so the caller's objects are not modified.
        Returns whatever ``Vectorize`` returns: the 'creativeSize' column,
        followed by the one-hot blocks, followed by the token-count blocks.
        """
        # Work on copies in which NaN has been replaced by the string '-1'.
        target = X_df[0].copy().fillna('-1')
        reference = X_df[1].copy().fillna('-1')

        # Columns holding a single scalar value per row.
        one_hot_feature = [
            'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
            'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
            'creativeId', 'adCategoryId', 'productId', 'productType',
        ]
        # Columns holding several values per row (vector features).
        vector_feature = [
            'appIdAction', 'appIdInstall',
            'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
            'kw1', 'kw2', 'kw3',
            'topic1', 'topic2', 'topic3',
        ]

        # Order matters: the target rows are encoded against the still-raw
        # reference frame first, and only then is the reference frame itself
        # replaced by its integer codes.
        target = labelEncoder(reference, target, one_hot_feature)
        reference = labelEncoder(reference, reference, one_hot_feature)

        stacked = OneHot(reference, target, one_hot_feature)
        return Vectorize(reference, target, vector_feature, stacked)

def labelEncoder(data,X_df,one_hot_feature):#normalize features
    """Replace each listed column of ``X_df`` with integer codes learned from ``data``.

    For every column name in ``one_hot_feature`` the values are first converted
    with ``int``. If that conversion fails for the column, the raw values are
    encoded instead. Only the conversion step triggers this fallback: errors
    raised while fitting or transforming (and interrupts such as
    ``KeyboardInterrupt``) propagate to the caller instead of being swallowed.

    Parameters
    ----------
    data : DataFrame
        Frame the encoder classes are learned from.
    X_df : DataFrame
        Frame whose columns are overwritten in place with the codes.
    one_hot_feature : iterable of str
        Names of the columns to encode.

    Returns
    -------
    DataFrame
        ``X_df`` itself (mutated).
    """
    le=LabelEncoder()
    for feature in one_hot_feature:
        try:
            # Prefer integer values so that e.g. 3, 3.0 and '3' share one class.
            fit_values=data[feature].apply(int)
            new_values=X_df[feature].apply(int)
        except (ValueError,TypeError,OverflowError):
            # The column holds values that int() cannot parse; encode them as they are.
            fit_values=data[feature]
            new_values=X_df[feature]
        le.fit(fit_values)
        X_df[feature]=le.transform(new_values)
    return X_df

def OneHot(data,X_df,one_hot_feature):
    """One-hot encode the listed columns of ``X_df`` and stack them horizontally.

    The result starts with the 'creativeSize' column of ``X_df``. For each name
    in ``one_hot_feature`` an encoder is fitted on that column of ``data`` and
    applied to the same column of ``X_df``; the encoded block is appended on
    the right. Prints 'one hot finished' when done and returns the stacked result.
    """
    encoder = OneHotEncoder()
    stacked = X_df[['creativeSize']]
    for column in one_hot_feature:
        # The encoder is given 2-D input, hence the reshape to one column.
        reference_values = data[column].values.reshape(-1, 1)
        target_values = X_df[column].values.reshape(-1, 1)
        encoder.fit(reference_values)
        stacked = sparse.hstack((stacked, encoder.transform(target_values)))
    print('one hot finished')
    return stacked

def Vectorize(data,X_df,vector_feature,X_sparse,token_pattern=r"(?u)\b\w\w+\b"):
    r"""Append token-count columns for each vector feature to ``X_sparse``.

    For every column named in ``vector_feature`` a ``CountVectorizer``
    vocabulary is learned from ``data`` and used to count tokens in ``X_df``;
    the counts are stacked to the right of the columns accumulated so far.
    Prints 'cv finished' when done.

    Parameters
    ----------
    data : DataFrame
        Frame the vocabularies are learned from.
    X_df : DataFrame
        Frame whose rows are converted to counts.
    vector_feature : iterable of str
        Names of the text columns to vectorize.
    X_sparse : matrix
        Already-built feature columns; new blocks are appended on the right.
    token_pattern : str, optional
        Regular expression handed to ``CountVectorizer``. The default equals
        scikit-learn's own default, so existing calls behave as before. That
        default only keeps tokens of two or more word characters, so
        one-character tokens are discarded. Pass ``r"\S+"`` to keep every
        whitespace-separated token.
        NOTE(review): if these columns hold space-separated ids, single-digit
        ids are currently dropped — confirm against the data.

    Returns
    -------
    matrix
        ``X_sparse`` with one count block per vector feature appended.
    """
    cv=CountVectorizer(token_pattern=token_pattern)
    for feature in vector_feature:
        cv.fit(data[feature])
        X_vec=cv.transform(X_df[feature])
        X_sparse=sparse.hstack((X_sparse,X_vec))
    print('cv finished')
    return X_sparse


In [163]:
from __future__ import division

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import lightgbm as lgb


class Classifier(BaseEstimator):
    """Thin wrapper exposing a LightGBM binary classifier via ``fit`` / ``predict_proba``."""

    def __init__(self):
        # Hyper-parameters are fixed here; the wrapped estimator lives on ``self.model``.
        self.model = lgb.LGBMClassifier(
            boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
            max_depth=-1, n_estimators=1500, objective='binary',
            subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
            learning_rate=0.05, min_child_weight=50, random_state=2018, n_jobs=-1
        )

    def fit(self, X, y):
        """Train the wrapped model on ``X`` / ``y``.

        Returns ``self`` so the estimator follows the scikit-learn convention
        and calls can be chained (``Classifier().fit(X, y).predict_proba(X)``).
        """
        self.model.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba`` output for ``X``."""
        return self.model.predict_proba(X)

In [74]:
from sklearn.pipeline import make_pipeline
# Chain the feature extractor and the classifier into a single pipeline object.
# NOTE(review): FeatureExtractor.transform indexes its argument as X_df[0] / X_df[1],
# so this pipeline would have to be fed a two-element sequence of frames — confirm before use.
model = make_pipeline(FeatureExtractor(), Classifier())

In [None]:
# Pair consumed by FeatureExtractor.transform: [rows to encode, rows the encoders are fitted on].
X_df = [train, data]

In [83]:
# Build the training feature matrix: encoders are fitted on `data` and applied to `train`.
X_sparse=FeatureExtractor().transform(X_df)

one hot finished
cv finished


In [86]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 3-nearest-neighbour classifier on the full training feature matrix.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_sparse, train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [87]:
# Same extraction as for the training rows, this time applied to the unlabeled rows;
# the encoders are again fitted on the combined `data` frame.
X_test = [test, data]
test_sparse = FeatureExtractor().transform(X_test)

one hot finished
cv finished


In [96]:
# NOTE(review): the following cell rebinds test_new to test_sparse.tocsr(),
# so this copy appears to be unused — confirm and remove.
test_new=test_sparse.copy()

In [101]:
# Convert to CSR format; the next cell takes a row slice of test_new.
test_new=test_sparse.tocsr()


In [137]:
# Predict labels for the first 30 rows of the test matrix with the KNN model.
neigh.predict(test_new[:30,:])

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0])

In [115]:
# No import of xgboost is visible elsewhere in this notebook; import it here
# so the cell does not raise NameError on a fresh kernel.
import xgboost as xgb

# NOTE: this rebinds `model`, replacing the pipeline object created in an earlier cell.
model=xgb.XGBClassifier()

In [116]:
# Convert the training matrix to CSR format; later cells take row slices of X_new.
X_new=X_sparse.tocsr()

In [138]:
# Fit on the first 100000 training rows only.
model.fit(X_new[:100000,:],train_y[:100000])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [144]:
# Predict 100 rows (100001..100100) that lie outside the slice used for fitting.
model.predict(X_new[100001:100101,:])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [135]:
import scipy
# Persist the training feature matrix to disk as an .npz file.
scipy.sparse.save_npz('data/sparse_matrix.npz', X_sparse)

In [136]:
# Persist the test feature matrix next to the training one
# (`sparse` is the scipy.sparse module imported in the first cell).
sparse.save_npz('data/sparse_matrix_test.npz', test_sparse)

In [157]:
import lightgbm as lgb


In [158]:
# Stand-alone LightGBM classifier; the settings are the same as those used
# inside the Classifier wrapper class defined earlier in this notebook.
clf = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    num_leaves=31,
    max_depth=-1,
    n_estimators=1500,
    learning_rate=0.05,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    min_child_weight=50,
    reg_alpha=0.0,
    reg_lambda=1,
    random_state=2018,
    n_jobs=-1,
)

In [162]:
# Fit LightGBM on the same first-100000-row slice used for the XGBoost model.
# The recorded run of this cell ended with KeyboardInterrupt, so `clf` may not be fitted.
clf.fit(X_new[:100000,:],train_y[:100000])

KeyboardInterrupt: 

In [None]:
# Predict the same 100-row slice (100001..100100) that was used to inspect the XGBoost model.
clf.predict(X_new[100001:100101])

In [None]:
# Show the labels for the same slice bounds, for comparison with the predictions.
train_y[100001:100101]