In [1]:
import os; os.environ['OMP_NUM_THREADS'] = '1'
from contextlib import contextmanager
from functools import partial
from operator import itemgetter
from multiprocessing.pool import ThreadPool
import time
from typing import List, Dict

import keras as ks
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [3]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    df['hour'] = pd.to_datetime(df.send_date).dt.hour.astype('uint8')
    df['day'] = pd.to_datetime(df.send_date).dt.day.astype('uint8')
    df['dow'] = pd.to_datetime(df.send_date).dt.dayofweek.astype('uint8')
    print('DateTime Converted')
    df['dow'] = df['dow'].map({0:'MON',1:'TUE',2:'WED',3:'THR',4:'FRI',5:'SAT',6:'SUN'} ).astype(str)
    df['hour'] = df['hour'].map({0:'AM1',1:'AM1',2:'AM1',3:'AM2',4:'AM2',5:'AM2',6:'AM3',7:'AM3',8:'AM3',9:'AM4',10:'AM4',
                                 11:'AM4',12:'PM1',13:'PM1',14:'PM1',15:'PM2',16:'PM2',17:'PM2',18:'PM3',19:'PM3',20:'PM3',
                                 21:'PM4',22:'PM4',23:'PM4'}).astype(str)
    df['day'] = df['day'].map({1:'VEAR',2:'VEAR',3:'VEAR',4:'VEAR',5:'VEAR',6:'EAR',7:'EAR',8:'EAR',9:'EAR',10:'EAR',11:'MID',
                               12:'MID',13:'MID',14:'MID',15:'MID',16:'VMID',17:'VMID',18:'VMID',19:'VMID',20:'VMID',21:'LAT',
                               22:'LAT',23:'LAT',24:'LAT',25:'LAT',26:'VLAT',27:'VLAT',28:'VLAT',29:'VLAT',30:'VLAT',
                               31:'VLAT'}).astype(str)
    df['time'] = df['hour'].fillna('')+ ' ' + df['day'].fillna('')+' ' + df['dow'].fillna('')
    df['subject'] = (df['communication_type']+' '+df['subject'].fillna(''))
    df['email_body'] = ( df['communication_type']+' '+df['email_body'].fillna(''))
    return df[['time','subject','email_body','communication_type']]

In [4]:
def on_field(f: str, *vec) -> Pipeline:
    return make_pipeline(FunctionTransformer(itemgetter(f), validate=False), *vec)

In [5]:
def to_records(df: pd.DataFrame) -> List[Dict]:
    return df.to_dict(orient='records')

In [6]:
def fit_predict(xs, y_train) -> np.ndarray:
    X_train, X_test = xs
    config = tf.ConfigProto(intra_op_parallelism_threads=1, use_per_session_threads=1, inter_op_parallelism_threads=1)
    with tf.Session(graph=tf.Graph(), config=config) as sess, timer('fit_predict'):
        ks.backend.set_session(sess)
        model_in = ks.Input(shape=(X_train.shape[1],), dtype='float32', sparse=True)
        out = ks.layers.Dense(192, activation='relu')(model_in)
        out = ks.layers.Dense(64, activation='relu')(out)
        out = ks.layers.Dense(64, activation='relu')(out)
        out = ks.layers.Dense(1, activation='sigmoid')(out)
        model = Model(model_in, out)
        model.compile(loss='binary_crossentropy', optimizer=ks.optimizers.Adam(lr=3e-3))
        for i in range(3):
            with timer(f'epoch {i + 1}'):
                model.fit(x=X_train, y=y_train, batch_size=2**(5 + i), epochs=1, verbose=1)
        return model.predict(X_test)[:, 0]

In [10]:
def main():
    vectorizer = make_union(on_field('subject', Tfidf(max_features=100000, token_pattern='\w+')),
                            on_field('email_body', Tfidf(max_features=100000, token_pattern='\w+', ngram_range=(1, 2))),
                            on_field('communication_type', Tfidf(max_features=10,token_pattern='\w+', ngram_range=(1), binary=True),
                            on_field('time', TFidf(max_feature=100, token_pattern='\w+', ngram_range=(1,3), binary=True)),
                            n_jobs=4)
    #y_scaler = StandardScaler()
    with timer('process train'):
        #path = '/Users/804357/Desktop/MyFiles/Learn/LOM/Data/'
        path = '/Users/Vishy/Files/AVDatahack/LOM/Data/' 
        limtrain = pd.read_csv(path+'train.csv')
        #limtest = pd.read_csv(path+'test.csv')
        camp = pd.read_csv(path+'campaign_data.csv')
        train = pd.merge(limtrain, camp, on='campaign_id')
        #test = pd.merge(limtest, camp, on='campaign_id')
        del limtrain, camp
        print('Data Imported')
        cv = KFold(n_splits=20, shuffle=True, random_state=42)
        train_ids, valid_ids = next(cv.split(train))
        train, valid = train.iloc[train_ids], train.iloc[valid_ids]
        y_train = (train['is_click'].values.reshape(-1, 1))
        print('Starting Data preprocessing')
        X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
        print(f'X_train: {X_train.shape} of {X_train.dtype}')
        del train
    with timer('process valid'):
        X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
    with ThreadPool(processes=4) as pool:
        Xb_train, Xb_valid = [x.astype(np.bool).astype(np.float32) for x in [X_train, X_valid]]
        xs = [[Xb_train, Xb_valid], [X_train, X_valid]] * 2
        y_pred = np.mean(pool.map(partial(fit_predict, y_train=y_train), xs), axis=0)
    y_pred = (y_pred.reshape(-1, 1))[:, 0]
    print('Valid AUC-ROC: {:.4f}'.format(roc_auc_score(valid['is_click'], y_pred)))

if __name__ == '__main__':
    main()

Data Imported
Starting Data preprocessing
DateTime Converted


MaybeEncodingError: Error sending result: '[(<972031x3913 sparse matrix of type '<class 'numpy.float64'>'
	with 257739443 stored elements in Compressed Sparse Row format>, Pipeline(memory=None,
     steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=operator.itemgetter('email_body'), inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          validate=False)), ('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, d...ear_tf=False,
        token_pattern='\\w+', tokenizer=None, use_idf=True,
        vocabulary=None))]))]'. Reason: 'error("'i' format requires -2147483648 <= number <= 2147483647",)'