In [85]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

In [113]:
df = df.sort_values(by="time1")
df_test = df_test.sort_values(by="time1")

In [128]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82797 entries, 65539 to 60140
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   session_id  82797 non-null  int64  
 1   site1       82797 non-null  int64  
 2   time1       82797 non-null  object 
 3   site2       81308 non-null  float64
 4   time2       81308 non-null  object 
 5   site3       80075 non-null  float64
 6   time3       80075 non-null  object 
 7   site4       79182 non-null  float64
 8   time4       79182 non-null  object 
 9   site5       78341 non-null  float64
 10  time5       78341 non-null  object 
 11  site6       77566 non-null  float64
 12  time6       77566 non-null  object 
 13  site7       76840 non-null  float64
 14  time7       76840 non-null  object 
 15  site8       76151 non-null  float64
 16  time8       76151 non-null  object 
 17  site9       75484 non-null  float64
 18  time9       75484 non-null  object 
 19  site10      74806 non

In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 21668 to 204761
Data columns (total 22 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   session_id  253561 non-null  int64         
 1   site1       253561 non-null  int64         
 2   time1       253561 non-null  datetime64[ns]
 3   site2       250098 non-null  float64       
 4   time2       250098 non-null  datetime64[ns]
 5   site3       246919 non-null  float64       
 6   time3       246919 non-null  datetime64[ns]
 7   site4       244321 non-null  float64       
 8   time4       244321 non-null  datetime64[ns]
 9   site5       241829 non-null  float64       
 10  time5       241829 non-null  datetime64[ns]
 11  site6       239495 non-null  float64       
 12  time6       239495 non-null  datetime64[ns]
 13  site7       237297 non-null  float64       
 14  time7       237297 non-null  datetime64[ns]
 15  site8       235224 non-null  float64       
 16

In [129]:
for i in range(1, 11):
    df['time{}'.format(i)] = pd.to_datetime(df['time{}'.format(i)])
for i in range(1, 11):
    df_test['time{}'.format(i)] = pd.to_datetime(df['time{}'.format(i)])

In [130]:
class DataPreparation(BaseEstimator, TransformerMixin):
    """Fill Nan values with zeros"""
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        sites = ['site%s' % i for i in range(1, 11)]
        return X[sites].fillna(0).astype('int')

In [131]:
class ListPreparation(BaseEstimator, TransformerMixin):
    """"Prepare a CountVectorizer"""
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = X.values.tolist()
        return [' '.join([str(site) for site in row]) 
                for row in X]

In [132]:
class AttributesAdder(BaseEstimator, TransformerMixin):
    """Add new attributes to training and test set"""
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # Daily features
        hour = X['time1'].apply(lambda ts: ts.hour)
        morning = ((hour >= 7) & (hour <= 11)).astype(int)
        day = ((hour >= 12) & (hour <= 18)).astype(int)
        evening = ((hour >= 19) & (hour <= 23)).astype(int)
        # Seasoning features
        month = X['time1'].apply(lambda ts: ts.month)
        winter = ((month >= 12) & (month <= 2)).astype('int')
        spring = ((month >= 3) & (month <= 5)).astype('int')
        summer = ((month >= 6) & (month <= 8)).astype('int')
        autumn = ((month >= 9) & (month <= 11)).astype('int')
        # Day of the week features 
        weekday = X['time1'].apply(lambda ts: ts.weekday()).astype('int')
        # Year features
        year = X['time1'].apply(lambda ts: ts.year).astype('int')
        
        X = np.c_[morning.values, day.values, evening.values,
                 month.values, winter.values, spring.values,
                 summer.values, autumn.values, weekday.values,
                 year.values]
        return X

In [133]:
class ScaledAttriburesAdder(BaseEstimator, TransformerMixin):
    """Add new scaled features"""
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # Session features
        times = ['times%s' %i for i in range(1, 11)]
        session_duration = (X[times].max(axis=1) - X[times].min(axis=1)).astype('timedelta64[ms]').astype('int') ** 0.2
        # Number of sites visited in a session
        number_of_sites = X[times].isnull().sum(axis=1).apply(lambda x: 10 - x)
        # Average time spent on one site during a session
        time_per_site = (session_duration / number_of_sites) ** 0.2
        X = np.c_[seesion_duration.values, number_of_sites.values]
        return X

In [134]:
vectorizer_pipeline = Pipeline([
    ('preparation', DataPreparation()),
    ('list_preparation', ListPreparation()),
    ('vectorizer', CountVectorizer(ngram_range=(1, 3), max_features=50000))
])

attribute_pipeline = Pipeline([
    ('adder', AttributesAdder())
])

scaled_attributes_pipeline = Pipeline([
    ('adder', ScaledAttributesAdder()),
    ('scaler', StandardScaler())
])

In [135]:
full_pipeline = FeatureUnion(transformer_list=[
    ('vectorizer_pipeline', vectorizer_pipeline),
    ('attribute_pipeline', attribute_pipeline),
    ('scaled_attributes_pipeline', scaled_attributes_pipeline)
])

In [136]:
X_train = full_pipeline.fit_transform(df)
X_test = full_pipeline.transform(df_test)

y_train = df['target'].astype('int').values

In [139]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [142]:
time_split = TimeSeriesSplit(n_splits=10)

logit = LogisticRegression(C=1,
                           random_state=42,
                           solver='liblinear')

cv_scores = cross_val_score(logit, X_train, y_train,
                            cv=time_split, scoring='roc_auc')

print(cv_scores)
print(cv_scores.mean())

[0.83269541 0.64980325 0.93836552 0.974708   0.91029649 0.94863435
 0.95936607 0.93944128 0.96444768 0.9583448 ]
0.9076102853539567


In [143]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=42, solver='liblinear')

In [144]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [145]:
preds = logit.predict_proba(X_test)[:, 1]

write_to_submission_file(preds, 'submission.csv')