In [5]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [4]:
# a helper function for writing prediction to a file

def write_to_submission_file(predicted_labels,out_file,target = 'target',index_label = "session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1,predicted_labels.shape[0] + 1),
                                columns = [target])
    predicted_df.to_csv(out_file, index_label=index_label)

    

**Reading training and testing data**

In [6]:
train_df = pd.read_csv("Data/train_sessions.csv", index_col="session_id")
test_df = pd.read_csv("Data/test_sessions.csv",index_col="session_id")



In [7]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,,,,,,,,,...,,,,,,,,,,0
2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,2014-02-22 11:19:51,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,2013-12-16 16:40:19,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,2014-03-28 10:54:12,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,2014-02-28 10:55:23,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 253561 entries, 1 to 253561
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   site1   253561 non-null  int64  
 1   time1   253561 non-null  object 
 2   site2   250098 non-null  float64
 3   time2   250098 non-null  object 
 4   site3   246919 non-null  float64
 5   time3   246919 non-null  object 
 6   site4   244321 non-null  float64
 7   time4   244321 non-null  object 
 8   site5   241829 non-null  float64
 9   time5   241829 non-null  object 
 10  site6   239495 non-null  float64
 11  time6   239495 non-null  object 
 12  site7   237297 non-null  float64
 13  time7   237297 non-null  object 
 14  site8   235224 non-null  float64
 15  time8   235224 non-null  object 
 16  site9   233084 non-null  float64
 17  time9   233084 non-null  object 
 18  site10  231052 non-null  float64
 19  time10  231052 non-null  object 
 20  target  253561 non-null  int64  
dtypes: float64(9), 

**The time columns is object type, we need to convert it to datatime format**

In [36]:
# covverting time1,time2,....time10 columns to datatime format....

times = ["time%s" % i for i in range(1,11)]
sites = ["site%s" % i for i in range(1,11)]

train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

In [37]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 253561 entries, 21669 to 204762
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   site1   253561 non-null  int64         
 1   time1   253561 non-null  datetime64[ns]
 2   site2   253561 non-null  float64       
 3   time2   253561 non-null  datetime64[ns]
 4   site3   253561 non-null  float64       
 5   time3   253561 non-null  datetime64[ns]
 6   site4   253561 non-null  float64       
 7   time4   253561 non-null  datetime64[ns]
 8   site5   253561 non-null  float64       
 9   time5   253561 non-null  datetime64[ns]
 10  site6   253561 non-null  float64       
 11  time6   253561 non-null  datetime64[ns]
 12  site7   253561 non-null  float64       
 13  time7   253561 non-null  datetime64[ns]
 14  site8   253561 non-null  float64       
 15  time8   253561 non-null  datetime64[ns]
 16  site9   253561 non-null  float64       
 17  time9   253561 non-null  datet

In [38]:
# Sort the data by time

train_df = train_df.sort_values(by= "time1")

In [39]:
train_df[sites].isna().sum(), train_df[times].isna().sum()

(site1     0
 site2     0
 site3     0
 site4     0
 site5     0
 site6     0
 site7     0
 site8     0
 site9     0
 site10    0
 dtype: int64,
 time1     0
 time2     0
 time3     0
 time4     0
 time5     0
 time6     0
 time7     0
 time8     0
 time9     0
 time10    0
 dtype: int64)

**Cleaning up the NaNs**

In [40]:
train_df.fillna(0,inplace=True)

In [41]:
train_df.shape

(253561, 21)

**Changing the data set into text format**

We are doing this cause, fitting logistic regression into the dataset we need a matrix of dataset and a column vector of labels/target variable.

But there are 250,000 dataset and hence 250,000 rows if we represent dataset with a dense matrix.

So what we need to do change the data matrix to sparse matrix.

We can do this by using countVecotrizer like we did with the analysis of imdb movies.

But the countvectorizer works only on text.

In [51]:
train_df[sites].astype('int').to_csv('train_sessions_text.txt', sep=' ',index=None,header=None)

In [50]:
# same thing with test_df
test_df.fillna(0,inplace=True)
test_df.isna().sum()

  test_df.fillna(0,inplace=True)


site1     0
time1     0
site2     0
time2     0
site3     0
time3     0
site4     0
time4     0
site5     0
time5     0
site6     0
time6     0
site7     0
time7     0
site8     0
time8     0
site9     0
time9     0
site10    0
time10    0
dtype: int64

In [52]:
#doing same thing with the test data
test_df[sites].astype('int').to_csv('test_sessions_text.txt',sep = ' ', index=None,header = None)

In [53]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947
