# Data Observation

## Import packages

In [1]:
import os
import re

from tqdm import tqdm

import numpy as np
import pandas as pd

## Read data

In [2]:
# Load the three CSV tables (train, test, users) from the local data/ directory.
train, test, users = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/users.csv'),
)

## Data Description

### Training Data & Testing Data
  
|Column Name                              |Column Description                                                        |
|-----------------------------------------|--------------------------------------------------------------------------|
|**country_code**                         |_An integer code for the country where the user lives_
|**grass_date**                           |_The date when the email was sent_
|**user_id**                              |_the unique identifier of each user_
|**subject_line_length**                  |_the number of characters in the subject of the email_
|**last_[open/login/checkout]_day**       |_the number of days since the previous email open, user login, or checkout_
|**open_count_last_[10/30/60]_days**      |_the total number of email opens in the last N days_
|**login_count_last_[10/30/60]_days**     |_the total number of user logins in the last N days_
|**checkout_count_last_[10/30/60]_days**  |_the total number of checkouts by the user in the last N days_
|**open_flag**                            |_the target variable. Whether or not the email was opened_
|**row_id**                               |_the unique identifier of each row_

In [3]:
# Preview the first five rows of the training table.
train.head(n=5)

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,login_count_last_10_days,login_count_last_30_days,login_count_last_60_days,checkout_count_last_10_days,checkout_count_last_30_days,checkout_count_last_60_days,open_flag,row_id
0,4,2019-07-16 00:00:00+08:00,43,44,19,6,18,0,2,4,12,43,99,0,5,10,0,0
1,4,2019-07-16 00:00:00+08:00,102,44,9,4,8,2,9,17,18,48,90,1,1,4,1,1
2,6,2019-07-16 00:00:00+08:00,177,49,14,5,5,0,4,12,24,69,119,5,19,27,0,2
3,1,2019-07-16 00:00:00+08:00,184,49,49,9,53,0,0,1,9,23,69,1,3,6,0,3
4,6,2019-07-16 00:00:00+08:00,221,49,227,6,221,0,0,0,2,5,5,0,0,0,0,4


### Users Data

|Column Name       |Column Description                                                                                  |
|------------------|----------------------------------------------------------------------------------------------------|
|**user_id**       |*the unique identifier of each user*
|**attr_[1/2/3]**  |*general user attributes*
|**age**           |*The user's reported age*
|**domain**        |*The user's top-level email domain. Less common domains are bundled together under the label 'other'*

In [4]:
# Preview the first five rows of the users table.
users.head(n=5)

Unnamed: 0,user_id,attr_1,attr_2,attr_3,age,domain
0,0,,1.0,0.0,,@gmail.com
1,1,1.0,1.0,2.0,50.0,@gmail.com
2,2,,1.0,0.0,,other
3,3,,1.0,0.0,,@gmail.com
4,4,1.0,1.0,2.0,33.0,@gmail.com


## Observation

In [5]:
# (number of rows, number of columns) of the training table.
train.shape

(73539, 18)

In [6]:
# Collect every training column name into a plain list and show how many there are.
columns = train.columns.tolist()
len(columns)

18

In [7]:
# Drop the identifier columns and the target from the column list.
# Filtering (instead of calling list.remove three times) keeps this cell
# idempotent: re-running it no longer raises ValueError once the names
# have already been removed.
non_feature_columns = {'user_id', 'open_flag', 'row_id'}
columns = [col for col in columns if col not in non_feature_columns]

In [8]:
len(columns)  # Number of remaining columns, i.e. the input sequence length

15

In [9]:
# Every timestamp is in GMT+8: the count below should equal the number of rows.
# The offset is matched as a literal substring (regex=False), which avoids the
# invalid '\+' escape sequence that a non-raw regex string would need.
len(train[train['grass_date'].str.contains('+08:00', regex=False)])

73539

In [19]:
# For each remaining column, print the entries whose text contains 'Never'
# (e.g. 'Never open'), followed by a blank separator.
for col in columns:
    as_text = train[col].astype(str)
    never_rows = train.loc[as_text.str.contains('Never'), col]
    print(never_rows)
    print('\n')

Series([], Name: country_code, dtype: int64)


Series([], Name: grass_date, dtype: object)


Series([], Name: subject_line_length, dtype: int64)


14       Never open
23       Never open
24       Never open
71       Never open
89       Never open
            ...    
73487    Never open
73490    Never open
73491    Never open
73492    Never open
73536    Never open
Name: last_open_day, Length: 7518, dtype: object


1739     Never login
7022     Never login
11759    Never login
12159    Never login
12170    Never login
            ...     
54478    Never login
60991    Never login
67568    Never login
71812    Never login
73536    Never login
Name: last_login_day, Length: 279, dtype: object


26       Never checkout
62       Never checkout
121      Never checkout
127      Never checkout
179      Never checkout
              ...      
73529    Never checkout
73532    Never checkout
73536    Never checkout
73537    Never checkout
73538    Never checkout
Name: last_checkout_day, Length: 169