# Data Analysis by sklearn

## Import packages

In [18]:
import os
import re
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Read Data

In [2]:
src = 'data/train.csv'

In [3]:
_train = pd.read_csv(src)
_train.sample(10)

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,login_count_last_10_days,login_count_last_30_days,login_count_last_60_days,checkout_count_last_10_days,checkout_count_last_30_days,checkout_count_last_60_days,open_flag,row_id
27150,2,2019-08-04 00:00:00+08:00,63895,36,7,22,37,3,4,6,7,17,53,0,0,8,0,27150
40977,7,2019-08-13 00:00:00+08:00,50456,66,Never open,5,14,0,0,0,2,8,22,0,2,2,0,40977
21249,5,2019-07-30 00:00:00+08:00,23218,38,1,10,29,1,1,1,22,70,130,0,1,3,0,21249
70245,1,2019-09-02 00:00:00+08:00,14628,33,55,3,5,0,0,1,15,48,110,2,5,15,0,70245
29344,1,2019-08-05 00:00:00+08:00,81503,55,116,1,Never checkout,0,0,0,1,2,7,0,0,0,0,29344
7831,2,2019-07-21 00:00:00+08:00,100607,43,20,18,Never checkout,0,1,7,1,3,5,0,0,0,0,7831
26855,7,2019-08-04 00:00:00+08:00,31059,34,3,1,6,3,5,8,12,47,101,2,13,23,1,26855
9094,3,2019-07-22 00:00:00+08:00,74779,59,3,24,62,1,7,7,1,2,16,0,0,0,1,9094
51300,2,2019-08-19 00:00:00+08:00,83290,56,45,282,Never checkout,0,0,2,0,0,0,0,0,0,0,51300
46578,1,2019-08-16 00:00:00+08:00,98453,47,37,7,92,1,1,2,3,17,71,1,1,1,0,46578


In [4]:
train = _train.copy()

## Transform data and labels into machine-recognizable data

In [5]:
columns = list(train.drop(['user_id', 'open_flag', 'row_id'], axis=1, inplace=False).columns)

In [6]:
train = train.replace('Never open', -1).replace('Never login', -1).replace('Never checkout', -1)

In [7]:
def to_timestamp(datetime):
    return datetime.timestamp()

In [8]:
# transform datetime to unit(s)
train['grass_date'] = pd.to_datetime(train['grass_date'])
train['grass_date'] = train['grass_date'].apply(to_timestamp)
train['grass_date']

0        1.563206e+09
1        1.563206e+09
2        1.563206e+09
3        1.563206e+09
4        1.563206e+09
             ...     
73534    1.567354e+09
73535    1.567354e+09
73536    1.567354e+09
73537    1.567354e+09
73538    1.567354e+09
Name: grass_date, Length: 73539, dtype: float64

In [16]:
_inputs = list(list(train.loc[i, columns]) for i in tqdm(range(len(train))))
_labels = train['open_flag'].tolist()
print(_inputs[:5])
print(_labels[:5])

100%|██████████| 73539/73539 [00:48<00:00, 1508.79it/s]

[[4, 1563206400.0, 44, '19', '6', '18', 0, 2, 4, 12, 43, 99, 0, 5, 10], [4, 1563206400.0, 44, '9', '4', '8', 2, 9, 17, 18, 48, 90, 1, 1, 4], [6, 1563206400.0, 49, '14', '5', '5', 0, 4, 12, 24, 69, 119, 5, 19, 27], [1, 1563206400.0, 49, '49', '9', '53', 0, 0, 1, 9, 23, 69, 1, 3, 6], [6, 1563206400.0, 49, '227', '6', '221', 0, 0, 0, 2, 5, 5, 0, 0, 0]]
[0, 1, 0, 0, 0]





In [17]:
inputs = np.asarray(_inputs, dtype='int32')
labels = np.asarray(_labels, dtype='int32')

## Split train-test data

In [19]:
x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2)
print(len(x_train))
print(len(x_test))

58831
14708


In [20]:
knc = KNeighborsClassifier()
knc.fit(x_train, y_train)

KNeighborsClassifier()

In [21]:
knc.predict(x_test)

array([0, 0, 0, ..., 0, 0, 0])