# 1. EDA_Datetime
References:
- [NanoMathias, Feature Engineering & Importance Testing](https://www.kaggle.com/nanomathias/feature-engineering-importance-testing)
- [NanoMathias, Bayesian Tuning of xgBoost & lightGBM | LB: 0.9769](https://www.kaggle.com/nanomathias/bayesian-tuning-of-xgboost-lightgbm-lb-0-9769)
- [gopisaran, Indepth EDA - Entire TalkingData dataset](https://www.kaggle.com/gopisaran/indepth-eda-entire-talkingdata-dataset)

## Run name

In [None]:
import time

# Tag this run as <project>_<step>_<timestamp>; the timestamp is local time
# (strftime falls back to the current local time when no time tuple is given).
project_name = 'TalkingdataAFD2018'
step_name = 'EDA_Datetime'
time_str = time.strftime("%Y%m%d_%H%M%S")
run_name = '_'.join((project_name, step_name, time_str))
print('run_name: %s' % run_name)

# Wall-clock start; the last cell subtracts this to report the total time cost.
t0 = time.time()

## Important params

## Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import os
import gc
import time
import random
import zipfile
import h5py
import pickle
import math
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing
from multiprocessing import cpu_count

from sklearn.metrics import roc_auc_score

## Project folders

In [None]:
# All project directories live directly under the current working directory.
cwd = os.getcwd()

input_folder, output_folder, model_folder, log_folder = (
    os.path.join(cwd, folder_name)
    for folder_name in ('input', 'output', 'model', 'log')
)
print('\n'.join([
    'input_folder: \t\t\t%s' % input_folder,
    'output_folder: \t\t\t%s' % output_folder,
    'model_folder: \t\t\t%s' % model_folder,
    'log_folder: \t\t\t%s' % log_folder,
]))

# The CSV files are all expected inside the input folder.
(
    train_csv_file,
    train_sample_csv_file,
    test_csv_file,
    sample_submission_csv_file,
) = (
    os.path.join(input_folder, csv_name)
    for csv_name in (
        'train.csv',
        'train_sample.csv',
        'test.csv',
        'sample_submission.csv',
    )
)

print('\n'.join([
    '\ntrain_csv_file: \t\t%s' % train_csv_file,
    'train_sample_csv_file: \t\t%s' % train_sample_csv_file,
    'test_csv_file: \t\t\t%s' % test_csv_file,
    'sample_submission_csv_file: \t%s' % sample_submission_csv_file,
]))

## Load data

In [None]:
%%time
train_csv = pd.read_csv(train_csv_file, parse_dates=['click_time'])
test_csv = pd.read_csv(test_csv_file, parse_dates=['click_time'])
sample_submission_csv = pd.read_csv(sample_submission_csv_file)

print('train_csv.shape: \t\t', train_csv.shape)
print('test_csv.shape: \t\t', test_csv.shape)
print('sample_submission_csv.shape: \t', sample_submission_csv.shape)
print('train_csv.dtypes: \n', train_csv.dtypes)

display(train_csv.head(2))
display(test_csv.head(2))
display(sample_submission_csv.head(2))

## Analyses

In [None]:
# Split click_time of the training frame into day / hour / minute / second
# columns, each stored as uint8.
train_click_dt = train_csv['click_time'].dt
for time_unit in ('day', 'hour', 'minute', 'second'):
    train_csv[time_unit] = getattr(train_click_dt, time_unit).astype('uint8')

print('train_csv.shape: \t', train_csv.shape)
display(train_csv.head(2))

In [None]:
# Split click_time of the test frame into day / hour / minute / second
# columns, each stored as uint8.
test_click_dt = test_csv['click_time'].dt
for time_unit in ('day', 'hour', 'minute', 'second'):
    test_csv[time_unit] = getattr(test_click_dt, time_unit).astype('uint8')

print('test_csv.shape: \t', test_csv.shape)
display(test_csv.head(2))

In [None]:
# Row count of the training frame for each value of the `day` column.
train_gp_day = train_csv.groupby(['day']).size()
print(train_gp_day.shape, train_gp_day, sep='\n')

In [None]:
# Row count of the test frame for each value of the `day` column.
test_gp_day = test_csv.groupby(['day']).size()
print(test_gp_day.shape, test_gp_day, sep='\n')

In [None]:
# Row count of the training frame for each (day, hour) pair.
# NOTE: this rebinds `train_gp_day`, which previously held the per-day counts;
# the name is kept so any later cell that reads it keeps working.
train_gp_day = train_csv.groupby(['day', 'hour']).size()
print(train_gp_day.shape)
print(train_gp_day)

# Draw the counts as a bar chart, with a title and axis labels so the figure
# can be read on its own when the notebook is skimmed.
train_day_hour_ax = train_gp_day.plot(kind='bar', figsize=(20, 20))
train_day_hour_ax.set_title('Train: number of rows per (day, hour)')
train_day_hour_ax.set_xlabel('(day, hour)')
train_day_hour_ax.set_ylabel('Number of rows')
# Render explicitly so the cell output is the figure, not an Axes repr.
plt.show()

In [None]:
# Row count of the test frame for each (day, hour) pair.
# NOTE: this rebinds `test_gp_day`, which previously held the per-day counts;
# the name is kept so any later cell that reads it keeps working.
test_gp_day = test_csv.groupby(['day', 'hour']).size()
print(test_gp_day.shape)
print(test_gp_day)

# Draw the counts as a bar chart, with a title and axis labels so the figure
# can be read on its own when the notebook is skimmed.
test_day_hour_ax = test_gp_day.plot(kind='bar', figsize=(5, 5))
test_day_hour_ax.set_title('Test: number of rows per (day, hour)')
test_day_hour_ax.set_xlabel('(day, hour)')
test_day_hour_ax.set_ylabel('Number of rows')
# Render explicitly so the cell output is the figure, not an Axes repr.
plt.show()

In [None]:
# Wrap up: echo the run identifier and the wall-clock seconds elapsed since
# `t0` was recorded in the first cell.
elapsed_seconds = time.time() - t0
print(run_name, 'Time cost: %.2f s' % elapsed_seconds, sep='\n')

print('Done!')