In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Configure seaborn's global plot styling with a 2x font scale.
sns.set(font_scale=2)

In [2]:
# Load the training CSV into a DataFrame, parsing the `id` column as an integer.
train = pd.read_csv(
    'datasets/train/train.csv',
    dtype={'id': 'int'},
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# List the column names of the training frame.
train.columns

# 特徵分布

In [None]:
# Bar chart of how many rows fall into each `click` value.
# The column is passed by keyword: seaborn 0.12+ treats the first positional
# argument of countplot as `data`, so a positional Series is not read as `x`.
ax = sns.countplot(x='click', data=train)
ax.set_title('click distribution');

In [None]:
# Row count for each distinct `click` value.
train['click'].value_counts()

## 網站類(site_id, site_domain, site_category)

In [None]:
# Number of distinct site_id values.
train['site_id'].nunique()

In [None]:
# Row count for every (site_id, click) combination, shown as a one-column frame.
(
    train
    .groupby(['site_id', 'click'])
    .size()
    .to_frame()
)

In [None]:
# Number of distinct site_domain values.
train['site_domain'].nunique()

In [None]:
# Row count for every (site_domain, click) combination, shown as a one-column frame.
(
    train
    .groupby(['site_domain', 'click'])
    .size()
    .to_frame()
)

In [None]:
# Number of distinct site_category values.
train['site_category'].nunique()

In [None]:
# Two panels: overall site_category counts (left) and the same counts split by click (right).
fig, (ax_total, ax_by_click) = plt.subplots(1, 2, figsize=(20, 5))

# The column is passed by keyword: seaborn 0.12+ treats the first positional
# argument of countplot as `data`, so a positional Series is not read as `x`.
sns.countplot(x='site_category', data=train, ax=ax_total)
ax_total.set_title('site_category category distribution')

sns.countplot(x='site_category', hue='click', data=train, ax=ax_by_click)
ax_by_click.set_title('site_category category & click distribution');

In [None]:
# Rows per site_category, largest first.
# Author's note: the top four categories account for the most ads.
(
    train
    .groupby(['site_category'])
    .size()
    .to_frame('size')
    .sort_values(by=['size'], ascending=False)
)

In [None]:
# Occurrences of each site_id, most frequent first, as a one-column frame.
train['site_id'].value_counts().to_frame()

In [None]:
# Histogram of site_id occurrence counts:
# index = times a site_id appears, value = number of site_ids appearing that often.
(
    train['site_id']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
)

In [None]:
# Occurrences of each site_domain, most frequent first.
# Author's note: c4e18dd6 is the most frequent value.
train['site_domain'].value_counts()

In [None]:
# Histogram of site_domain occurrence counts:
# index = times a site_domain appears, value = number of site_domains appearing that often.
(
    train['site_domain']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
)

In [None]:
# Occurrences of each site_category, most frequent first.
train['site_category'].value_counts()

In [None]:
# Histogram of site_category occurrence counts:
# index = times a category appears, value = number of categories appearing that often.
(
    train['site_category']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
)

## 應用程式類(app_id, app_domain, app_category)

In [None]:
# Number of distinct app_id values.
train['app_id'].nunique()

In [None]:
# Row count for every (app_id, click) combination, shown as a one-column frame.
(
    train
    .groupby(['app_id', 'click'])
    .size()
    .to_frame()
)

In [None]:
# Histogram of app_id occurrence counts:
# index = times an app_id appears, value = number of app_ids appearing that often.
(
    train['app_id']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
)

In [None]:
# Number of distinct app_domain values.
train['app_domain'].nunique()

In [None]:
# Histogram of app_domain occurrence counts:
# index = times an app_domain appears, value = number of app_domains appearing that often.
(
    train['app_domain']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
)

In [None]:
# Row count for every (app_domain, click) combination, shown as a one-column frame.
(
    train
    .groupby(['app_domain', 'click'])
    .size()
    .to_frame()
)

In [None]:
# Number of distinct app_category values.
train['app_category'].nunique()

In [None]:
# Histogram of app_category occurrence counts:
# index = times a category appears, value = number of categories appearing that often.
(
    train['app_category']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
)

In [None]:
# Two panels: overall app_category counts (left) and the same counts split by click (right).
fig, (ax_total, ax_by_click) = plt.subplots(1, 2, figsize=(20, 5))

# The column is passed by keyword: seaborn 0.12+ treats the first positional
# argument of countplot as `data`, so a positional Series is not read as `x`.
sns.countplot(x='app_category', data=train, ax=ax_total)
ax_total.set_title('app_category category distribution')

sns.countplot(x='app_category', hue='click', data=train, ax=ax_by_click)
ax_by_click.set_title('app_category category & click distribution');

In [None]:
# Occurrences of each app_id, most frequent first.
# Author's note singles out ecad2386 (presumably the dominant value; verify against the output).
train['app_id'].value_counts()

In [None]:
# Occurrences of each app_domain, most frequent first.
# Author's note singles out 7801e8d9 (presumably the dominant value; verify against the output).
train['app_domain'].value_counts()

In [None]:
# Occurrences of each app_category, most frequent first.
# Author's note singles out 07d7df22 (presumably the dominant value; verify against the output).
train['app_category'].value_counts()

## 裝置類(device_id, device_ip, device_model, device_type, device_conn_type)

In [None]:
# Number of distinct device_id values.
train['device_id'].nunique()

In [None]:
# Number of distinct device_ip values.
train['device_ip'].nunique()

In [None]:
# Number of distinct device_model values.
train['device_model'].nunique()

In [None]:
# Row count for every (device_model, click) combination, shown as a one-column frame.
(
    train
    .groupby(['device_model', 'click'])
    .size()
    .to_frame()
)

In [None]:
# Two panels: overall device_type counts (left) and the same counts split by click (right).
fig, (ax_total, ax_by_click) = plt.subplots(1, 2, figsize=(20, 5))

# The column is passed by keyword: seaborn 0.12+ treats the first positional
# argument of countplot as `data`, so a positional Series is not read as `x`.
sns.countplot(x='device_type', data=train, ax=ax_total)
ax_total.set_title('device_type category distribution')

sns.countplot(x='device_type', hue='click', data=train, ax=ax_by_click)
ax_by_click.set_title('device_type category & click distribution');

In [None]:
# Two panels: overall device_conn_type counts (left) and the same counts split by click (right).
fig, (ax_total, ax_by_click) = plt.subplots(1, 2, figsize=(20, 5))

# The column is passed by keyword: seaborn 0.12+ treats the first positional
# argument of countplot as `data`, so a positional Series is not read as `x`.
sns.countplot(x='device_conn_type', data=train, ax=ax_total)
ax_total.set_title('device_conn_type category distribution')

sns.countplot(x='device_conn_type', hue='click', data=train, ax=ax_by_click)
ax_by_click.set_title('device_conn_type category & click distribution');

In [None]:
# Occurrences of each device_id, most frequent first, as a one-column frame.
# Author's note: a99f214a is overwhelmingly frequent, so the column could become a binary feature.
train['device_id'].value_counts().to_frame()

## 特別注意低頻率特別的多，需要另外處理特徵

In [None]:
# Occurrences of each device_ip, most frequent first.
train['device_ip'].value_counts()

## 特別注意低頻率特別的多，需要另外處理特徵，以頻率之平均數當二元分類的切點

In [None]:
# Mean of the device_ip occurrence-count histogram,
# i.e. the average number of IPs that share one occurrence count.
(
    train['device_ip']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
    .mean()
)

In [None]:
# Occurrences of each device_model, most frequent first.
train['device_model'].value_counts()

In [None]:
# Occurrences of each device_type, most frequent first.
train['device_type'].value_counts()

In [None]:
# Histogram of device_type occurrence counts:
# index = times a device_type appears, value = number of device_types appearing that often.
(
    train['device_type']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
)

In [None]:
# Occurrences of each device_conn_type, most frequent first.
train['device_conn_type'].value_counts()

In [None]:
# Histogram of device_conn_type occurrence counts:
# index = times a connection type appears, value = number of types appearing that often.
(
    train['device_conn_type']
    .value_counts()
    .value_counts()
    .to_frame()
    .sort_index()
)

# 廣告版位分類

In [None]:
# Two panels: overall banner_pos counts (left) and the same counts split by click (right).
fig, (ax_total, ax_by_click) = plt.subplots(1, 2, figsize=(20, 5))

# The column is passed by keyword: seaborn 0.12+ treats the first positional
# argument of countplot as `data`, so a positional Series is not read as `x`.
sns.countplot(x='banner_pos', data=train, ax=ax_total)
ax_total.set_title('banner_pos category distribution')

sns.countplot(x='banner_pos', hue='click', data=train, ax=ax_by_click)
ax_by_click.set_title('banner_pos category & click distribution');

In [None]:
# Occurrences of each banner_pos, most frequent first.
# Author's note: values 0 and 1 dominate, so the column could be regrouped into three classes.
train['banner_pos'].value_counts()

In [None]:
# List the column names of the training frame.
train.columns

In [None]:
# Distinct-value counts for C1 and C14 through C21, as a tuple in that column order.
tuple(
    train[column].nunique()
    for column in ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def _to_numeric_or_keep(column):
    """Return `column` converted with pd.to_numeric, or unchanged if conversion fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Same outcome the deprecated errors='ignore' gave: keep the input as is.
        return column


# Work on a copy so `train` itself is untouched, converting each column to a
# numeric dtype where possible. The failure is caught explicitly because
# pd.to_numeric(errors='ignore') is deprecated as of pandas 2.2.
train_copy = train.copy()
train_copy = train_copy.apply(_to_numeric_or_keep)

In [None]:
# Columns of train_copy that are replaced by integer label codes.
columns_to_encode = [
    'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

# A fresh encoder is fitted per column, so codes are independent between columns.
for column in columns_to_encode:
    train_copy[column] = LabelEncoder().fit_transform(train_copy[column])

In [None]:
# Preview the first rows of the label-encoded copy.
train_copy.head()

# Heat Map
### 1. C1, device_type 相關性高
### 2. C14, C17 相關性高
### 3. C16 與click 相關係數最高

In [None]:
# Absolute pairwise correlations between the columns of train_copy.
abs_correlation = train_copy.corr().abs()

# Reset the font scale to 1 and draw the annotated matrix on a large canvas.
sns.set(font_scale=1)
plt.figure(figsize=(20, 18))
sns.heatmap(abs_correlation, annot=True)

# 分層取樣

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

# Draw a single 5% sample of `train`, stratified on the `click` column, and save it to disk.
split = StratifiedShuffleSplit(n_splits=1, train_size=0.05, random_state=27)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(train, train["click"]))

# split() returns row positions, not index labels, so select with .iloc;
# .loc gives the same rows only while the frame has a default 0..n-1 index.
strat_train_set = train.iloc[train_index]
strat_train_set.to_csv("datasets/train_sampled.csv", header = True, encoding='utf8', index=False)

In [None]:
# Read the stratified sample CSV back from disk.
train_sampled = pd.read_csv(
    "datasets/train_sampled.csv",
    encoding='utf8',
)

### 新增特徵

In [3]:
# Build `site_id_binary`: 1 for rows whose site_id is picked by the frequency rule below, else 0.
# `count`: occurrences of each site_id.
# `frequency`: how many site_ids share each occurrence count (histogram of `count`).
count = train['site_id'].value_counts()
frequency = count.value_counts().sort_index()

# Cut point: mean of the histogram values.
boundary = frequency.mean()

# NOTE(review): `upper_count` holds histogram *values* (numbers of ids), yet it is
# matched against per-id occurrence counts below. This reproduces the original
# selection rule; confirm whether the histogram index was intended instead.
upper_count = frequency[frequency > boundary].values
selected_ids = count[count.isin(upper_count)].index

# Write only the new column. The previous `train[mask] = 1` assigned 1 to every
# column of the matching rows, overwriting the original features.
train['site_id_binary'] = 0
train.loc[train['site_id'].isin(selected_ids), 'site_id_binary'] = 1

In [4]:
# Build `site_domain_binary`: 1 for rows whose site_domain is picked by the frequency rule below, else 0.
# `count`: occurrences of each site_domain.
# `frequency`: how many site_domains share each occurrence count (histogram of `count`).
count = train['site_domain'].value_counts()
frequency = count.value_counts().sort_index()

# Cut point: mean of the histogram values.
boundary = frequency.mean()

# NOTE(review): `upper_count` holds histogram *values* (numbers of domains), yet it is
# matched against per-domain occurrence counts below. This reproduces the original
# selection rule; confirm whether the histogram index was intended instead.
upper_count = frequency[frequency > boundary].values
selected_domains = count[count.isin(upper_count)].index

# Write only the new column. The previous `train[mask] = 1` assigned 1 to every
# column of the matching rows, overwriting the original features.
train['site_domain_binary'] = 0
train.loc[train['site_domain'].isin(selected_domains), 'site_domain_binary'] = 1

In [5]:
# Build `app_id_binary`: 1 for rows whose app_id is picked by the frequency rule below, else 0.
# `count`: occurrences of each app_id.
# `frequency`: how many app_ids share each occurrence count (histogram of `count`).
count = train['app_id'].value_counts()
frequency = count.value_counts().sort_index()

# Cut point: mean of the histogram values.
boundary = frequency.mean()

# NOTE(review): `upper_count` holds histogram *values* (numbers of ids), yet it is
# matched against per-id occurrence counts below. This reproduces the original
# selection rule; confirm whether the histogram index was intended instead.
upper_count = frequency[frequency > boundary].values
selected_ids = count[count.isin(upper_count)].index

# Write only the new column. The previous `train[mask] = 1` assigned 1 to every
# column of the matching rows, overwriting the original features.
train['app_id_binary'] = 0
train.loc[train['app_id'].isin(selected_ids), 'app_id_binary'] = 1

In [6]:
# Build `device_id_binary`: 1 for rows whose device_id is picked by the frequency rule below, else 0.
# `count`: occurrences of each device_id.
# `frequency`: how many device_ids share each occurrence count (histogram of `count`).
count = train['device_id'].value_counts()
frequency = count.value_counts().sort_index()

# Cut point: mean of the histogram values.
boundary = frequency.mean()

# NOTE(review): `upper_count` holds histogram *values* (numbers of ids), yet it is
# matched against per-id occurrence counts below. This reproduces the original
# selection rule; confirm whether the histogram index was intended instead.
upper_count = frequency[frequency > boundary].values
selected_ids = count[count.isin(upper_count)].index

# Write only the new column. The previous `train[mask] = 1` assigned 1 to every
# column of the matching rows, overwriting the original features.
train['device_id_binary'] = 0
train.loc[train['device_id'].isin(selected_ids), 'device_id_binary'] = 1

In [7]:
# Build `device_ip_binary`: 1 for rows whose device_ip is picked by the frequency rule below, else 0.
# `count`: occurrences of each device_ip.
# `frequency`: how many device_ips share each occurrence count (histogram of `count`).
count = train['device_ip'].value_counts()
frequency = count.value_counts().sort_index()

# Cut point: mean of the histogram values.
boundary = frequency.mean()

# NOTE(review): `upper_count` holds histogram *values* (numbers of IPs), yet it is
# matched against per-IP occurrence counts below. This reproduces the original
# selection rule; confirm whether the histogram index was intended instead.
upper_count = frequency[frequency > boundary].values
selected_ips = count[count.isin(upper_count)].index

# Write only the new column. The previous `train[mask] = 1` assigned 1 to every
# column of the matching rows, overwriting the original features.
train['device_ip_binary'] = 0
train.loc[train['device_ip'].isin(selected_ips), 'device_ip_binary'] = 1

In [8]:
# Build `device_model_binary`: 1 for rows whose device_model is picked by the frequency rule below, else 0.
# `count`: occurrences of each device_model.
# `frequency`: how many device_models share each occurrence count (histogram of `count`).
count = train['device_model'].value_counts()
frequency = count.value_counts().sort_index()

# Cut point: mean of the histogram values.
boundary = frequency.mean()

# NOTE(review): `upper_count` holds histogram *values* (numbers of models), yet it is
# matched against per-model occurrence counts below. This reproduces the original
# selection rule; confirm whether the histogram index was intended instead.
upper_count = frequency[frequency > boundary].values
selected_models = count[count.isin(upper_count)].index

# Write only the new column. The previous `train[mask] = 1` assigned 1 to every
# column of the matching rows, overwriting the original features.
train['device_model_binary'] = 0
train.loc[train['device_model'].isin(selected_models), 'device_model_binary'] = 1

In [None]:
# Preview the first rows of the training frame, including the added *_binary columns.
train.head()

In [None]:
# Write the training frame with its added columns to CSV, with a header row and without the index.
train.to_csv(
    "datasets/train_processed.csv",
    header=True,
    encoding='utf8',
    index=False,
)