# TalkingData (Kaggle)
## Pre-processing

### Import

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.model_selection import train_test_split

### Load in data

In [None]:
DATA_PATH = r"C:\Users\reio\.kaggle\competitions"

def load_data(data_path=DATA_PATH, nrows=7000000):
    """Load the TalkingData train and test CSV files.

    Parameters
    ----------
    data_path : str
        Directory that contains the competition folder
        ``talkingdata-adtracking-fraud-detection``.
    nrows : int or None
        Number of leading rows of ``train.csv`` to read. ``None`` reads the
        whole file. The test file is always read in full.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The (possibly truncated) training frame and the test frame.
    """
    # PATHS TO FILE
    competition = "talkingdata-adtracking-fraud-detection"
    comp_path = os.path.join(data_path, competition)
    train_path = os.path.join(comp_path, "train.csv")
    test_path = os.path.join(comp_path, "test.csv")
    return pd.read_csv(train_path, nrows=nrows), pd.read_csv(test_path)

# Read the training rows and the test set from the default data directory
train, test = load_data(data_path=DATA_PATH)

### Data Exploration

In [None]:
# Training sample: print (rows, columns), then preview the first five rows
print((len(train.index), len(train.columns)))
train.head(n=5)

In [None]:
# Describe train: column dtypes and per-column maxima.
# A notebook cell only renders its final expression, so the dtypes are
# printed explicitly; otherwise they would never be displayed.
print(train.dtypes)
train.max()

In [None]:
# Count missing values in every column of the training frame
train.isna().sum()

In [None]:
# Keep only the rows whose is_attributed flag equals 1
train_att = train.loc[train['is_attributed'].eq(1)]
# Count missing values per column within that subset
train_att.isna().sum()

We notice that all the missing values in 'attributed_time' are for observations that did not convert into a download ('is_attributed'=0).

In [None]:
# Share of rows with is_attributed == 1, relative to all training rows
p = train_att.shape[0] / train.shape[0]
print('The percentage of converted clicks is {num:.2%}'.format(num=p))

In [None]:
# Plot the proportion of clicks that converted into a download or not
plt.figure(figsize=(6, 6))
mean = (train.is_attributed.values == 1).mean()
# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
ax = sns.barplot(x=['App Downloaded (1)', 'Not Downloaded (0)'], y=[mean, 1 - mean])
ax.set(ylabel='Proportion', title='App Downloaded vs Not Downloaded')
# Annotate each bar with its percentage, slightly above the bar top.
# The loop variable is named `patch` so it does not overwrite the conversion
# ratio `p` computed in an earlier cell.
for patch, uniq in zip(ax.patches, [mean, 1 - mean]):
    height = patch.get_height()
    ax.text(patch.get_x() + patch.get_width() / 2.,
            height + 0.01,
            '{}%'.format(round(uniq * 100, 2)),
            ha="center")

### Feature Engineering

In [None]:
# Set categorical variables (identifier-like columns present in both frames)
cat = ['ip', 'app', 'device', 'os', 'channel']
for c in cat:
    train[c] = train[c].astype('category')
    test[c] = test[c].astype('category')

# Only training data has is_attributed
# (the source column was previously misspelled 'is_attribute', which raised KeyError)
train['is_attributed'] = train['is_attributed'].astype('category')

In [None]:
# Extract features from click_time
def ppClicktime(df):
    """Derive calendar features from the ``click_time`` column.

    ``click_time`` is parsed to datetime in place, and the columns
    ``day_of_week``, ``week``, ``click_date``, ``click_hour`` and
    ``click_minute`` are added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``click_time`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    df['click_time'] = pd.to_datetime(df['click_time'])
    df['day_of_week'] = df['click_time'].dt.dayofweek
    # Series.dt.week is deprecated and removed in pandas 2.x; isocalendar()
    # yields the same ISO week number (as a nullable UInt32 column).
    df['week'] = df['click_time'].dt.isocalendar().week
    df['click_date'] = df['click_time'].dt.date
    df['click_hour'] = df['click_time'].dt.hour
    df['click_minute'] = df['click_time'].dt.minute
    return df

In [None]:
# Remove the raw click_time column from the training frame in place
# NOTE(review): ppClicktime is defined above but is not called in the visible
# cells; confirm the time features were extracted before this column is dropped.
train.drop(columns='click_time', inplace=True)

train.head(n=5)