# Exploratory analysis with visualizations 📊

## Table of contents

* [1. Importing libraries](#Importing-librairies)
* [2. Functions for visualization](#Functions-for-visualization)
* [3. Functions for data processing](#Functions-for-data-processing)
* [4. Loading data](#Loading-data)
* [5. Data processing](#Data-processing)
* [6. Data analysis](#Data-analysis)
* [7. Data visualization](#Data-visualization)



### Importing librairies

In [1]:
import pandas as pd
import dask.dataframe as dd
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

### Functions for visualization

These functions are taken from [this kernel](https://www.kaggle.com/aditi2009/titanic-data-science-solution).

In [28]:
def plot_categories(df, cat, target, **kwargs):
    """Draw a bar plot of ``target`` against ``cat``, optionally faceted.

    Args:
        df: DataFrame holding the ``cat`` and ``target`` columns.
        cat: Name of the column passed to ``sns.barplot`` as the x variable.
        target: Name of the column passed to ``sns.barplot`` as the y variable.
        **kwargs: Optional ``row`` and ``col`` column names used to split the
            figure into a grid of facets. Any other key is ignored.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    row = kwargs.get('row')
    col = kwargs.get('col')
    # ``height`` is the current name of FacetGrid's former ``size`` argument;
    # passing ``size`` warns or fails on up-to-date seaborn releases.
    facet = sns.FacetGrid(df, row=row, col=col, height=4, aspect=2)
    facet.map(sns.barplot, cat, target)
    facet.add_legend()
    plt.show()

def plot_distribution(df, var, target, **kwargs):
    """Draw one shaded KDE curve of ``var`` per distinct value of ``target``.

    Args:
        df: DataFrame holding the ``var`` and ``target`` columns.
        var: Name of the column whose distribution is plotted.
        target: Name of the column used as the hue (one curve per value).
        **kwargs: Optional ``row`` and ``col`` column names used to split the
            figure into a grid of facets. Any other key is ignored.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    row = kwargs.get('row')
    col = kwargs.get('col')
    # ``height`` is the current name of FacetGrid's former ``size`` argument;
    # passing ``size`` warns or fails on up-to-date seaborn releases.
    facet = sns.FacetGrid(df, hue=target, height=4, aspect=4, row=row, col=col)
    # NOTE(review): newer seaborn releases spell ``shade`` as ``fill``; kept
    # as-is so the call also works on older installs -- confirm target version.
    facet.map(sns.kdeplot, var, shade=True)
    # The x-axis starts at 0 and ends at the largest observed value of ``var``.
    facet.set(xlim=(0, df[var].max()))
    facet.add_legend()
    plt.show()

def plot_correlation_map(df):
    """Show an annotated heatmap of the pairwise correlations in ``df``.

    Only numeric and boolean columns take part in the correlation; other
    columns (strings, datetimes, mixed objects) are left out.

    Args:
        df: DataFrame whose columns are correlated.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Select the numeric columns explicitly: recent pandas versions no longer
    # drop non-numeric columns inside ``corr`` and raise on them instead.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12},
    )
    plt.show()

def describe_more(df):
    """Summarise every column of ``df`` by distinct-value count and dtype.

    Args:
        df: DataFrame to summarise.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        ``Variable`` (column name), ``Levels`` (number of distinct non-null
        values) and ``Datatype`` (column dtype), sorted by ``Levels`` in
        ascending order. The index keeps each column's original position.
    """
    columns = list(df.columns)
    levels = pd.DataFrame({
        'Variable': columns,
        # ``nunique`` counts distinct non-null values, i.e. the same number as
        # the length of ``value_counts``, without the deprecated top-level
        # ``pd.value_counts`` helper.
        'Levels': [df[name].nunique() for name in columns],
        'Datatype': [df[name].dtype for name in columns],
    })
    return levels.sort_values(by='Levels')

### Functions for data processing

In [11]:
def dataPreProcessTime(df):
    """Parse ``click_time`` and derive the hour of each click.

    The frame is modified in place: ``click_time`` is converted with
    ``pd.to_datetime`` and an integer ``click_hour`` column is added.

    Args:
        df: DataFrame with a ``click_time`` column that ``pd.to_datetime``
            can parse.

    Returns:
        The same DataFrame object that was passed in.
    """
    df['click_time'] = pd.to_datetime(df['click_time'])
    # Vectorised hour extraction; avoids formatting every timestamp to a
    # string and parsing it back to an int row by row.
    df['click_hour'] = df['click_time'].dt.hour.astype(int)

    return df

def dataPreProcess(df):
    """Apply the time preprocessing to ``df`` and replace missing values by 0.

    Args:
        df: DataFrame accepted by ``dataPreProcessTime``.

    Returns:
        The DataFrame returned by ``fillna(0)`` on the time-processed frame.
    """
    with_time_features = dataPreProcessTime(df)
    return with_time_features.fillna(0)

### Loading data

I use the code that I shared in [this post](https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/51809#295230).

In [6]:
# Directory that holds the input CSV files.
path = '../input/'
# Fraction of the rows kept in the working sample.
freq = 0.02

# Read the CSV lazily with dask, keep the first piece of a seeded random
# split (about ``freq`` of the rows), then materialise it as a pandas frame.
train = (
    dd.read_csv(path + "train.csv")
    .random_split([freq, 1 - freq], random_state=42)[0]
    .compute()
)

# Assign the eight column names used throughout the rest of the notebook.
train.columns = [
    'ip',
    'app',
    'device',
    'os',
    'channel',
    'click_time',
    'attributed_time',
    'is_attributed',
]

### Data processing

In [12]:
# Parse click times, add the click_hour column and fill missing values with 0.
train = dataPreProcess(train)

### Data analysis

In [13]:
# Preview the first rows of the preprocessed sample.
train.head()

In [14]:
# (rows, columns) of the sampled training frame.
train.shape

In [15]:
# Distinct-value count and dtype of every column, sorted by the count.
describe_more(train)

### Data visualization

In [23]:
# Bar plot of is_attributed against the hour of the click.
plot_categories(train, cat='click_hour', target='is_attributed')

In [30]:
# KDE of the os column, one curve per is_attributed value.
plot_distribution(train, var='os', target='is_attributed')

In [31]:
# KDE of the channel column, one curve per is_attributed value.
plot_distribution(train, var='channel', target='is_attributed')

In [32]:
# KDE of the app column, one curve per is_attributed value.
plot_distribution(train, var='app', target='is_attributed')

In [27]:
# Heatmap of the pairwise correlations between the columns of the sample.
plot_correlation_map(train)