<a href="https://colab.research.google.com/github/Spartan-119/Pixalate/blob/main/TalkingData_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
data_path = '/content/data.csv'

In [3]:
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   ip               100000 non-null  int64 
 1   app              100000 non-null  int64 
 2   device           100000 non-null  int64 
 3   os               100000 non-null  int64 
 4   channel          100000 non-null  int64 
 5   click_time       100000 non-null  object
 6   attributed_time  227 non-null     object
 7   is_attributed    100000 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 6.1+ MB


In [5]:
df.dtypes

Unnamed: 0,0
ip,int64
app,int64
device,int64
os,int64
channel,int64
click_time,object
attributed_time,object
is_attributed,int64


In [6]:
# the ip, app, device, os and channel are categorical variables encoded as integers.
# setting them as categorise for analysis
variables = ['ip', 'app', 'device', 'os', 'channel']
for variable in variables:
    df[variable] = df[variable].astype('category')

df.dtypes

Unnamed: 0,0
ip,category
app,category
device,category
os,category
channel,category
click_time,object
attributed_time,object
is_attributed,int64


In [7]:
# converting date stamps to date/time type
df['click_time'] = pd.to_datetime(df['click_time'])
df['attributed_time'] = pd.to_datetime(df['attributed_time'])
df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,NaT,0
1,105560,25,1,17,259,2017-11-07 13:40:27,NaT,0
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,0
3,94584,13,1,13,477,2017-11-07 04:58:08,NaT,0
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,0


In [8]:
df.describe()

Unnamed: 0,click_time,attributed_time,is_attributed
count,100000,227,100000.0
mean,2017-11-08 06:29:52.171910144,2017-11-08 07:04:12.766519552,0.00227
min,2017-11-06 16:00:00,2017-11-06 17:19:04,0.0
25%,2017-11-07 11:34:09.500000,2017-11-07 11:50:27.500000,0.0
50%,2017-11-08 07:07:50,2017-11-08 06:43:39,0.0
75%,2017-11-09 02:06:01.249999872,2017-11-09 01:42:52,0.0
max,2017-11-09 15:59:51,2017-11-09 15:28:15,1.0
std,,,0.047591


In [9]:
count_0 = df[df['is_attributed'] == 0].shape[0]
count_1 = df[df['is_attributed'] == 1].shape[0]
print(f"Number of rows with 'is_attribute' == 0: {count_0}")
print(f"Number of rows with 'is_attribute' == 1: {count_1}")

Number of rows with 'is_attribute' == 0: 99773
Number of rows with 'is_attribute' == 1: 227


that means there 227 instances where it was clicked

In [10]:
!pip install imbalanced-learn



In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [12]:
X = df.drop(['is_attributed', 'attributed_time', 'click_time'], axis=1)
y = df['is_attributed']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
X_train.head()

Unnamed: 0,ip,app,device,os,channel
75220,105649,2,1,19,205
48955,7335,3,1,19,280
44966,1699,12,1,13,205
13568,34912,3,1,19,280
92727,48170,18,1,6,134


In [15]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [16]:
from collections import Counter
print('Original dataset shape:', Counter(y_train))
print('Resampled dataset shape:', Counter(y_train_res))

Original dataset shape: Counter({0: 79824, 1: 176})
Resampled dataset shape: Counter({0: 79824, 1: 79824})


In [17]:
df_res = pd.concat([X_train_res, y_train_res], axis=1)
df_res.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed
0,105649,2,1,19,205,0
1,7335,3,1,19,280,0
2,1699,12,1,13,205,0
3,34912,3,1,19,280,0
4,48170,18,1,6,134,0


In [20]:
df_res.describe(include = 'all')

Unnamed: 0,ip,app,device,os,channel,is_attributed
count,89121.0,155268.0,149524.0,156679.0,112272.0,159648.0
unique,31763.0,161.0,97.0,130.0,161.0,
top,5348.0,3.0,1.0,19.0,213.0,
freq,696.0,15012.0,110273.0,25142.0,7067.0,
mean,,,,,,0.5
std,,,,,,0.500002
min,,,,,,0.0
25%,,,,,,0.0
50%,,,,,,0.5
75%,,,,,,1.0
