# Feature Engineering for Predicting Sign Up Rate
Summary:   
- The original data set is organized by record, not by user, so the features have to be aggregated and engineered per user. The reason is simple: each user can have multiple records, and a user counts as converted once they sign up successfully. Notably, I can't include the sign-up-specific behavior (Demo behavior) as a feature, because it leaks the label: a user who shows this behavior is a sign-up user.
- The features I choose are:    
total visit times per week   
pageview times  
btnClick times   
index_leave times   
about_leave times   
courses_leave times   
courses_play_leave times   
latest_utm_term (I will extract keywords from the user's search query, as it indicates the purpose of the visit)     
latest_utm_medium (It indicates how the user found the Sensor website)    
average_page_stay_time (I use the median to replace NaN values, as the mean is biased by heavy users)  
- Rebuild sign-up labels for each user (0 for non sign-up and 1 for sign-up)

In [1]:
import pandas as pd
import numpy as np

#### 1. I want to get the total visit times per week for distinct users.

In [2]:
# Load the record-level data (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the
# accompanying DtypeWarning.
raw_data = pd.read_csv('sensor_data_remove_missing.csv', sep='\t', low_memory=False)

# Number of records with a non-null `time` per user.
# NOTE(review): this counts every event type, including sign-up flow events
# such as formSubmit / verify_cellphone_code -- confirm this does not leak
# the label into the feature.
tot_visits_per_week = (
    raw_data.groupby(['distinct_id'])['time']
    .count()
    .rename("tot_visits_per_week")
)
tot_visits_per_week[0:5]

  interactivity=interactivity, compiler=compiler, result=result)


distinct_id
00007ef910b6c9911f1b89d01a09aa3fc862f4a9     6
000a216b72eff19bd0d5e17b9e676dd6ad9a38ac     1
000c46a27ef69fa22b56d253a9c72773338a1686    22
000ed1dcd942969b458c5b308937c6389c08f999     4
00111feff544ef5280a4c7064a362a9ea59c9389     1
Name: tot_visits_per_week, dtype: int64

In [3]:
# List every column available in the raw records.
column_names = raw_data.columns
column_names

Index(['nocache', 'distinct_id', 'event', 'jssdk_error', 'time', 'type',
       'lib_lib', 'lib_method', 'lib_version', 'browser', 'browser_version',
       'ip', 'is_first_day', 'is_first_time', 'latest_referrer',
       'latest_referrer_host', 'latest_utm_campaign', 'latest_utm_content',
       'latest_utm_medium', 'latest_utm_source', 'latest_utm_term',
       'property_lib', 'property_lib_version', 'model', 'os', 'os_version',
       'referrer', 'referrer_host', 'screen_height', 'screen_width', 'title',
       'url', 'url_path', 'utm_campaign', 'utm_content', 'utm_medium',
       'utm_source', 'utm_term', 'latest_ch', 'session_from_url',
       'session_referrer', 'session_referrer_domain', 'session_referrer_host',
       'ch', 'company', 'contact', 'email', 'from_url', 'info', 'isMsg',
       'isSuccess', 'name', 'page', 'pagePosition', 'pageStayTime', 'pageUrl',
       'project_name', 'referrHostUrl', 'referrerUrl', 'requestBtn', 'result',
       'session_page_url', 'siteUrl', 's

In [4]:
# Frequency of each event type across all records (missing values included).
event_frequencies = raw_data['event'].value_counts(dropna=False)
event_frequencies

$pageview                32620
btnClick                 13866
index_leave              10394
demo_leave                3411
about_leave               1032
courses_leave              906
formSubmit                 791
courses_play_leave         747
click_send_cellphone       600
verify_cellphone_code      563
clickSubmit                513
page_close                 230
Name: event, dtype: int64

#### 2. I want to get the pageview times, btnClick times, index_leave times, about_leave times, courses_leave times and courses_play_leave times for each user.

In [5]:
# Ordered list of user ids; every per-user feature list below follows this order.
users = tot_visits_per_week.index.tolist()

# Number of records per (user, event type) pair.
records_by_user_event = raw_data.groupby(['distinct_id', 'event'])
event_count = records_by_user_event['event'].count()
event_count.to_frame().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,event
distinct_id,event,Unnamed: 2_level_1
00007ef910b6c9911f1b89d01a09aa3fc862f4a9,$pageview,1
00007ef910b6c9911f1b89d01a09aa3fc862f4a9,btnClick,1
00007ef910b6c9911f1b89d01a09aa3fc862f4a9,click_send_cellphone,2
00007ef910b6c9911f1b89d01a09aa3fc862f4a9,formSubmit,1
00007ef910b6c9911f1b89d01a09aa3fc862f4a9,verify_cellphone_code,1


In [6]:
# Event types that become per-user count features.
counted_events = [
    '$pageview',
    'btnClick',
    'index_leave',
    'about_leave',
    'courses_leave',
    'courses_play_leave',
]

# Pivot the (user, event) counts into one row per user and one column per
# event type. Missing combinations become 0, and rows follow the order of
# `users` so the lists line up with the other per-user features.
event_counts_wide = (
    event_count.unstack(level=-1, fill_value=0)
    .reindex(index=users, columns=counted_events, fill_value=0)
)

pageview_times = event_counts_wide['$pageview'].tolist()
btnClick_times = event_counts_wide['btnClick'].tolist()
index_leave_times = event_counts_wide['index_leave'].tolist()
about_leave_times = event_counts_wide['about_leave'].tolist()
courses_leave_times = event_counts_wide['courses_leave'].tolist()
courses_play_leave_times = event_counts_wide['courses_play_leave'].tolist()

#### 3. I want to get latest_utm_term features for each user. It indicates users' purpose for visiting.

I think the latest_utm_term reflects the key purpose of users. To make it simpler, I will group the terms by keyword and transform them into binary indicator vectors.

In [7]:
# Keyword groups: a user gets flag 1 for a group when any of the user's
# latest_utm_term values contains at least one of the group's keywords.
term_keywords = [
    ('sensor_term', ('神策', 'sensor', 'Sensor')),
    ('persona_term', ('画像',)),
    ('game_term', ('游戏', '手游')),
    ('finance_term', ('金融',)),
    ('web_term', ('网页', '网站')),
    ('client_term', ('用户', '客户')),
    ('ecom_term', ('电商',)),
    ('ad_term', ('广告',)),
    ('app_term', ('app', 'App')),
    ('ret_term', ('留存',)),
    ('pic_term', ('图', '可视')),
    ('data_term', ('数据', '分析')),
]

term_rows = raw_data[['distinct_id', 'latest_utm_term']]

# A user has a "null" term when at least one of their records has no term.
user_has_null_term = (
    term_rows['latest_utm_term'].isnull()
    .groupby(term_rows['distinct_id'])
    .any()
    .reindex(users, fill_value=False)
)

# Join each user's actual term values into one text. This works on the real
# values rather than on the printed repr of the Series, which pandas
# truncates for long Series and which would silently drop terms.
# A newline separator keeps keywords from matching across two records.
known_terms = term_rows.dropna(subset=['latest_utm_term'])
user_term_text = (
    known_terms['latest_utm_term'].astype(str)
    .groupby(known_terms['distinct_id'])
    .apply(lambda terms: '\n'.join(terms))
    .reindex(users, fill_value='')
)

# One 0/1 list per keyword group, aligned with `users`.
term_flags = {
    name: [int(any(keyword in text for keyword in keywords))
           for text in user_term_text]
    for name, keywords in term_keywords
}

null_term = [int(missing) for missing in user_has_null_term]
sensor_term = term_flags['sensor_term']
persona_term = term_flags['persona_term']
game_term = term_flags['game_term']
finance_term = term_flags['finance_term']
web_term = term_flags['web_term']
client_term = term_flags['client_term']
ecom_term = term_flags['ecom_term']
ad_term = term_flags['ad_term']
app_term = term_flags['app_term']
ret_term = term_flags['ret_term']
pic_term = term_flags['pic_term']
data_term = term_flags['data_term']

# "other" = the user has terms, none is missing, and no keyword group matched.
keyword_hits = list(zip(*(term_flags[name] for name, _ in term_keywords)))
other_term = [
    int(not missing and not any(hits))
    for missing, hits in zip(null_term, keyword_hits)
]

#### 4. I want to get latest_utm_medium features for each user. 

In [8]:
# Frequency of each latest_utm_medium value across all records (NaN included).
medium_frequencies = raw_data['latest_utm_medium'].value_counts(dropna=False)
medium_frequencies

cpc                   34623
NaN                   25982
mcpc                   3255
mfeed                   934
default                 538
answer                  133
banner                   67
hao.caibaojian.com       33
production               27
sidonghui                24
toutiao.io               20
referral                 15
youjian                   8
lixinya                   6
customer_m                5
sidashidai                2
edm                       1
Name: latest_utm_medium, dtype: int64

In [9]:
medium_rows = raw_data[['distinct_id', 'latest_utm_medium']]


def _user_flags(record_mask):
    """Return a 0/1 list aligned with `users`: 1 when any of the user's records is True in `record_mask`."""
    per_user = record_mask.groupby(medium_rows['distinct_id']).any()
    return per_user.reindex(users, fill_value=False).astype(int).tolist()


def _medium_flags(medium):
    """Return a 0/1 list aligned with `users`: 1 when the user has a record whose latest_utm_medium equals `medium`."""
    return _user_flags(medium_rows['latest_utm_medium'] == medium)


# Missing values are detected with isnull() rather than `np.nan in set(...)`,
# which only succeeds when the stored NaN is the very same object as np.nan.
null_medium = _user_flags(medium_rows['latest_utm_medium'].isnull())

cpc_medium = _medium_flags('cpc')
mcpc_medium = _medium_flags('mcpc')
mfeed_medium = _medium_flags('mfeed')
default_medium = _medium_flags('default')
answer_medium = _medium_flags('answer')
banner_medium = _medium_flags('banner')
caibaojian_medium = _medium_flags('hao.caibaojian.com')
production_medium = _medium_flags('production')
sidonghui_medium = _medium_flags('sidonghui')
toutiao_medium = _medium_flags('toutiao.io')
referral_medium = _medium_flags('referral')
youjian_medium = _medium_flags('youjian')
lixinya_medium = _medium_flags('lixinya')
customer_medium = _medium_flags('customer_m')
sidashidai_medium = _medium_flags('sidashidai')
edm_medium = _medium_flags('edm')
    

#### 5. I want to get average page stay time features for each user. 

In [10]:
# Mean pageStayTime per user; users without any pageStayTime value get NaN.
stay_time_by_user = raw_data.groupby(['distinct_id'])['pageStayTime']
average_page_stay_time = stay_time_by_user.mean()
average_page_stay_time.iloc[0:5]

distinct_id
00007ef910b6c9911f1b89d01a09aa3fc862f4a9              NaN
000a216b72eff19bd0d5e17b9e676dd6ad9a38ac    921142.371000
000c46a27ef69fa22b56d253a9c72773338a1686      1742.216571
000ed1dcd942969b458c5b308937c6389c08f999        30.680000
00111feff544ef5280a4c7064a362a9ea59c9389              NaN
Name: pageStayTime, dtype: float64

In [11]:
# Compare mean and median of the per-user averages, ignoring NaN.
stay_time_mean = np.nanmean(average_page_stay_time)
stay_time_median = np.nanmedian(average_page_stay_time)
(stay_time_mean, stay_time_median)

(7860.027331669838, 32.362500000000004)

In [12]:
# Replace missing per-user averages with the median of the observed averages.
# The median is computed from the data instead of being hard-coded, so it
# stays correct if the input file changes. Series.median() skips NaN.
median_stay_time = average_page_stay_time.median()
average_page_stay_time = average_page_stay_time.fillna(median_stay_time)
average_page_stay_time[0:5]

distinct_id
00007ef910b6c9911f1b89d01a09aa3fc862f4a9        32.362500
000a216b72eff19bd0d5e17b9e676dd6ad9a38ac    921142.371000
000c46a27ef69fa22b56d253a9c72773338a1686      1742.216571
000ed1dcd942969b458c5b308937c6389c08f999        30.680000
00111feff544ef5280a4c7064a362a9ea59c9389        32.362500
Name: pageStayTime, dtype: float64

#### 6. I need to get signup label (conversion label) for each user. 


In [13]:
# Label a user 1 when any of their records has isSuccess equal to True,
# else 0. A single grouped pass replaces re-filtering raw_data once per user.
user_signed_up = (
    raw_data['isSuccess'].eq(True)
    .groupby(raw_data['distinct_id'])
    .any()
    .reindex(users, fill_value=False)
)
isSuccess = user_signed_up.astype(int).tolist()

#### 7. Make data set for model building

In [14]:
# Assemble the per-user feature table: (column name, values) pairs, where
# every value sequence follows the order of `users`.
engineered_columns = [
    # identifier
    ('distinct_id', users),
    # activity counts
    ('tot_visits_per_week', list(tot_visits_per_week)),
    ('pageview_times', pageview_times),
    ('btnClick_times', btnClick_times),
    ('index_leave_times', index_leave_times),
    ('about_leave_times', about_leave_times),
    ('courses_leave_times', courses_leave_times),
    ('courses_play_leave_times', courses_play_leave_times),
    # latest_utm_term keyword flags
    ('sensor_term', sensor_term),
    ('persona_term', persona_term),
    ('game_term', game_term),
    ('finance_term', finance_term),
    ('web_term', web_term),
    ('client_term', client_term),
    ('ecom_term', ecom_term),
    ('ad_term', ad_term),
    ('app_term', app_term),
    ('ret_term', ret_term),
    ('pic_term', pic_term),
    ('data_term', data_term),
    ('other_term', other_term),
    ('null_term', null_term),
    # latest_utm_medium flags
    ('cpc_medium', cpc_medium),
    ('mcpc_medium', mcpc_medium),
    ('mfeed_medium', mfeed_medium),
    ('default_medium', default_medium),
    ('answer_medium', answer_medium),
    ('banner_medium', banner_medium),
    ('caibaojian_medium', caibaojian_medium),
    ('production_medium', production_medium),
    ('sidonghui_medium', sidonghui_medium),
    ('toutiao_medium', toutiao_medium),
    ('referral_medium', referral_medium),
    ('youjian_medium', youjian_medium),
    ('lixinya_medium', lixinya_medium),
    ('customer_medium', customer_medium),
    ('sidashidai_medium', sidashidai_medium),
    ('edm_medium', edm_medium),
    ('null_medium', null_medium),
    # stay time and label
    ('average_page_stay_time', list(average_page_stay_time)),
    ('isSuccess', isSuccess),
]
data_engineered = pd.DataFrame(dict(engineered_columns))

In [15]:
# Put the columns in a fixed, readable order: id, activity counts,
# term flags, medium flags, stay time, label.
id_columns = ['distinct_id']
count_columns = [
    'tot_visits_per_week',
    'pageview_times',
    'btnClick_times',
    'index_leave_times',
    'about_leave_times',
    'courses_leave_times',
    'courses_play_leave_times',
]
term_columns = [
    'sensor_term',
    'persona_term',
    'game_term',
    'finance_term',
    'web_term',
    'client_term',
    'ecom_term',
    'ad_term',
    'app_term',
    'ret_term',
    'pic_term',
    'data_term',
    'other_term',
    'null_term',
]
medium_columns = [
    'cpc_medium',
    'mcpc_medium',
    'mfeed_medium',
    'default_medium',
    'answer_medium',
    'banner_medium',
    'caibaojian_medium',
    'production_medium',
    'sidonghui_medium',
    'toutiao_medium',
    'referral_medium',
    'youjian_medium',
    'lixinya_medium',
    'customer_medium',
    'sidashidai_medium',
    'edm_medium',
    'null_medium',
]
trailing_columns = ['average_page_stay_time', 'isSuccess']

column_order = (
    id_columns + count_columns + term_columns + medium_columns + trailing_columns
)
data_engineered = data_engineered.loc[:, column_order]


In [16]:
# Preview the first five engineered rows.
data_engineered.iloc[:5]

Unnamed: 0,distinct_id,tot_visits_per_week,pageview_times,btnClick_times,index_leave_times,about_leave_times,courses_leave_times,courses_play_leave_times,sensor_term,persona_term,...,toutiao_medium,referral_medium,youjian_medium,lixinya_medium,customer_medium,sidashidai_medium,edm_medium,null_medium,average_page_stay_time,isSuccess
0,00007ef910b6c9911f1b89d01a09aa3fc862f4a9,6,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,32.3625,0
1,000a216b72eff19bd0d5e17b9e676dd6ad9a38ac,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,921142.371,0
2,000c46a27ef69fa22b56d253a9c72773338a1686,22,6,9,6,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1742.216571,0
3,000ed1dcd942969b458c5b308937c6389c08f999,4,2,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,30.68,0
4,00111feff544ef5280a4c7064a362a9ea59c9389,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,32.3625,0


In [17]:
# Persist the engineered per-user table as a tab-separated UTF-8 file,
# without the positional index.
data_engineered.to_csv(
    'sensor_data_engineered.csv',
    sep='\t',
    encoding='utf-8',
    index=False
)