In [1]:
import findspark
findspark.init("/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/spark")

import os
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_181-cloudera"

from pyspark import SparkContext
from pyspark.sql import SparkSession,HiveContext,Window
from pyspark.sql import functions as fn
from pyspark.sql.types import IntegerType, FloatType, DoubleType, ArrayType, StringType, DecimalType,MapType

# Build (or reuse) a Hive-enabled Spark session named "test" with a 30g driver.
spark_session = (
    SparkSession.builder
    .enableHiveSupport()
    .appName("test")
    .config("spark.driver.memory", "30g")
    .getOrCreate()
)
# Hive SQL entry point used by the query cells below.
hc = HiveContext(spark_session.sparkContext)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json

### 1. 汽车之家 Semi-Feature 处理

In [3]:
# Load the Autohome (h_*) semi-feature columns for rows whose mobile passes the
# filter in the WHERE clause, and materialise them as a pandas DataFrame.
auto_query = '''
SElECT 
    mobile, h_func_prefer, h_car_type_prefer, h_config_prefer, h_budget, h_level
    FROM marketing_modeling.mm_big_wide_info
WHERE 
    SUBSTR(mobile, 0, 1) = 1
    AND LENGTH(mobile) = 11
'''
auto = hc.sql(auto_query).toPandas()

##### 功能偏好, 用途偏好, 配置偏好

In [4]:
def string_to_list(x):
    '''
    Parse a string-encoded list of integers (e.g. "[1,3,5]") into a Python list.

    x(string): input string-type list; the first and last characters are
               treated as the enclosing brackets and discarded.

    Returns:
        np.nan    when x is missing (None / float NaN) or too short to hold
                  the two enclosing brackets
        0         when the brackets enclose no items (e.g. "[]")
        list[int] otherwise
    '''
    # Missing values: None from the database, or NaN introduced by pandas.
    if x is None or isinstance(x, float):
        return np.nan
    # Fewer than two characters cannot be "[...]"; previously this fell through
    # and returned None, which broke the downstream membership test.
    if len(x) < 2:
        return np.nan
    # Drop the brackets and ignore blank tokens (e.g. a trailing comma).
    tokens = [tok for tok in x[1:-1].split(',') if tok.strip()]
    if not tokens:
        # An empty list is encoded as the integer 0, as before.
        return 0
    return [int(tok) for tok in tokens]
        
        
def get_dummy(i, x):
    '''
    Check if i is in x given:
    i(int): input element need to be checked existing status
    x(list): input list that might contains i

    Returns 1 when i is in x, 0 when it is not, and np.nan when x is not a
    container at all (None, or an int/float placeholder such as 0 or NaN).
    '''
    # Scalars and None mark "no list available"; `i in x` would raise on them.
    if x is None or isinstance(x, (int, float)):
        return np.nan
    return 1 if i in x else 0


def create_dummy_columns(df, col, lst):
    '''
    Add one indicator column per entry of lst to df and return df.

    df(dataframe): input dataframe (modified in place)
    col(string): column holding the parsed lists
    lst(list): values to test for; each new column is named col + str(value)
    '''
    for value in lst:
        # Bind the current value as a default so the lambda does not depend on
        # the loop variable after the iteration moves on.
        df[col + str(value)] = df[col].apply(lambda cell, value=value: get_dummy(value, cell))
    return df

In [5]:
# Preference columns that hold string-encoded integer lists.
col = ['h_func_prefer', 'h_car_type_prefer', 'h_config_prefer']
# Parse each of them in turn with string_to_list.
for pref_col in col:
    auto[pref_col] = auto[pref_col].apply(string_to_list)

In [6]:
# Exclusive upper bound of the option ids per preference column, in the same
# order as `col`; dummy columns are created for ids 1 .. bound - 1.
lst_param = [10, 17, 19]
for pref_col, upper_bound in zip(col, lst_param):
    auto = create_dummy_columns(auto, pref_col, list(range(1, upper_bound)))

##### 购车预算

In [7]:
def get_budget_min(x):
    '''
    Extract the lower bound of a budget range as a string.

    x(string): input string; the text before the first '-' is taken and its
               last two characters are removed
               (presumably a layout like "10万 - 15万" -- TODO confirm)

    Returns np.nan when x is None or a number, otherwise the sliced string.
    '''
    is_missing = x is None or isinstance(x, (int, float))
    if is_missing:
        return np.nan
    lower_part = x.split('-')[0]
    return lower_part[:-2]
    
def get_budget_max(x):
    '''
    Extract the upper bound of a budget range as a string.

    x(string): input string; the text between the first and second '-' is
               taken and its first and last characters are removed
               (presumably a layout like "10万 - 15万" -- TODO confirm)

    Returns np.nan when x is None or a number, otherwise the sliced string.
    '''
    is_missing = x is None or isinstance(x, (int, float))
    if is_missing:
        return np.nan
    upper_part = x.split('-')[1]
    return upper_part[1:-1]

In [8]:
def get_budget_min_cat(x):
    '''
    Group the lower bound of budget given:
    x(string): input string; the text before the first '-' minus its last two
               characters must be a number
               (presumably a layout like "10万 - 15万" -- TODO confirm)

    Returns one of '10w以下', '10w-15w', '15w-20w', '20w-25w', '25w+', or
    np.nan when x is missing or the bound cannot be parsed as a number.
    '''
    if x is None or isinstance(x, (int, float)):
        return np.nan
    try:
        # The slice yields text; it has to be converted before the numeric
        # comparisons below (a str-vs-int comparison is always True on
        # Python 2 and a TypeError on Python 3).
        mmin = float(x.split('-')[0][:-2])
    except ValueError:
        return np.nan
    if mmin != mmin:
        # The text parsed to NaN (e.g. "nan"); no bucket applies.
        return np.nan
    if mmin < 10:
        return '10w以下'
    if mmin < 15:
        return '10w-15w'
    if mmin < 20:
        return '15w-20w'
    if mmin <= 25:
        return '20w-25w'
    return '25w+'
            
def get_budget_max_cat(x):
    '''
    Group the upper bound of budget given:
    x(string): input string; the text between the first and second '-' minus
               its first and last characters must be a number
               (presumably a layout like "10万 - 15万" -- TODO confirm)

    Returns one of '10w以下', '10w-15w', '15w-20w', '20w-25w', '25w+', or
    np.nan when x is missing, has no '-' separator, or the bound cannot be
    parsed as a number.
    '''
    if x is None or isinstance(x, (int, float)):
        return np.nan
    try:
        # The slice yields text; it has to be converted before the numeric
        # comparisons below (a str-vs-int comparison is always True on
        # Python 2 and a TypeError on Python 3).
        mmax = float(x.split('-')[1][1:-1])
    except (ValueError, IndexError):
        # IndexError: no '-' in x, so there is no upper bound to read.
        return np.nan
    if mmax != mmax:
        # The text parsed to NaN (e.g. "nan"); no bucket applies.
        return np.nan
    if mmax < 10:
        return '10w以下'
    if mmax < 15:
        return '10w-15w'
    if mmax < 20:
        return '15w-20w'
    if mmax <= 25:
        return '20w-25w'
    return '25w+'

In [9]:
# Derive the four budget features from the raw h_budget string, in this
# column order.
budget_features = [
    ('h_budget_min', get_budget_min),
    ('h_budget_min_cat', get_budget_min_cat),
    ('h_budget_max', get_budget_max),
    ('h_budget_max_cat', get_budget_max_cat),
]
for feature_name, extractor in budget_features:
    auto[feature_name] = auto['h_budget'].apply(extractor)

##### 级别

In [10]:
# Distinct raw values of h_level. Not referenced by the visible cells below;
# presumably kept for inspecting which labels occur in the data.
level_values = auto['h_level'].unique()

def get_level_cat():
    '''
    Build the lookup from a detailed car level label to its coarse category.

    Returns a dict whose keys are the detailed labels and whose values are the
    category names.
    '''
    # (category, detailed labels belonging to it), in insertion order.
    groups = [
        ('轿车', [u'紧凑型车',u'小型车',u'中型车',u'中大型车',u'大型车',u'微型车']),
        ('SUV', [u'紧凑型SUV',u'小型SUV',u'中型SUV',u'中大型SUV',u'大型SUV']),
        ('MPV', [u'MPV']),
        ('跑车', [u'跑车']),
        ('微面', [u'微面']),
        ('微卡', [u'微卡']),
        ('轻客', [u'轻客']),
        ('皮卡', [u'低端皮卡',u'高端皮卡']),
    ]
    level_dic = {}
    for category, labels in groups:
        for label in labels:
            level_dic[label] = category
    return level_dic


def get_level_cat_all(x):
    '''
    Return the car level of a car given:
    x(string): input name of the car

    Returns the coarse category for x, or np.nan when x is a number (e.g. NaN)
    or is not a known label.
    '''
    if isinstance(x, (int, float)):
        return np.nan
    # Build the lookup once and keep it on the function object; this function
    # is applied per row, and rebuilding the dict on every call (twice for a
    # hit) is wasted work.
    level_map = getattr(get_level_cat_all, '_level_map', None)
    if level_map is None:
        level_map = get_level_cat()
        get_level_cat_all._level_map = level_map
    try:
        return level_map.get(x, np.nan)
    except TypeError:
        # Unhashable input (e.g. a list) cannot be a label.
        return np.nan

In [None]:
# Map every raw level label to its coarse category (NaN when unknown).
auto['h_level_cat'] = auto['h_level'].apply(get_level_cat_all)

### 2. CDP Semi-Feature 处理

In [None]:
import sys

# Keep references to the current standard streams so they can be restored.
stdi, stdo, stde = sys.stdin, sys.stdout, sys.stderr

# Python 2 only: make UTF-8 the implicit str<->unicode codec so the Chinese
# literals used below compare equal to the unicode values read from Hive.
# reload(sys) is needed to regain access to sys.setdefaultencoding, and the
# streams are put back afterwards because the reload replaces them.
# On Python 3 neither `reload` (as a builtin) nor `sys.setdefaultencoding`
# exists and the default encoding is already UTF-8, so nothing is done.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.stdin, sys.stdout, sys.stderr = stdi, stdo, stde
    sys.setdefaultencoding('utf-8')

In [None]:
# Load the CDP (c_*) semi-feature columns for rows whose mobile passes the
# filter in the WHERE clause, and materialise them as a pandas DataFrame.
cdp_query = '''
SELECT 
    mobile,
    c_city,
    c_province,
    c_lead_model,
    c_trail_vel,
    c_last_trail_vel,
    c_deliver_vel,
    c_last_reach_platform,
    c_lead_sources 
FROM 
    marketing_modeling.mm_big_wide_info
WHERE 
    SUBSTR(mobile, 0, 1) = 1
    AND LENGTH(mobile) = 11
'''
cdp = hc.sql(cdp_query).toPandas()

In [None]:
# `df` is bound to the same DataFrame object as `cdp` (no copy is made), so
# column assignments made through `df` are made on that shared object.
df = cdp

##### 城市,省份

In [None]:
# City-level lookup table read from Excel; the city cell below uses its
# `city_name` and `city_level` columns.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
ct_lv = pd.read_excel('/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/city_level.xlsx')

In [None]:
# Text type of the running interpreter: `unicode` on Python 2, `str` on
# Python 3 (where the `unicode` builtin no longer exists).
try:
    _TEXT_TYPE = unicode  # noqa: F821
except NameError:
    _TEXT_TYPE = str


def string_to_list_1(x):
    '''
    Ignore multiple cities & provinces
    x(string): input string-type list

    Returns x unchanged when it is a non-empty text value without a comma
    (i.e. a single city / province), otherwise np.nan.
    '''
    if not isinstance(x, _TEXT_TYPE):
        return np.nan
    # A comma means several values were recorded; an empty string means none.
    if ',' in x or len(x) == 0:
        return np.nan
    return x

# City: keep single-city values only, then normalise municipality variants.
col = 'c_city'
df[col] = df[col].apply(string_to_list_1)
city_aliases = [
    ('上海市市辖区', '上海市'),
    ('重庆市市辖区', '重庆市'),
    ('北京市市辖区', '北京市'),
    ('天津市市辖区', '天津市'),
    ('重庆县', '重庆市'),
]
for alias, canonical in city_aliases:
    # Assign the result back instead of calling replace(inplace=True) on a
    # column selection, which is not guaranteed to write through to df.
    df[col] = df[col].replace(alias, canonical)

# City level: look each city up by name. A per-row lookup keeps df's own index
# and row count; assigning the column of a merge result back relies on the
# merge producing exactly one row per input row in the same positions, which
# fails as soon as city_name is duplicated in the lookup table.
city_level_by_name = (
    ct_lv.dropna(subset=['city_name'])
    .drop_duplicates(subset='city_name')
    .set_index('city_name')['city_level']
)
df['c_city_level'] = df['c_city'].map(city_level_by_name)

# Province: keep single-province values only.
col = 'c_province'
df[col] = df[col].apply(string_to_list_1)

##### 留资车系，试驾车系

In [None]:
# Text type of the running interpreter: `unicode` on Python 2, `str` on
# Python 3 (where the `unicode` builtin no longer exists).
try:
    _TEXT_TYPE = unicode  # noqa: F821
except NameError:
    _TEXT_TYPE = str


def string_to_list_2(x):
    '''
    Convert a comma-separated string to a list given:
    x(string): input comma-separated string

    Returns the list of comma-separated items, or np.nan when x is not a text
    value or is empty.
    '''
    if not isinstance(x, _TEXT_TYPE) or len(x) == 0:
        return np.nan
    return x.split(',')
  

def get_model_status(lead, tral):
    '''
    Combine the lead and trial indicators of one model into a status code.

    lead(int): input lead status
    tral(int): input tral status

    Returns 1 for lead only, 2 for trial only, 3 for both and 0 for neither;
    any value other than 1 counts as "not set".
    '''
    has_lead = bool(lead == 1)
    has_tral = bool(tral == 1)
    status_by_flags = {
        (True, False): 1,
        (False, True): 2,
        (True, True): 3,
        (False, False): 0,
    }
    return status_by_flags[(has_lead, has_tral)]

def get_dummy(i, x):
    '''
    Check if i is in x given:
    i: input element need to be checked existing status
    x(list): input list that might contains i

    Returns 1 when i is in x, 0 when it is not, and np.nan when x is not a
    container at all (None, or an int/float placeholder such as NaN).

    Note: a function with this name is also defined earlier in the notebook;
    this later definition is the one in effect from here on.
    '''
    # Scalars and None mark "no list available"; `i in x` would raise on them.
    if x is None or isinstance(x, (int, float)):
        return np.nan
    return 1 if i in x else 0

def create_dummy_columns_2(df,col,lst):
    '''
    Add one indicator column per entry of lst to df and report their names.

    df(dataframe): input dataframe (modified in place)
    col(string): column holding the parsed lists
    lst(list): values to test for; each new column is named col + "_" + str(value)

    Returns (df, list of the created column names in lst order).
    '''
    col_names = []
    for value in lst:
        dummy_name = col + "_" + str(value)
        # Bind the current value as a default so the lambda does not depend on
        # the loop variable after the iteration moves on.
        df[dummy_name] = df[col].apply(lambda cell, value=value: get_dummy(value, cell))
        col_names.append(dummy_name)
    return df, col_names

In [None]:
# Model names for which lead / trial indicator columns are created.
models = [
    "MG350（海外销售）",
    "MGeHS",
    "MGGT",
    "MG锐腾",
    "MG550（海外销售）",
    "MGeMGHS",
    "MGHS",
    "MG360（海外销售）",
    "MGRX5（海外销售）",
    "MGeMG6",
    "MG7",
    "MG750（海外销售）",
    "MG5",
    "MGZS纯电动",
    "MGZS",
    "MG3",
    "MG新MG5（海外销售）",
    "MGTF",
    "MG6",
]
# `lst` is the name the next cell passes to create_dummy_columns_2.
lst = models

In [None]:
# Lead models: split the comma-separated string, then add one indicator
# column per entry of `lst`.
col = 'c_lead_model'
df[col] = df[col].apply(string_to_list_2)
df, lead_cols = create_dummy_columns_2(df, col, lst)

# Trial models: same treatment; `tral_cols` lines up with `lead_cols` because
# both are generated from the same `lst`.
col = 'c_trail_vel'
df[col] = df[col].apply(string_to_list_2)
df, tral_cols = create_dummy_columns_2(df, col, lst)

In [None]:
# One status column per model, combining its lead and trial indicators
# row by row through get_model_status.
for model, lead, tral in zip(models, lead_cols, tral_cols):
    df['c_' + model] = df.apply(lambda row: get_model_status(row[lead], row[tral]), axis=1)

In [None]:
# Intermediate export: key and location columns followed by one status column
# per model, in `models` order.
cdp_middle_cols = ["mobile", "c_city", "c_province", "c_city_level"] + ['c_' + model_name for model_name in models]
df[cdp_middle_cols].to_csv('cdp_middle.csv')

##### 最近触达平台，历史线索渠道来源，线索来源数量

In [None]:
col = 'c_last_reach_platform'

# Split the comma-separated platforms; empty / missing values become NaN.
df[col] = df[col].apply(string_to_list_2)

# Every remaining non-null cell is a list, so dropping nulls is the only
# filter needed.
col_list = df[col].dropna().tolist()
# Sort the distinct platforms so the dummy columns are created in the same
# order on every run (iteration order of a set is arbitrary).
lst = sorted(set(item for items in col_list for item in items))
df = create_dummy_columns_2(df, col, lst)[0]

In [None]:
col = 'c_lead_sources'

# Split the comma-separated sources; empty / missing values become NaN.
df[col] = df[col].apply(string_to_list_2)

# Every remaining non-null cell is a list, so dropping nulls is the only
# filter needed.
col_list = df[col].dropna().tolist()
# Sort the distinct sources so the dummy columns (and `source_cols`) come out
# in the same order on every run (iteration order of a set is arbitrary).
lst = sorted(set(item for items in col_list for item in items))
df, source_cols = create_dummy_columns_2(df, col, lst)

In [None]:
# Lead-source indicator columns grouped into two channel families.
vertical_media = ['c_lead_sources_懂车帝','c_lead_sources_汽车之家','c_lead_sources_途虎','c_lead_sources_易车',
                  'c_lead_sources_17汽车','c_lead_sources_太平洋','c_lead_sources_车享CRM（品牌馆，又叫电商）','c_lead_sources_爱卡']
offcial_online = ['c_lead_sources_名爵APP','c_lead_sources_官网']

# Number of sources per row within each family.
df['c_vertical_media'] = df[vertical_media].sum(axis = 1)
df['c_offcial_online'] = df[offcial_online].sum(axis = 1)

# Rows without any lead-source information get NaN rather than a count.
# The target columns must carry the 'c_' prefix: writing to 'vertical_media' /
# 'offcial_online' only created two stray all-NaN columns and left the real
# aggregates untouched.
no_sources = df['c_lead_sources'].isnull()
df.loc[no_sources, ['c_vertical_media', 'c_offcial_online']] = np.nan

In [None]:
# Row-wise total over all lead-source indicator columns listed in `source_cols`.
# NOTE(review): unlike c_vertical_media / c_offcial_online, rows without lead
# sources are not reset to NaN here -- confirm whether that is intended.
df['c_leads_source_nums'] = df[source_cols].sum(axis =1)

In [None]:
# Final CDP feature set: key and location columns, one status column per
# model (in `models` order), selected platform / source indicators, the two
# channel aggregates and the source count.
cdp_feature_cols = (
    ["mobile", "c_city", "c_province", "c_city_level"]
    + ['c_' + model_name for model_name in models]
    + [
        "c_last_reach_platform_MG服务号",
        "c_last_reach_platform_MGAPP",
        "c_last_reach_platform_MG官网",
        "c_lead_sources_汽车之家",
        "c_vertical_media",
        "c_offcial_online",
        "c_leads_source_nums",
    ]
)
cdp = df[cdp_feature_cols]

In [None]:
# Persist the CDP features to the working directory; the index is written as
# the first (unnamed) column.
cdp.to_csv('cdp.csv')

### 3. Merge 基于汽车之家和CDP Semi-feature 生成的 Feature List

In [None]:
# Reload the CDP features written above; index_col=0 consumes the index column
# that to_csv stored first.
# NOTE(review): read_csv infers dtypes, so an all-digit `mobile` column is
# presumably parsed as an integer here -- verify that it still matches the
# dtype of auto['mobile'] before the two frames are merged on it.
cdp = pd.read_csv('cdp.csv',index_col = 0)

In [None]:
# Remove the raw semi-feature columns now that the derived features exist.
# axis=1 is required: without it drop() looks the names up among the row
# labels and raises KeyError. errors='ignore' keeps the cell re-runnable after
# the columns are already gone.
auto = auto.drop(
    ['h_func_prefer', 'h_car_type_prefer', 'h_config_prefer', 'h_budget', 'h_level'],
    axis=1,
    errors='ignore',
)

In [None]:
# Left-join the CDP features onto the Autohome features by mobile number.
# `auto[['']]` selected a column named '' (KeyError); the whole processed
# frame is what gets merged.
# The key is normalised to text on both sides because `cdp` may have been
# reloaded from CSV, where an all-digit mobile is read back as a number and
# would no longer match a text key.
# NOTE(review): if `mobile` is not unique in either frame, the join repeats
# rows -- confirm uniqueness upstream.
auto_keyed = auto.assign(mobile=auto['mobile'].astype(str))
cdp_keyed = cdp.assign(mobile=cdp['mobile'].astype(str))
semi_feature_processed = auto_keyed.merge(cdp_keyed, how = 'left', on = 'mobile')

In [None]:
# Write the merged feature table without the index column. The path is
# relative to the working directory, so the `leads_scoring_model` folder must
# already exist there.
semi_feature_processed.to_csv('leads_scoring_model/semi_feature_processing.csv',index = False)