In [3]:
import findspark
findspark.init("/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/spark")

import os
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_181-cloudera"

from pyspark import SparkContext
from pyspark.sql import SparkSession,HiveContext,Window
from pyspark.sql import functions as fn
from pyspark.sql.types import IntegerType, FloatType, DoubleType, ArrayType, StringType, DecimalType,MapType

spark_session = SparkSession.builder.enableHiveSupport().appName("test").config("spark.driver.memory","30g").getOrCreate()
hc = HiveContext(spark_session.sparkContext)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [5]:
# load column names and comments
wide_info_list = hc.sql('DESCRIBE marketing_modeling.mm_big_wide_info').toPandas()
feature_list = hc.sql('DESCRIBE marketing_modeling.wide_info_feature').toPandas()
feature_list = feature_list[['col_name']].merge(wide_info_list[['col_name','comment']], how = 'inner', on = 'col_name')

In [None]:
# load wide-info data
#wide_info = hc.sql('SElECT * FROM marketing_modeling.wide_info_feature').toPandas()

path = '/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/wide_info_feature.csv'
wide_info = pd.read_csv(path, delimiter = '\t', header = None, names = feature_list['col_name'] )
wide_info['mobile'] = wide_info['mobile'].astype(str)

# load processed semi-feature data
semi_feature_processing = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/Semi-feature处理/semi_feature_processing.csv', delimiter = ',')
semi_feature_processing['mobile'] = semi_feature_processing['mobile'].astype(str)
wide_info_all = wide_info.merge(semi_feature_processing, how = 'left', on = 'mobile')
wide_info_all['c_lead_sources_汽车之家'] = wide_info_all['c_lead_sources_汽车之家'].fillna(0)

In [6]:
#wide_info_all.to_csv('/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/wide_info_all.csv')
wide_info_all = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/wide_info_all.csv', delimiter = ',', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [None]:
wide_info_all.shape

### 0. Feature Preprocessing

### 1. 基本情况

In [None]:
wide_info_all[['d_deal_flag','mobile']].groupby(['d_deal_flag']).count()

In [None]:
wide_info_all[['d_deal_flag','c_lead_sources_汽车之家', 'mobile']].groupby(['d_deal_flag', 'c_lead_sources_汽车之家']).count()

In [None]:
wide_info_all['c_lead_sources_汽车之家'].value_counts()

### 2. 特征覆盖度检查

##### 1) 检查每个用户的feature覆盖率, by 线索来源和成交标签

In [8]:
cdp_feature_list_ad = ["c_city","c_province","c_city_level","c_MG350（海外销售）","c_MGeHS","c_MGGT","c_MG锐腾","c_MG550（海外销售）","c_MGeMGHS",
"c_MGHS","c_MG360（海外销售）","c_MGRX5（海外销售）","c_MGeMG6","c_MG7","c_MG750（海外销售）","c_MG5","c_MGZS纯电动","c_MGZS","c_MG3",
"c_MG新MG5（海外销售）","c_MGTF","c_MG6",
"c_last_reach_platform_MG服务号","c_last_reach_platform_MGAPP","c_last_reach_platform_MG官网","c_lead_sources_汽车之家",
"c_vertical_media","c_offcial_online",
"c_leads_source_nums"]

autohome_feature_list_ad = ['h_func_prefer1', 'h_func_prefer2', 'h_func_prefer3','h_func_prefer4', 'h_func_prefer5', 'h_func_prefer6',
                            'h_func_prefer7', 'h_func_prefer8', 'h_func_prefer9','h_car_type_prefer1', 'h_car_type_prefer2', 
                            'h_car_type_prefer3','h_car_type_prefer4', 'h_car_type_prefer5', 'h_car_type_prefer6','h_car_type_prefer7', 
                            'h_car_type_prefer8', 'h_car_type_prefer9','h_car_type_prefer10', 'h_car_type_prefer11', 'h_car_type_prefer12',
                            'h_car_type_prefer13', 'h_car_type_prefer14', 'h_car_type_prefer15','h_car_type_prefer16', 'h_config_prefer1', 
                            'h_config_prefer2', 'h_config_prefer3', 'h_config_prefer4', 'h_config_prefer5', 'h_config_prefer6', 
                            'h_config_prefer7', 'h_config_prefer8', 'h_config_prefer9', 'h_config_prefer10', 'h_config_prefer11',
                            'h_config_prefer12', 'h_config_prefer13', 'h_config_prefer14','h_config_prefer15', 'h_config_prefer16', 
                            'h_config_prefer17', 'h_config_prefer18', 'h_budget_min', 'h_budget_min_cat', 'h_budget_max', 'h_budget_max_cat', 
                            'h_level_cat']

In [9]:
# caculate feature volumn
bins_wide =[-1, 0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200]
bins =[-1, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# 大宽表
wide_info_all['feature_vol_all'] = wide_info_all.shape[1] - wide_info_all.isnull().sum(axis=1) - 2
wide_info_all['feature_vol_all_bin'] = pd.cut(wide_info_all['feature_vol_all'], bins = bins_wide)

# DLM小宽表
wide_info_dlm = wide_info_all[['mobile'] + [i.encode('utf-8') for i in list(feature_list[feature_list['col_name'].str.contains('d_')]['col_name'])]]
wide_info_dlm['feature_vol_dlm'] = wide_info_dlm.shape[1] - wide_info_dlm.isnull().sum(axis=1) - 2
wide_info_dlm['feature_vol_dlm_bin'] = pd.cut(wide_info_dlm['feature_vol_dlm'], bins = bins)

# autohome小宽表
wide_info_autohome = wide_info_all[['mobile', 'd_deal_flag'] + autohome_feature_list_ad + \
                                   [i.encode('utf-8') for i in list(feature_list[feature_list['col_name'].str.contains('h_')]['col_name'])]]
wide_info_autohome['feature_vol_autohome'] = wide_info_autohome.shape[1] - wide_info_autohome.isnull().sum(axis=1) - 2
wide_info_autohome['feature_vol_autohome_bin'] = pd.cut(wide_info_autohome['feature_vol_autohome'], bins = bins)

# CDP小宽表 
wide_info_cdp = wide_info_all[['mobile', 'd_deal_flag'] + cdp_feature_list_ad + \
                              [i.encode('utf-8') for i in list(feature_list[(feature_list['col_name'].str.contains('c_') == True) & \
                                                                        (feature_list['col_name'].str.contains('d_') == False)]['col_name'])]]
wide_info_cdp['feature_vol_cdp'] = wide_info_cdp.shape[1] - wide_info_cdp.isnull().sum(axis=1) - 2
wide_info_cdp['feature_vol_cdp_bin'] = pd.cut(wide_info_cdp['feature_vol_cdp'], bins = bins)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

In [None]:
def count_distribution(df, col1, col2):
    '''
    Calculate the distribution group by given columns given:
    df(dataframe): input dataframe;
    col1(string): column used to group by
    col2(string): column used to group by    
    '''
    dis_df = df[['mobile',col1, col2]].groupby([col1, col2], as_index = False).count().\
    merge(df[['mobile',col1]].groupby([col1], as_index = False).count(), how = 'left', on = col1)    
    dis_df['rate'] = dis_df['mobile_x'] / dis_df['mobile_y']
    return dis_df

In [None]:
# distribution by deal_flag
plt.figure(figsize=(10, 25))

plt.subplot(411)
dis_all = count_distribution(wide_info_all, 'd_deal_flag', 'feature_vol_all_bin')
sns.barplot( x = 'feature_vol_all_bin', y = 'rate', hue = 'd_deal_flag', data = dis_all, \
           palette = sns.diverging_palette(220, 20, n = 5)).set_title("# of Features Distribution in Wide-info")
plt.legend(loc = 1)


plt.subplot(412)
dis_cdp = count_distribution(wide_info_cdp, 'd_deal_flag', 'feature_vol_cdp_bin')
sns.barplot( x = 'feature_vol_cdp_bin', y = 'rate', hue = 'd_deal_flag', data = dis_cdp, \
           palette = sns.diverging_palette(220, 20, n = 5)).set_title("# of Features Distribution in CDP")
plt.legend(loc = 1)


plt.subplot(413)
dis_dlm = count_distribution(wide_info_dlm, 'd_deal_flag', 'feature_vol_dlm_bin')
sns.barplot( x = 'feature_vol_dlm_bin', y = 'rate', hue = 'd_deal_flag', data = dis_dlm, \
           palette = sns.diverging_palette(220, 20, n = 5)).set_title("# of Features Distribution in DLM")
plt.legend(loc = 1)


plt.subplot(414)
dis_autohome = count_distribution(wide_info_autohome, 'd_deal_flag', 'feature_vol_autohome_bin')
sns.barplot( x = 'feature_vol_autohome_bin', y = 'rate', hue = 'd_deal_flag', data = dis_autohome, \
           palette = sns.diverging_palette(220, 20, n = 5)).set_title("# of Features Distribution in Autohome")
plt.legend(loc = 1)


plt.show()

In [None]:
# distribution by lead source
plt.figure(figsize=(12, 5))

dis_all = count_distribution(wide_info_all, 'c_lead_sources_汽车之家', 'feature_vol_all_bin')
sns.barplot( x = 'feature_vol_all_bin', y = 'rate', hue = 'c_lead_sources_汽车之家', data = dis_all, \
           palette = sns.diverging_palette(220, 20, n = 5)).set_title("# of Features Distribution in Wide-info (if autohome)")
plt.legend(loc = 1)

plt.show()

##### 2) 用户身上平均feature数量 by 成交/战败 and 线索来源 

In [None]:
wide_info_all[['c_lead_sources_汽车之家', 'feature_vol_all']].groupby(['c_lead_sources_汽车之家'], as_index = False).mean()

In [None]:
wide_info_all[['c_lead_sources_汽车之家','d_deal_flag', 'feature_vol_all']].groupby(['c_lead_sources_汽车之家', 'd_deal_flag'], as_index = False).mean()

In [None]:
wide_info_all[['d_deal_flag', 'feature_vol_all']].groupby(['d_deal_flag'], as_index = False).mean()

##### 从feature角度出发检查每个feature的覆盖率

In [10]:
def cov_rate(data, name):
    '''
    Return a dataframe contains coverage rate of all features in a dataframe given:
    data(dataframe): input dataframe;
    name(string): name of output column
    '''
    cov_df = pd.DataFrame((data.shape[0] - data.isna().sum()) / data.shape[0] * 100, columns=[name]).reset_index()
    return cov_df

In [11]:
autohome_id = pd.DataFrame(wide_info_autohome[wide_info_autohome['feature_vol_autohome'] > 0]['mobile'])

feature_cov = cov_rate(wide_info_all, 'cov_rate_all').\
merge(cov_rate(wide_info_all[wide_info_all['d_deal_flag'] == 1], 'cov_rate_deal'), how = 'left').\
merge(cov_rate(wide_info_all[wide_info_all['d_deal_flag'] == 0], 'cov_rate_failed'), how = 'left').\
merge(cov_rate(wide_info_all[(wide_info_all['c_lead_sources_汽车之家'] == 1) & (wide_info_all['mobile'].isin(autohome_id['mobile']))], 'cov_rate_autohome'), how = 'left').\
merge(cov_rate(wide_info_all[wide_info_all['c_lead_sources_汽车之家'] == 0], 'cov_rate_others'), how = 'left')

feature_cov.to_csv('cov_rate.csv', index = False)

In [14]:
# drop features whose cov rate <= 10%
feature_cov[feature_cov['cov_rate_autohome'] <= 10].sort_values(by = 'cov_rate_autohome')[['index','cov_rate_autohome']].reset_index(drop = True)\
#.to_csv('autohome_drop_low_cov.csv', index = False)

Unnamed: 0,index,cov_rate_autohome
0,d_dealf_succ_lastvisit_diff,0.038212
1,d_lastfollow_dealf_diff,0.497368
2,d_firfollow_dealf_diff,0.497368
3,c_last_reach_platform_MG官网,1.252974
4,c_last_reach_platform_MGAPP,1.252974
5,c_last_reach_platform_MG服务号,1.252974
6,d_fir_order_trail_diff,1.477313
7,d_dealf_succ_firvisit_diff,1.997485
8,d_fir_order_visit_diff,2.554637
9,d_trail_count_d30,2.618734


In [15]:
# check distribution of features
feature_cov[feature_cov['cov_rate_others'] <= 10].sort_values(by = 'cov_rate_others')[['index','cov_rate_others']].reset_index(drop = True)\
#.to_csv('others_drop_low_cov.csv', index = False)

Unnamed: 0,index,cov_rate_others
0,h_car_type_prefer12,0.044780
1,h_car_type_prefer14,0.044780
2,h_car_type_prefer4,0.044780
3,h_car_type_prefer3,0.044780
4,h_car_type_prefer2,0.044780
5,h_car_type_prefer1,0.044780
6,h_car_type_prefer11,0.044780
7,h_car_type_prefer15,0.044780
8,h_car_type_prefer13,0.044780
9,h_car_type_prefer6,0.044780


##### feature初步选择

In [33]:
# drop autohome features
wide_info_others = wide_info_all[wide_info_all['c_lead_sources_汽车之家'] == 0].\
drop(['feature_vol_all', 'feature_vol_all_bin'] + autohome_feature_list_ad + [i.encode('utf-8') for i in list(feature_list[feature_list['col_name'].str.contains('h_')]['col_name'])], axis = 1)

In [34]:
wide_info_others.to_csv('/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/wide_info_others.csv', index = False)