In [2]:
import findspark
findspark.init("/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/spark")

import os
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_181-cloudera"

from pyspark import SparkContext
from pyspark.sql import SparkSession,HiveContext,Window
from pyspark.sql import functions as fn
from pyspark.sql.types import IntegerType, FloatType, DoubleType, ArrayType, StringType, DecimalType,MapType

spark_session = SparkSession.builder.enableHiveSupport().appName("test").config("spark.driver.memory","30g").getOrCreate()
hc = HiveContext(spark_session.sparkContext)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [4]:
# load column names and comments
wide_info_list = hc.sql('DESCRIBE marketing_modeling.mm_big_wide_info').toPandas()
feature_list = hc.sql('DESCRIBE marketing_modeling.wide_info_feature').toPandas()
feature_list = feature_list[['col_name']].merge(wide_info_list[['col_name','comment']], how = 'inner', on = 'col_name')

In [20]:
#feature_list.to_csv('feature_list.csv',encoding='utf-8')

##### Selection of ids with leads sources of autohome

In [3]:
wide_info_all = pd.read_csv('/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/wide_info_all.csv', delimiter = ',', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [4]:
len(wide_info_all)

1991510

In [6]:
wide_info_all['mobile'] = wide_info_all['mobile'].apply(lambda x: str(x))

In [7]:
len(wide_info_all)

1991510

In [8]:
auto_info_all = wide_info_all[wide_info_all['c_lead_sources_汽车之家']==1]

In [9]:
len(auto_info_all)

644936

##### Selection of  ids in autohome feature table

In [10]:
cdp_feature_list_ad = ["c_city","c_province","c_city_level","c_MG350（海外销售）","c_MGeHS","c_MGGT","c_MG锐腾","c_MG550（海外销售）","c_MGeMGHS",
"c_MGHS","c_MG360（海外销售）","c_MGRX5（海外销售）","c_MGeMG6","c_MG7","c_MG750（海外销售）","c_MG5","c_MGZS纯电动","c_MGZS","c_MG3",
"c_MG新MG5（海外销售）","c_MGTF","c_MG6",
"c_last_reach_platform_MG服务号","c_last_reach_platform_MGAPP","c_last_reach_platform_MG官网","c_lead_sources_汽车之家",
"c_vertical_media","c_offcial_online",
"c_leads_source_nums"]

autohome_feature_list_ad = ['h_func_prefer1', 'h_func_prefer2', 'h_func_prefer3','h_func_prefer4', 'h_func_prefer5', 'h_func_prefer6',
                            'h_func_prefer7', 'h_func_prefer8', 'h_func_prefer9','h_car_type_prefer1', 'h_car_type_prefer2', 
                            'h_car_type_prefer3','h_car_type_prefer4', 'h_car_type_prefer5', 'h_car_type_prefer6','h_car_type_prefer7', 
                            'h_car_type_prefer8', 'h_car_type_prefer9','h_car_type_prefer10', 'h_car_type_prefer11', 'h_car_type_prefer12',
                            'h_car_type_prefer13', 'h_car_type_prefer14', 'h_car_type_prefer15','h_car_type_prefer16', 'h_config_prefer1', 
                            'h_config_prefer2', 'h_config_prefer3', 'h_config_prefer4', 'h_config_prefer5', 'h_config_prefer6', 
                            'h_config_prefer7', 'h_config_prefer8', 'h_config_prefer9', 'h_config_prefer10', 'h_config_prefer11',
                            'h_config_prefer12', 'h_config_prefer13', 'h_config_prefer14','h_config_prefer15', 'h_config_prefer16', 
                            'h_config_prefer17', 'h_config_prefer18', 'h_budget_min', 'h_budget_min_cat', 'h_budget_max', 'h_budget_max_cat', 
                            'h_level_cat']

# DLM小宽表
wide_info_dlm = wide_info_all[[i.encode('utf-8') for i in list(feature_list[feature_list['col_name'].str.contains('d_')]['col_name'])]]
#wide_info_dlm['feature_vol_dlm'] = wide_info_dlm.shape[1] - wide_info_dlm.isnull().sum(axis=1) - 2

# autohome小宽表
wide_info_autohome = wide_info_all[autohome_feature_list_ad + \
                                   [i.encode('utf-8') for i in list(feature_list[feature_list['col_name'].str.contains('h_')]['col_name'])]]
#wide_info_autohome['feature_vol_autohome'] = wide_info_autohome.shape[1] - wide_info_autohome.isnull().sum(axis=1) - 2

# CDP小宽表 
wide_info_cdp = wide_info_all[cdp_feature_list_ad + \
                              [i.encode('utf-8') for i in list(feature_list[(feature_list['col_name'].str.contains('c_') == True) & \
                                                                        (feature_list['col_name'].str.contains('d_') == False)]['col_name'])]]
#wide_info_cdp['feature_vol_cdp'] = wide_info_cdp.shape[1] - wide_info_cdp.isnull().sum(axis=1) - 2

In [15]:
auto_miss = auto_info_all[auto_info_all[wide_info_autohome.columns.to_list()].isnull().T.all()==True]

In [16]:
auto_selection = auto_info_all[auto_info_all['mobile'].isin(auto_miss['mobile'].to_list())==False]

In [17]:
len(auto_selection)

162254

In [21]:
len(auto_miss)

482682

In [22]:
auto_selection.to_csv('/home/mam_jupyter/jupyter_dir/artefact/leads_scoring_model/wide_info_auto.csv')

##### get the d_last_leads_time

In [26]:
llt = hc.sql('''
SELECT 
    mobile,
    d_last_leads_time
FROM marketing_modeling.mm_big_wide_info
WHERE 
    SUBSTR(mobile, 0, 1) = 1
    AND LENGTH(mobile) = 11
    AND d_deal_flag is not null
''').toPandas()

In [27]:
len(llt)

1991510

In [28]:
llt['mobile']=llt['mobile'].apply(lambda x:str(x))

In [44]:
auto_miss = auto_miss.merge(llt,left_on = 'mobile',right_on = 'mobile')

In [46]:
len(auto_miss[auto_miss['d_last_leads_time'].isnull()==True])

48201

In [59]:
auto_miss_notnull = auto_miss[auto_miss['d_last_leads_time'].isnull()==False]

In [47]:
auto_miss['d_last_leads_time'].max()

Timestamp('2020-05-31 22:56:19')

In [60]:
auto_miss_notnull['d_last_leads_month']=auto_miss_notnull['d_last_leads_time'].apply(lambda x: x.strftime('%Y-%m'))

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/cloudera/parcels/CDH-6.3.1-1.cdh6.3.1.p0.1470567/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
Py4JNetworkError: Error while receiving
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [61]:
auto_miss_notnull['d_last_leads_month'].value_counts()

2018-04    31119
2018-01    29121
2018-03    28999
2018-05    26126
2018-09    25706
2018-06    24320
2018-10    23596
2018-12    23357
2018-08    22772
2018-02    22338
2018-11    22101
2018-07    21805
2019-01    20904
2019-04    18223
2019-03    17615
2019-02    17015
2019-06     8631
2019-05     7765
2019-07     6753
2019-10     6024
2019-08     5896
2019-09     5201
2019-11     4328
2019-12     4077
2020-03     2719
2020-01     2691
2020-02     2353
2020-04     1878
2020-05     1048
Name: d_last_leads_month, dtype: int64