In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('log_reg').getOrCreate()

In [2]:
df=spark.read.csv('log_reg.csv',inferSchema=True,header=True)

In [3]:
from pyspark.sql.functions import *

In [4]:
print((df.count(),len(df.columns)))

(15056, 6)


In [5]:
df.printSchema()

root
 |-- 城市: string (nullable = true)
 |-- 年龄: integer (nullable = true)
 |-- 是否第一次访问: integer (nullable = true)
 |-- 电商平台: string (nullable = true)
 |-- 搜索产品数: integer (nullable = true)
 |-- 是否购买: integer (nullable = true)



In [6]:
df.show(5)

+----+----+--------------+--------+----------+--------+
|城市|年龄|是否第一次访问|电商平台|搜索产品数|是否购买|
+----+----+--------------+--------+----------+--------+
|广州|  38|             1|  TaoBao|        12|       0|
|深圳|  29|             1|  TaoBao|        11|       0|
|深圳|  37|             1|JingDong|         2|       0|
|上海|  32|             1|  SuNing|         1|       0|
|北京|  33|             1|JingDong|         3|       0|
+----+----+--------------+--------+----------+--------+
only showing top 5 rows



In [7]:
df.describe().show()

+-------+-----+-----------------+------------------+--------+-----------------+-------------------+
|summary| 城市|             年龄|    是否第一次访问|电商平台|       搜索产品数|           是否购买|
+-------+-----+-----------------+------------------+--------+-----------------+-------------------+
|  count|15056|            15056|             15056|   15056|            15056|              15056|
|   mean| null|28.51268597236982|0.5001992561105207|    null|9.529357066950054|0.49787460148777896|
| stddev| null|7.878922594747357|0.5000165657987499|    null|6.075813641256771| 0.5000120880138007|
|    min| 上海|               14|                 0|JingDong|                1|                  0|
|    max| 深圳|              111|                 1|  TaoBao|               29|                  1|
+-------+-----+-----------------+------------------+--------+-----------------+-------------------+



In [8]:
df.groupBy('城市').count().show()

+----+-----+
|城市|count|
+----+-----+
|深圳| 1947|
|上海| 9184|
|北京|  887|
|广州| 3038|
+----+-----+



In [9]:
df.groupBy('电商平台').count().show()

+--------+-----+
|电商平台|count|
+--------+-----+
|JingDong| 4321|
|  TaoBao| 7432|
|  SuNing| 3303|
+--------+-----+



In [10]:
df.groupBy('是否购买').count().show()

+--------+-----+
|是否购买|count|
+--------+-----+
|       1| 7496|
|       0| 7560|
+--------+-----+



In [11]:
df.groupBy('城市').mean().show()

+----+------------------+-------------------+------------------+-------------------+
|城市|         avg(年龄)|avg(是否第一次访问)|   avg(搜索产品数)|      avg(是否购买)|
+----+------------------+-------------------+------------------+-------------------+
|深圳| 30.26707755521315| 0.3261427837699024| 4.927067282999486|0.03697996918335902|
|上海|28.408101045296167|  0.518074912891986| 9.976698606271777| 0.5431184668989547|
|北京|27.704622322435174|  0.560315670800451|11.082299887260428|  0.644870349492672|
|广州| 27.94042132982225| 0.5401579986833444|10.673140223831467| 0.6135615536537196|
+----+------------------+-------------------+------------------+-------------------+



In [12]:
df.groupBy('电商平台').mean().show()

+--------+------------------+-------------------+-----------------+-------------------+
|电商平台|         avg(年龄)|avg(是否第一次访问)|  avg(搜索产品数)|      avg(是否购买)|
+--------+------------------+-------------------+-----------------+-------------------+
|JingDong|28.370284656329552| 0.5123813931960194|9.809766257810692| 0.5223327933348761|
|  TaoBao|28.560145317545746|  0.503767491926803|9.515473627556512| 0.5006727664155005|
|  SuNing|28.592188919164396| 0.4762337269149258|9.193763245534363|0.45958219800181654|
+--------+------------------+-------------------+-----------------+-------------------+



In [13]:
from pyspark.ml.feature import StringIndexer

In [14]:
commerce_indexer = StringIndexer(inputCol="电商平台", outputCol="电商平台索引").fit(df)
df = commerce_indexer.transform(df)

In [15]:
df.show(10)

+----+----+--------------+--------+----------+--------+------------+
|城市|年龄|是否第一次访问|电商平台|搜索产品数|是否购买|电商平台索引|
+----+----+--------------+--------+----------+--------+------------+
|广州|  38|             1|  TaoBao|        12|       0|         0.0|
|深圳|  29|             1|  TaoBao|        11|       0|         0.0|
|深圳|  37|             1|JingDong|         2|       0|         1.0|
|上海|  32|             1|  SuNing|         1|       0|         2.0|
|北京|  33|             1|JingDong|         3|       0|         1.0|
|深圳|  33|             1|JingDong|        18|       0|         1.0|
|深圳|  29|             0|JingDong|         9|       0|         1.0|
|上海|  24|             1|JingDong|         5|       1|         1.0|
|上海|  29|             0|  TaoBao|        10|       0|         0.0|
|上海|  28|             0|  SuNing|         3|       1|         2.0|
+----+----+--------------+--------+----------+--------+------------+
only showing top 10 rows



In [16]:
from pyspark.ml.feature import OneHotEncoder

In [17]:
commerce_vector = OneHotEncoder(inputCol="电商平台索引", outputCol="电商平台索引向量")
df = commerce_vector.transform(df)

In [18]:
df.show(3,False)

+----+----+--------------+--------+----------+--------+------------+----------------+
|城市|年龄|是否第一次访问|电商平台|搜索产品数|是否购买|电商平台索引|电商平台索引向量|
+----+----+--------------+--------+----------+--------+------------+----------------+
|广州|38  |1             |TaoBao  |12        |0       |0.0         |(2,[0],[1.0])   |
|深圳|29  |1             |TaoBao  |11        |0       |0.0         |(2,[0],[1.0])   |
|深圳|37  |1             |JingDong|2         |0       |1.0         |(2,[1],[1.0])   |
+----+----+--------------+--------+----------+--------+------------+----------------+
only showing top 3 rows



In [21]:
df.groupBy('电商平台索引向量').count().orderBy('count',ascending=False).show(5,False)

+----------------+-----+
|电商平台索引向量|count|
+----------------+-----+
|(2,[0],[1.0])   |7432 |
|(2,[1],[1.0])   |4321 |
|(2,[],[])       |3303 |
+----------------+-----+



In [23]:
df.groupBy('电商平台索引向量').count().orderBy('count',ascending=False).show(5,False)

+----------------+-----+
|电商平台索引向量|count|
+----------------+-----+
|(2,[0],[1.0])   |7432 |
|(2,[1],[1.0])   |4321 |
|(2,[],[])       |3303 |
+----------------+-----+



In [28]:
df.show()

+----+----+--------------+--------+----------+--------+------------+----------------+--------+-------------+
|城市|年龄|是否第一次访问|电商平台|搜索产品数|是否购买|电商平台索引|电商平台索引向量|城市索引| 城市索引向量|
+----+----+--------------+--------+----------+--------+------------+----------------+--------+-------------+
|广州|  38|             1|  TaoBao|        12|       0|         0.0|   (2,[0],[1.0])|     1.0|(3,[1],[1.0])|
|深圳|  29|             1|  TaoBao|        11|       0|         0.0|   (2,[0],[1.0])|     2.0|(3,[2],[1.0])|
|深圳|  37|             1|JingDong|         2|       0|         1.0|   (2,[1],[1.0])|     2.0|(3,[2],[1.0])|
|上海|  32|             1|  SuNing|         1|       0|         2.0|       (2,[],[])|     0.0|(3,[0],[1.0])|
|北京|  33|             1|JingDong|         3|       0|         1.0|   (2,[1],[1.0])|     3.0|    (3,[],[])|
|深圳|  33|             1|JingDong|        18|       0|         1.0|   (2,[1],[1.0])|     2.0|(3,[2],[1.0])|
|深圳|  29|             0|JingDong|         9|       0|         1.0|   (2,[1],[1.

In [30]:
from pyspark.ml.feature import VectorAssembler

In [31]:
df_assembler = VectorAssembler(inputCols=['电商平台索引向量','城市索引向量','年龄', '是否第一次访问','搜索产品数'], outputCol="特征值向量")
df = df_assembler.transform(df)

In [32]:
df.printSchema()

root
 |-- 城市: string (nullable = true)
 |-- 年龄: integer (nullable = true)
 |-- 是否第一次访问: integer (nullable = true)
 |-- 电商平台: string (nullable = true)
 |-- 搜索产品数: integer (nullable = true)
 |-- 是否购买: integer (nullable = true)
 |-- 电商平台索引: double (nullable = false)
 |-- 电商平台索引向量: vector (nullable = true)
 |-- 城市索引: double (nullable = false)
 |-- 城市索引向量: vector (nullable = true)
 |-- 特征值向量: vector (nullable = true)



In [33]:
df.select(['特征值向量','是否购买']).show(10,False)

+-----------------------------------+--------+
|特征值向量                         |是否购买|
+-----------------------------------+--------+
|[1.0,0.0,0.0,1.0,0.0,38.0,1.0,12.0]|0       |
|[1.0,0.0,0.0,0.0,1.0,29.0,1.0,11.0]|0       |
|[0.0,1.0,0.0,0.0,1.0,37.0,1.0,2.0] |0       |
|(8,[2,5,6,7],[1.0,32.0,1.0,1.0])   |0       |
|(8,[1,5,6,7],[1.0,33.0,1.0,3.0])   |0       |
|[0.0,1.0,0.0,0.0,1.0,33.0,1.0,18.0]|0       |
|(8,[1,4,5,7],[1.0,1.0,29.0,9.0])   |0       |
|[0.0,1.0,1.0,0.0,0.0,24.0,1.0,5.0] |1       |
|(8,[0,2,5,7],[1.0,1.0,29.0,10.0])  |0       |
|(8,[2,5,7],[1.0,28.0,3.0])         |1       |
+-----------------------------------+--------+
only showing top 10 rows



In [34]:
model_df=df.select(['特征值向量','是否购买'])

In [35]:
from pyspark.ml.classification import LogisticRegression

In [36]:
training_df,test_df=model_df.randomSplit([0.7,0.3])

In [37]:
training_df.count()

10490

In [38]:
training_df.groupBy('是否购买').count().show()

+--------+-----+
|是否购买|count|
+--------+-----+
|       1| 5239|
|       0| 5251|
+--------+-----+



In [39]:
test_df.count()

4566

In [40]:
test_df.groupBy('是否购买').count().show()

+--------+-----+
|是否购买|count|
+--------+-----+
|       1| 2257|
|       0| 2309|
+--------+-----+



In [41]:
log_reg=LogisticRegression(featuresCol='特征值向量',labelCol='是否购买').fit(training_df)

In [42]:
from pyspark.ml import Estimator
from pyspark.ml import Transformer
isinstance(log_reg,Transformer)

True

In [43]:
train_results=log_reg.evaluate(training_df).predictions

In [44]:
train_results.filter(train_results['是否购买']==1).filter(train_results['prediction']==1).select(['是否购买','prediction','probability']).show(10,False)

+--------+----------+----------------------------------------+
|是否购买|prediction|probability                             |
+--------+----------+----------------------------------------+
|1       |1.0       |[0.2967915035494824,0.7032084964505176] |
|1       |1.0       |[0.1681037900675889,0.8318962099324111] |
|1       |1.0       |[0.1681037900675889,0.8318962099324111] |
|1       |1.0       |[0.08821488317232305,0.9117851168276769]|
|1       |1.0       |[0.08821488317232305,0.9117851168276769]|
|1       |1.0       |[0.04427156521457076,0.9557284347854291]|
|1       |1.0       |[0.04427156521457076,0.9557284347854291]|
|1       |1.0       |[0.04427156521457076,0.9557284347854291]|
|1       |1.0       |[0.02169724782921141,0.9783027521707887]|
|1       |1.0       |[0.02169724782921141,0.9783027521707887]|
+--------+----------+----------------------------------------+
only showing top 10 rows



Probability at 0 index is for 0 class and probabilty as 1 index is for 1 class

In [45]:
correct_preds=train_results.filter(train_results['是否购买']==1).filter(train_results['prediction']==1).count()

In [46]:
training_df.filter(training_df['是否购买']==1).count()

5239

In [47]:
float(correct_preds)/(training_df.filter(training_df['是否购买']==1).count())

0.9366291276961252

In [48]:
print('{}{}'.format('预测准确率：',log_reg.evaluate(training_df).accuracy) )  

预测准确率：0.939084842707


In [49]:
#Test Set results
from pyspark.ml.evaluation import BinaryClassificationEvaluator
isinstance(log_reg.evaluate,BinaryClassificationEvaluator)

False

In [50]:
results=log_reg.evaluate(test_df).predictions

In [51]:
results.select(['是否购买','prediction']).show(10,False)

+--------+----------+
|是否购买|prediction|
+--------+----------+
|0       |0.0       |
|0       |0.0       |
|0       |0.0       |
|0       |0.0       |
|0       |0.0       |
|1       |0.0       |
|0       |1.0       |
|1       |1.0       |
|1       |1.0       |
|1       |1.0       |
+--------+----------+
only showing top 10 rows



In [52]:
results.printSchema()

root
 |-- 特征值向量: vector (nullable = true)
 |-- 是否购买: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [53]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
results[results['是否购买']==1]

DataFrame[特征值向量: vector, 是否购买: int, rawPrediction: vector, probability: vector, prediction: double]

In [54]:
#confusion matrix
true_postives = results[(results['是否购买'] == 1) & (results.prediction == 1)].count()
true_negatives = results[(results['是否购买'] == 0) & (results.prediction == 0)].count()
false_positives = results[(results['是否购买'] == 0) & (results.prediction == 1)].count()
false_negatives = results[(results['是否购买'] == 1) & (results.prediction == 0)].count()

In [55]:
print (true_postives)
print (true_negatives)
print (false_positives)
print (false_negatives)
print(true_postives+true_negatives+false_positives+false_negatives)
print (results.count())

2121
2158
151
136
4566
4566


In [56]:
recall = float(true_postives)/(true_postives + false_negatives)
print(recall)

0.93974302171


In [57]:
precision = float(true_postives) / (true_postives + false_positives)
print(precision)

0.933538732394


In [58]:
accuracy=float((true_postives+true_negatives) /(results.count()))
print(accuracy)

0.0


In [59]:
results.count()

4566

In [60]:
true_postives+true_negatives

4279