In [59]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import Row
import pandas as pd
from sklearn import metrics

In [4]:
import os
os.chdir(r"C:\Users\REGGIE\Desktop\student\数据分析资料\Spark学习\data")

In [61]:
appname = "RandomForestClassifier"
master ="local[4]" 
conf = SparkConf().setAppName(appname).setMaster(master)  #spark配置                
spark=SparkSession.builder.config(conf=conf).getOrCreate()#spark实例化
#读取数据
data=spark.read.csv('broadband.csv',header=True)

In [62]:
data

DataFrame[CUST_ID: string, GENDER: string, AGE: string, TENURE: string, CHANNEL: string, AUTOPAY: string, ARPB_3M: string, CALL_PARTY_CNT: string, DAY_MOU: string, AFTERNOON_MOU: string, NIGHT_MOU: string, AVG_CALL_LENGTH: string, BROADBAND: string]

In [63]:
#转换数据类型
from pyspark.sql.types import IntegerType
for i in data.columns:
    data = data.withColumn(i, data[i].cast(IntegerType()))
    
data.printSchema()

root
 |-- CUST_ID: integer (nullable = true)
 |-- GENDER: integer (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- TENURE: integer (nullable = true)
 |-- CHANNEL: integer (nullable = true)
 |-- AUTOPAY: integer (nullable = true)
 |-- ARPB_3M: integer (nullable = true)
 |-- CALL_PARTY_CNT: integer (nullable = true)
 |-- DAY_MOU: integer (nullable = true)
 |-- AFTERNOON_MOU: integer (nullable = true)
 |-- NIGHT_MOU: integer (nullable = true)
 |-- AVG_CALL_LENGTH: integer (nullable = true)
 |-- BROADBAND: integer (nullable = true)



In [64]:
#构造features和label
dataset= data.rdd.map(lambda x:Row(label=x[-1], features=Vectors.dense(x[1:-1]))).toDF()    
train_num = dataset.count()
print("样本数:{}".format(train_num))

样本数:1114


In [65]:
dataset.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,34.0,27.0,2....|    1|
|[0.0,62.0,58.0,1....|    1|
|[1.0,39.0,55.0,3....|    0|
|[1.0,39.0,55.0,3....|    0|
|[1.0,39.0,55.0,3....|    0|
+--------------------+-----+
only showing top 5 rows



In [66]:
#分割数据集
train_df , test_df = dataset.randomSplit([0.75, 0.25])
print('train set (%d, %d)'%(train_df.count(), len(train_df.columns)))
print('test set (%d, %d)'%(test_df.count(), len(test_df.columns)))

train set (843, 2)
test set (271, 2)


In [67]:
#使用模型
rf = RandomForestClassifier(labelCol='label', numTrees=50)

In [70]:
model = rf.fit(train_df)

In [72]:
model_predict = model.transform(test_df)

In [77]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator      #auc

rf_accuracy = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy').evaluate(model_predict)
rf_precision = MulticlassClassificationEvaluator(labelCol='label', metricName='weightedPrecision').evaluate(model_predict)
rf_auc = BinaryClassificationEvaluator(labelCol='label').evaluate(model_predict)
print("RF's accuracy is %f"%rf_accuracy)
print("RF's precision is %f"%rf_precision)
print("RF's precision is %f"%rf_auc)

RF's accuracy is 0.900369
RF's precision is 0.910918
RF's precision is 0.861179


In [78]:
model_predict.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,19.0,11.0,1....|    0|[44.7001148106879...|[0.89400229621375...|       0.0|
|[0.0,19.0,11.0,1....|    0|[44.7001148106879...|[0.89400229621375...|       0.0|
|[0.0,21.0,43.0,2....|    0|[37.9538126278311...|[0.75907625255662...|       0.0|
|[0.0,21.0,43.0,2....|    0|[37.9538126278311...|[0.75907625255662...|       0.0|
|[0.0,22.0,23.0,3....|    1|[41.3337303864649...|[0.82667460772929...|       0.0|
|[0.0,22.0,42.0,4....|    0|[42.0530765933963...|[0.84106153186792...|       0.0|
|[0.0,24.0,4.0,1.0...|    0|[44.0543265939989...|[0.88108653187997...|       0.0|
|[0.0,24.0,39.0,1....|    0|[35.9543248683327...|[0.71908649736665...|       0.0|
|[0.0,24.0,39.0,1....|    0|[35.9543248683327...|[0.71908649736665...|       0.0|
|[0.0,24.0,39.0,

In [5]:
#sklearn中的随机森林
#宽带营销的数据"broadband.csv"
from sklearn.model_selection import train_test_split
import sklearn.ensemble as ensemble
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

In [6]:
model_data = pd.read_csv("broadband.csv")
model_data.head()

Unnamed: 0,CUST_ID,GENDER,AGE,TENURE,CHANNEL,AUTOPAY,ARPB_3M,CALL_PARTY_CNT,DAY_MOU,AFTERNOON_MOU,NIGHT_MOU,AVG_CALL_LENGTH,BROADBAND
0,63,1,34,27,2,0,203,0,0.0,0.0,0.0,3.04,1
1,64,0,62,58,1,0,360,0,0.0,1910.0,0.0,3.3,1
2,65,1,39,55,3,0,304,0,437.2,200.3,0.0,4.92,0
3,66,1,39,55,3,0,304,0,437.2,182.8,0.0,4.92,0
4,67,1,39,55,3,0,304,0,437.2,214.5,0.0,4.92,0


In [30]:
#划分训练集和测试集
X = model_data.iloc[ :,1:-1]
Y = model_data['BROADBAND']
train_data, test_data, train_target, test_target = train_test_split(X,Y,test_size=0.25, train_size=0.75, random_state=12345)
print(len(train_data),len(test_data))

835 279


In [51]:
#带网格搜索的随机森林
#随机森林
param_grid = {
    'criterion':['entropy','gini'],
    'max_depth':[7,8,10,12,14,16,20],
    'n_estimators':[11,13,15,17,19,21,23],  #决策树个数-随机森林特有参数
    'max_features':[0.2,0.3,0.4,0.5,0.6], #每棵决策树使用的变量占比-随机森林特有参数
    'min_samples_split':[2,4,6,8,12,16] 
}

rfc = ensemble.RandomForestClassifier()
rfccv = GridSearchCV(estimator=rfc, param_grid=param_grid, scoring='roc_auc', cv=4)
rfccv.fit(train_data, train_target)

GridSearchCV(cv=4, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,...
                                              random_state=None, verbose=0,
                                   

In [52]:
predict = rfccv.predict(test_data)

In [53]:
print("random forest accuracy:")
print(metrics.classification_report(test_target,predict))
print("random forest AUC:")
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, predict)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))

random forest accuracy:
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       229
           1       0.81      0.52      0.63        50

    accuracy                           0.89       279
   macro avg       0.86      0.75      0.79       279
weighted avg       0.89      0.89      0.88       279

random forest AUC:
AUC = 0.7469


In [54]:
#查看最佳参数
rfccv.best_params_

{'criterion': 'gini',
 'max_depth': 16,
 'max_features': 0.4,
 'min_samples_split': 2,
 'n_estimators': 19}