In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.sql import Row
import pandas as pd
from sklearn import metrics

In [2]:
import os
os.chdir(r"C:\Users\REGGIE\Desktop\student\数据分析资料\Spark学习\data")

In [3]:
 appname = "GBTClassifier"
master ="local" 
conf = SparkConf().setAppName(appname).setMaster(master)  #spark配置                
spark=SparkSession.builder.config(conf=conf).getOrCreate()#spark实例化

In [4]:
#读取数据
data=spark.read.csv('broadband.csv',header=True)

In [5]:
#转换数据类型
from pyspark.sql.types import IntegerType
for i in data.columns:
    data = data.withColumn(i, data[i].cast(IntegerType()))
    
data.printSchema()

root
 |-- CUST_ID: integer (nullable = true)
 |-- GENDER: integer (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- TENURE: integer (nullable = true)
 |-- CHANNEL: integer (nullable = true)
 |-- AUTOPAY: integer (nullable = true)
 |-- ARPB_3M: integer (nullable = true)
 |-- CALL_PARTY_CNT: integer (nullable = true)
 |-- DAY_MOU: integer (nullable = true)
 |-- AFTERNOON_MOU: integer (nullable = true)
 |-- NIGHT_MOU: integer (nullable = true)
 |-- AVG_CALL_LENGTH: integer (nullable = true)
 |-- BROADBAND: integer (nullable = true)



In [6]:
#构造features和label
dataset= data.rdd.map(lambda x:Row(label=x[-1], features=Vectors.dense(x[1:-1]))).toDF()    
train_num = dataset.count()
print("样本数:{}".format(train_num))

样本数:1114


In [7]:
dataset.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,34.0,27.0,2....|    1|
|[0.0,62.0,58.0,1....|    1|
|[1.0,39.0,55.0,3....|    0|
|[1.0,39.0,55.0,3....|    0|
|[1.0,39.0,55.0,3....|    0|
+--------------------+-----+
only showing top 5 rows



In [8]:
#分割数据集
train_df , test_df = dataset.randomSplit([0.75, 0.25])
print('train set (%d, %d)'%(train_df.count(), len(train_df.columns)))
print('test set (%d, %d)'%(test_df.count(), len(test_df.columns)))

train set (859, 2)
test set (255, 2)


In [12]:
#使用模型
rf = GBTClassifier(maxIter=50, labelCol="label", seed=7)

In [13]:
model = rf.fit(train_df)

In [14]:
#输出模型特征重要性、子树权重
print("模型特征重要性:{}".format(model.featureImportances))
print("模型特征数:{}".format(model.numFeatures))

模型特征重要性:(11,[0,1,2,3,4,5,6,7,8,9,10],[0.03128282466055808,0.1615129242019249,0.1600791739706802,0.03847794808214459,0.029869783871444615,0.21065720491998208,0.10274378955063096,0.07631973881407213,0.09327643341585982,0.045304765666923236,0.050475412845779485])
模型特征数:11


In [15]:
model_predict = model.transform(test_df)

In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator      #auc

GBDT_accuracy = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy').evaluate(model_predict)
GBDT_precision = MulticlassClassificationEvaluator(labelCol='label', metricName='weightedPrecision').evaluate(model_predict)
GBDT_auc = BinaryClassificationEvaluator(labelCol='label').evaluate(model_predict)
print("GBDT's accuracy is %f"%rf_accuracy)
print("GBDT's precision is %f"%rf_precision)
print("GBDT's precision is %f"%rf_auc)

RF's accuracy is 0.925490
RF's precision is 0.922649
RF's precision is 0.875527


In [18]:
model_predict.show(100)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,19.0,11.0,1....|    0|[1.28764563229668...|[0.92925433848003...|       0.0|
|[0.0,19.0,11.0,1....|    0|[1.28764563229668...|[0.92925433848003...|       0.0|
|[0.0,20.0,1.0,1.0...|    0|[1.67236129231659...|[0.96593159436666...|       0.0|
|[0.0,21.0,43.0,2....|    0|[1.30061628881052...|[0.93094086414027...|       0.0|
|[0.0,22.0,42.0,4....|    0|[1.46340091350368...|[0.94915555504048...|       0.0|
|[0.0,22.0,42.0,4....|    0|[1.46340091350368...|[0.94915555504048...|       0.0|
|[0.0,24.0,4.0,1.0...|    0|[1.07523502219277...|[0.89571269258325...|       0.0|
|[0.0,24.0,6.0,3.0...|    0|[0.17840834185133...|[0.58826962511894...|       0.0|
|[0.0,26.0,11.0,4....|    0|[1.76045273381727...|[0.97127677577354...|       0.0|
|[0.0,26.0,11.0,

In [19]:
#sklearn中使用GBDT
from sklearn.model_selection import train_test_split
import sklearn.ensemble as ensemble
import pandas as pd
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV

In [25]:
#读取数据
model_data=pd.read_csv('broadband.csv')

In [26]:
#划分训练集和测试集
X = model_data.iloc[ :,1:-1]
Y = model_data['BROADBAND']
train_data, test_data, train_target, test_target = train_test_split(X,Y,test_size=0.25, train_size=0.75, random_state=12345)
print(len(train_data),len(test_data))

835 279


In [44]:
#GBDT
param_grid = {
    'loss':['deviance','exponential'],
    'learning_rate':[0.1,0.3,0.5,0.7,1],
    'n_estimators':[10,15,20,30,40,50],  #决策树个数-GBDT特有参数
    'max_depth':[1,2,3,4,5,6,7,8],  #单棵树最大深度-GBDT特有参数
    'min_samples_split':[2,4,8,12] 
    
}

In [45]:
gbc = ensemble.GradientBoostingClassifier()
gbccv = GridSearchCV(estimator=gbc, param_grid=param_grid, scoring='roc_auc', cv=4)
gbccv.fit(train_data, train_target)

GridSearchCV(cv=4, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_c...
                 

In [46]:
predict = gbccv.predict(test_data)

In [47]:
print("gradient boosting accuracy:")
print(metrics.classification_report(test_target,predict))
print("gradient boosting AUC:")
fpr_test, tpr_test, th_test = metrics.roc_curve(test_target, predict)
print('AUC = %.4f' %metrics.auc(fpr_test, tpr_test))

gradient boosting accuracy:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       229
           1       0.78      0.50      0.61        50

    accuracy                           0.89       279
   macro avg       0.84      0.73      0.77       279
weighted avg       0.88      0.89      0.87       279

gradient boosting AUC:
AUC = 0.7347


In [48]:
#%%
gbccv.best_params_
#确认模型最佳参数，可以使用最佳参数，进行最优模型的建立

{'learning_rate': 0.5,
 'loss': 'exponential',
 'max_depth': 7,
 'min_samples_split': 4,
 'n_estimators': 50}