In [None]:
! pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [None]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar -xzf spark-3.5.1-bin-hadoop3.tgz
!ls /content

--2024-05-05 03:52:43--  https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400446614 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.1-bin-hadoop3.tgz’


2024-05-05 03:53:00 (71.4 MB/s) - ‘spark-3.5.1-bin-hadoop3.tgz’ saved [400446614/400446614]

sample_data  spark-3.5.1-bin-hadoop3  spark-3.5.1-bin-hadoop3.tgz


In [None]:
import os

# Point Spark tooling at the distribution unpacked under /content and at the JDK 11 install.
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# Add Spark's bin directory to PATH only if it is not already there, so re-running this cell
# does not keep appending the same entry.
spark_bin = os.path.join(os.environ["SPARK_HOME"], 'bin')
if spark_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + spark_bin

In [None]:
!pip install findspark
!pip install -q findspark
!pip install py4j
!pip install pyspark



In [None]:
import findspark
# Make the Spark installation importable as `pyspark`.
# NOTE(review): presumably resolved through the SPARK_HOME variable set in an earlier cell — confirm.
findspark.init()

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import Row
from pyspark.sql.functions import col, isnan, when, count

import pandas as pd
from sklearn import metrics
from pyspark.ml.feature import StringIndexer, OneHotEncoder


In [None]:

from ucimlrepo import fetch_ucirepo

# Download the UCI "Dry Bean" dataset (repository id 602).
dry_bean = fetch_ucirepo(id=602)

# Feature table and target column, delivered as pandas DataFrames.
X, y = dry_bean.data.features, dry_bean.data.targets

# Print the dataset metadata first, then the per-variable information.
for _info in (dry_bean.metadata, dry_bean.variables):
    print(_info)

# Alternative dataset used previously: fetch_ucirepo(id=350), bound to
# `default_of_credit_card_clients`, with X / y read from its .data.features / .data.targets
# and its .metadata / .variables printed in the same way.



{'uci_id': 602, 'name': 'Dry Bean', 'repository_url': 'https://archive.ics.uci.edu/dataset/602/dry+bean+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/602/data.csv', 'abstract': 'Images of 13,611 grains of 7 different registered dry beans were taken with a high-resolution camera. A total of 16 features; 12 dimensions and 4 shape forms, were obtained from the grains.', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 13611, 'num_features': 16, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Thu Mar 28 2024', 'dataset_doi': '10.24432/C50S4B', 'creators': [], 'intro_paper': {'title': 'Multiclass classification of dry beans using computer vision and machine learning techniques', 'authors': 'M. Koklu, Ilker Ali Özkan', 'published_in': 'Computers and Electronic

In [None]:
# Mount Google Drive at /content/drive (Colab only) so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the project folder on the mounted Drive.
os.chdir("/content/drive/My Drive/5003")

In [None]:
# Destination of the combined features + target table on the mounted Drive.
csv_file_path = '/content/drive/My Drive/5003/dataset_default.csv'

# Place the target column(s) to the right of the features and write the result without the index.
df = pd.concat((X, y), axis="columns")
df.to_csv(csv_file_path, index=False)

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark configuration: application name and a local master with four threads.
appname, master = "RandomForestClassifier", "local[4]"
conf = SparkConf().setMaster(master).setAppName(appname)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Load the exported CSV: the first row holds column names and column types are inferred.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_file_path)
)

In [None]:

def _to_labeled_row(values):
    """Build Row(label=<last value>, features=<dense vector of all preceding values>) from one record."""
    return Row(label=values[-1], features=Vectors.dense(values[:-1]))


# Apply na.fill('0') and expose every record as a plain Python list.
dataSet = data.na.fill('0').rdd.map(list)

# Seeded 70/30 split into training and test records.
trainData, testData = dataSet.randomSplit([0.7, 0.3], seed=7)

# Training DataFrame with `label` and `features` columns.
trainingSet = trainData.map(_to_labeled_row).toDF()
train_num = trainingSet.count()
print("训练样本数:{}".format(train_num))

训练样本数:9538


In [None]:
import time

In [None]:
# Learn the label -> numeric index mapping from the training set and add it as column `indexed`.
stringIndexer = StringIndexer(outputCol="indexed", inputCol="label")
si_model = stringIndexer.fit(trainingSet)
train_tf = si_model.transform(trainingSet)
train_tf.show(5)

# Random-forest hyperparameters, gathered in one place.
rf_params = dict(
    numTrees=100,
    maxDepth=10,
    maxBins=32,
    featureSubsetStrategy="auto",
    labelCol="indexed",
    seed=7,
)

# The wall-clock measurement covers estimator construction and fitting.
start_time_rf = time.time()
rf = RandomForestClassifier(**rf_params)
rfModel = rf.fit(train_tf)
end_time_rf = time.time()
duration_rf = end_time_rf - start_time_rf


+-----+--------------------+-------+
|label|            features|indexed|
+-----+--------------------+-------+
|SEKER|[28395.0,610.291,...|    2.0|
|SEKER|[28734.0,638.018,...|    2.0|
|SEKER|[29380.0,624.11,2...|    2.0|
|SEKER|[30008.0,645.884,...|    2.0|
|SEKER|[30140.0,620.134,...|    2.0|
+-----+--------------------+-------+


In [None]:
# Inspect the fitted forest: per-feature importances, then the number of input features.
print("模型特征重要性:{}".format(rfModel.featureImportances))
print("模型特征数:{}".format(rfModel.numFeatures))


模型特征重要性:(16,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],[0.06793265298414411,0.09129955230251494,0.07089378718392163,0.09230719115407454,0.05160536654564456,0.07909676360922413,0.07738561228464973,0.04814301626654028,0.007882199239551153,0.013586361743283499,0.053964295918658305,0.10627326802447228,0.0761770380607167,0.04665455821823015,0.09080602499718646,0.02599231146718751])
模型特征数:16


In [None]:
# Build the test DataFrame with the same (label, features) layout as the training set.
testSet = testData.map(lambda x:Row(label=x[-1], features=Vectors.dense(x[:-1]))).toDF()
test_num=testSet.count()
print("测试样本数:{}".format(test_num))

# Reuse the indexer fitted on the training set (`si_model`). Fitting a fresh StringIndexer on the
# test set could assign different indices to the same class names (the default ordering depends
# on label frequency), which would silently misalign `indexed` with the model's predictions.
test_tf = si_model.transform(testSet)
predictResult = rfModel.transform(test_tf)
predictResult.show(5)
# The Spark session is intentionally left running; later cells still use it.

测试样本数:4073
+-----+--------------------+-------+--------------------+--------------------+----------+
|label|            features|indexed|       rawPrediction|         probability|prediction|
+-----+--------------------+-------+--------------------+--------------------+----------+
|SEKER|[30477.0,670.033,...|    2.0|[1.00702201791994...|[0.01007022017919...|       2.0|
|SEKER|[31675.0,657.431,...|    2.0|[99.5792117027196...|[0.99579211702719...|       0.0|
|SEKER|[31811.0,642.092,...|    2.0|[2.00407280327609...|[0.02004072803276...|       2.0|
|SEKER|[31823.0,662.532,...|    2.0|[3.33529990167158...|[0.03335299901671...|       2.0|
|SEKER|[31992.0,640.338,...|    2.0|[0.02986068388708...|[2.98606838870856...|       2.0|
+-----+--------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows


In [None]:
# Remember the column names, pull `test_num` rows to the driver, and rebuild them as a pandas frame.
columns = predictResult.columns
collected_rows = predictResult.take(test_num)
predictResult = pd.DataFrame(collected_rows, columns=columns)


In [None]:
# True label indices, predicted label indices, and element 1 of each probability vector.
y = predictResult['indexed'].tolist()
y_pred = predictResult['prediction'].tolist()
y_predprob = [prob[1] for prob in predictResult['probability'].tolist()]

# Precision, recall and F1 share the same weighted averaging; accuracy takes no averaging option.
weighted = {'average': 'weighted'}
precision_score = metrics.precision_score(y, y_pred, **weighted)
recall_score = metrics.recall_score(y, y_pred, **weighted)
accuracy_score = metrics.accuracy_score(y, y_pred)
f1_score = metrics.f1_score(y, y_pred, **weighted)
# auc_score = metrics.roc_auc_score(y, y_predprob)



In [None]:
# Count how many test rows carry each label index (`y` holds predictResult['indexed']).
print("标签分布:", pd.Series(y).value_counts())


标签分布: 0.0    1058
1.0     807
2.0     595
3.0     560
4.0     486
5.0     388
6.0     179
Name: count, dtype: int64


In [None]:
# Random-forest evaluation summary: header, one line per metric, then the training time.
print("随机森林模型性能评估：")
for caption, score in (
    ("精确率:", precision_score),
    ("召回率:", recall_score),
    ("准确率:", accuracy_score),
    ("F1分数:", f1_score),
):
    print(caption, score)
print(f"随机森林训练时间：{duration_rf:.3f}秒")
#print("auc分数:",auc_score )

随机森林模型性能评估：
精确率: 0.9247320257580561
召回率: 0.924380063835011
准确率: 0.924380063835011
F1分数: 0.9244992615580041
随机森林训练时间：25.662秒


# Decision Tree

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Fit the label indexer on the training set and add the numeric `indexed` column.
stringIndexer = StringIndexer(outputCol="indexed", inputCol="label")
si_model = stringIndexer.fit(trainingSet)
train_tf = si_model.transform(trainingSet)
train_tf.show(5)

# Time estimator construction plus fitting of a seeded decision tree.
start_time_dt = time.time()
dt = DecisionTreeClassifier(seed=7, labelCol="indexed")
dtModel = dt.fit(train_tf)
end_time_dt = time.time()
duration_dt = end_time_dt - start_time_dt

# Report the fitted tree's depth and node count.
for template, stat in (
    ("决策树模型的树深度: {}", dtModel.depth),
    ("决策树模型的节点数: {}", dtModel.numNodes),
):
    print(template.format(stat))

+-----+--------------------+-------+
|label|            features|indexed|
+-----+--------------------+-------+
|SEKER|[28395.0,610.291,...|    2.0|
|SEKER|[28734.0,638.018,...|    2.0|
|SEKER|[29380.0,624.11,2...|    2.0|
|SEKER|[30008.0,645.884,...|    2.0|
|SEKER|[30140.0,620.134,...|    2.0|
+-----+--------------------+-------+
only showing top 5 rows

决策树模型的树深度: 5
决策树模型的节点数: 47


In [None]:
# Index the test labels with the current `si_model`, then score the test set with the tree.
test_tf = si_model.transform(testSet)
predictResult_dt = dtModel.transform(test_tf)
predictResult_dt.show(5)

# Bring only the two columns needed for scoring back to the driver.
predictResult_list_dt = predictResult_dt.select("indexed", "prediction").collect()
y_true = [truth for truth, _ in predictResult_list_dt]
y_pred = [guess for _, guess in predictResult_list_dt]

# Precision, recall and F1 use weighted averaging; accuracy takes no averaging option.
weighted = {'average': 'weighted'}
precision_score_dt = metrics.precision_score(y_true, y_pred, **weighted)
recall_score_dt = metrics.recall_score(y_true, y_pred, **weighted)
accuracy_score_dt = metrics.accuracy_score(y_true, y_pred)
f1_score_dt = metrics.f1_score(y_true, y_pred, **weighted)

# Decision-tree evaluation summary: header, one line per metric, then the training time.
print("决策树模型性能评估：")
for caption, score in (
    ("精确率:", precision_score_dt),
    ("召回率:", recall_score_dt),
    ("准确率:", accuracy_score_dt),
    ("F1分数:", f1_score_dt),
):
    print(caption, score)
print(f"决策树训练时间：{duration_dt:.3f}秒")

+-----+--------------------+-------+--------------------+--------------------+----------+
|label|            features|indexed|       rawPrediction|         probability|prediction|
+-----+--------------------+-------+--------------------+--------------------+----------+
|SEKER|[30477.0,670.033,...|    2.0|[0.0,0.0,37.0,0.0...|[0.0,0.0,1.0,0.0,...|       2.0|
|SEKER|[31675.0,657.431,...|    2.0|[1846.0,22.0,16.0...|[0.97827239003709...|       0.0|
|SEKER|[31811.0,642.092,...|    2.0|[1846.0,22.0,16.0...|[0.97827239003709...|       0.0|
|SEKER|[31823.0,662.532,...|    2.0|[1846.0,22.0,16.0...|[0.97827239003709...|       0.0|
|SEKER|[31992.0,640.338,...|    2.0|[16.0,8.0,1190.0,...|[0.01313628899835...|       2.0|
+-----+--------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows

决策树模型性能评估：
精确率: 0.8991869757222919
召回率: 0.8966363859562976
准确率: 0.8966363859562976
F1分数: 0.8970322088635054
决策树训练时间：6.474秒
