In [55]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import os

spark = SparkSession.builder.appName('heart_detection').getOrCreate()
df = spark.read.csv('heartrate_seconds_merged_3.12.16-4.11.16.csv', header=True, inferSchema=True)


In [56]:
from pyspark.sql.functions import to_date, to_timestamp
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [57]:
df = df.withColumn("datetime_column", to_timestamp(df["Time"], "MM/dd/yyyy hh:mm:ss a"))

In [58]:
df = df.select(
    'Id',
    'datetime_column',
    'Value'
)
df = df.withColumnRenamed("Value", "Heartrate")

In [59]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Tính toán sự thay đổi giữa các giá trị nhịp tim liên tiếp
windowSpec = Window.partitionBy("Id").orderBy("datetime_column")
df = df.withColumn("prev_heartrate", F.lag("Heartrate", 1).over(windowSpec))
df = df.withColumn("heartrate_change", F.col("Heartrate") - F.col("prev_heartrate"))

In [61]:
df = df.withColumn(
    "label", 
    F.when(
        F.abs(F.col("heartrate_change")) > 10, 0.5
    ).when(
        (F.col("Heartrate") > 100) | (F.col("Heartrate") < 60), 0
    ).otherwise(1)
)


In [62]:
df.show(5)

+----------+-------------------+---------+--------------+----------------+-----+
|        Id|    datetime_column|Heartrate|prev_heartrate|heartrate_change|label|
+----------+-------------------+---------+--------------+----------------+-----+
|2026352035|2016-04-02 00:08:20|       62|          NULL|            NULL|  1.0|
|2026352035|2016-04-02 00:08:35|       62|            62|               0|  1.0|
|2026352035|2016-04-02 00:08:50|       61|            62|              -1|  1.0|
|2026352035|2016-04-02 00:09:00|       62|            61|               1|  1.0|
|2026352035|2016-04-02 00:09:15|       62|            62|               0|  1.0|
+----------+-------------------+---------+--------------+----------------+-----+
only showing top 5 rows



In [63]:
from pyspark.sql.functions import col, to_timestamp, collect_list, avg, when, lit

# Chuyển đổi cột 'datetime_column' thành kiểu timestamp thay vì chỉ date
grouped_df = df.withColumn('timestamp', to_timestamp(col('datetime_column'))) \
    .withColumn('date', col('timestamp').cast('date'))  # Tạo cột 'date' chỉ chứa ngày

# Gom nhóm theo ngày
grouped_df = grouped_df.groupBy('Id','date').agg(
    collect_list("Heartrate").alias("heartrate_list"),
    collect_list("prev_heartrate").alias("prev_heartrate_list"),
    collect_list("heartrate_change").alias("heartrate_change_list"),
    avg("label").alias("mean_label")  # Tính trung bình trên cột label
)

# Tạo cột 'final_label' dựa trên giá trị trung bình của 'label'
grouped_df = grouped_df.withColumn(
    "final_label", 
    when(col("mean_label") > 0.8, lit(2))
        .when(col("mean_label") < 0.5, lit(0))
        .otherwise(lit(1))
)

In [64]:
# Bước 2: Hiển thị kết quả
grouped_df.select("Id","date", "heartrate_list", "final_label").orderBy('date').show()

+----------+----------+--------------------+-----------+
|        Id|      date|      heartrate_list|final_label|
+----------+----------+--------------------+-----------+
|2347167796|2016-03-29|[69, 68, 69, 69, ...|          2|
|6962181067|2016-03-30|[60, 60, 59, 59, ...|          2|
|2347167796|2016-03-30|[58, 59, 60, 60, ...|          1|
|6962181067|2016-03-31|[63, 62, 62, 63, ...|          1|
|2347167796|2016-03-31|[81, 79, 80, 79, ...|          2|
|4020332650|2016-04-01|[71, 71, 71, 71, ...|          1|
|5553957443|2016-04-01|[68, 66, 67, 66, ...|          2|
|2347167796|2016-04-01|[65, 65, 62, 65, ...|          2|
|4558609924|2016-04-01|[67, 70, 70, 68, ...|          2|
|5577150313|2016-04-01|[55, 55, 55, 57, ...|          0|
|8792009665|2016-04-01|[81, 81, 81, 81, ...|          2|
|2022484408|2016-04-01|[93, 91, 96, 98, ...|          1|
|8877689391|2016-04-01|[74, 70, 71, 79, ...|          1|
|6775888955|2016-04-01|[121, 120, 122, 1...|          0|
|6117666160|2016-04-01|[74, 74,

In [65]:
from pyspark.sql.functions import col, size, expr

# Tính overmin: Phần trăm giá trị nhỏ hơn 60
grouped_df = grouped_df.withColumn(
    "overmin",
    size(expr("filter(heartrate_list, x -> x < 60)")) / size(col("heartrate_list"))
)

# Tính overmax: Phần trăm giá trị lớn hơn 100
grouped_df = grouped_df.withColumn(
    "overmax",
    size(expr("filter(heartrate_list, x -> x > 100)")) / size(col("heartrate_list"))
)

# Tính overchange: Phần trăm giá trị lớn hơn 10 hoặc nhỏ hơn -10
grouped_df = grouped_df.withColumn(
    "overchange",
    size(expr("filter(heartrate_change_list, x -> x > 10 OR x < -10)")) / size(col("heartrate_change_list"))
)

grouped_df = grouped_df.select("overchange", "overmax", "overmin", "final_label")
grouped_df.show(5)


+--------------------+-------------------+--------------------+-----------+
|          overchange|            overmax|             overmin|final_label|
+--------------------+-------------------+--------------------+-----------+
|                 0.0|                0.0| 0.04100227790432802|          2|
| 0.00195031820981318|0.12460227855896541|0.014574566355332033|          2|
|0.002457002457002457|                0.0| 0.19926289926289925|          1|
|0.002334267040149393|                0.0| 0.42250233426704015|          1|
|0.001195652173913...| 0.0932608695652174| 0.19956521739130434|          1|
+--------------------+-------------------+--------------------+-----------+
only showing top 5 rows



In [66]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

assembler = VectorAssembler(inputCols=["overchange", "overmax", "overmin"], outputCol="features")
grouped_df = assembler.transform(grouped_df)

In [67]:
# 3. Khởi tạo mô hình RandomForest
rf = RandomForestClassifier(featuresCol="features", labelCol="final_label", numTrees=50)
# lr = LogisticRegression(featuresCol="features", labelCol="final_label")
# Huấn luyện mô hình
model = rf.fit(grouped_df)
# model = lr.fit(grouped_df_sampled)

In [68]:
# 4. Dự đoán trên dữ liệu kiểm tra (giả sử test_data có cùng cấu trúc với grouped_df)
predictions = model.transform(grouped_df)

# 5. Đánh giá mô hình
evaluator = MulticlassClassificationEvaluator(labelCol="final_label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.986013986013986
