In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import time

In [3]:
# Crea una sessione Spark
spark = SparkSession.builder.appName("StudentDepressionRandomForest").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/22 15:40:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Carica il dataset da HDFS (assumendo che sia stato caricato correttamente)
df = spark.read.csv("hdfs://instance-20250422-084356:9000/user/hadoop/EDA_Student_Depression_Dataset.csv", header=True, inferSchema=True)

25/04/22 15:40:51 WARN FileSystem: "instance-20250422-084356:9000" is a deprecated filesystem name. Use "hdfs://instance-20250422-084356:9000/" instead.
                                                                                

In [5]:
# Visualizza i dati per confermare il caricamento
print(df.head())

Row(Gender=1, Age=33.0, Academic_Pressure=5.0, CGPA=8.97, Study_Satisfaction=2.0, Sleep_Duration=0, Have_you_ever_had_suicidal_thoughts?=1, Work/Study_Hours=3.0, Financial_Stress=1.0, Family_History_of_Mental_Illness=0, Depression=1, Is_Metropolis=0, Dietary_Habits_Healthy=1, Dietary_Habits_Moderate=0, Dietary_Habits_Unhealthy=0, Degree_bachelor=1, Degree_high_school=0, Degree_masters_phd=0)


In [6]:
# Preprocessing: Creazione della feature "features" combinando tutte le colonne numeriche
columns = df.columns
columns.remove('Depression')  # Rimuoviamo la colonna target

In [7]:
# Creazione del VectorAssembler per combinare tutte le features in un'unica colonna
assembler = VectorAssembler(inputCols=columns, outputCol="features")
scaler = MinMaxScaler(inputCol="features", outputCol="features_scaled")

In [8]:
# Crea il modello di Random Forest
rf = RandomForestClassifier(labelCol="Depression", featuresCol="features_scaled", numTrees=10, maxDepth=5)

# Costruisci il pipeline
pipeline = Pipeline(stages=[assembler, scaler, rf])

# Splitta i dati in training e test (80% training, 20% test)
train_data, test_data = df.randomSplit([0.8, 0.2], seed=1234)

In [9]:
# Inizio della misurazione del tempo di addestramento
start_time = time.time()

# Addestra il modello
pipeline_model = pipeline.fit(train_data)

# Fine della misurazione del tempo di addestramento
end_time = time.time()

25/04/22 15:41:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [10]:
# Calcola e stampa il tempo di addestramento
training_time = end_time - start_time
print(f"Tempo di addestramento: {training_time:.2f} secondi")

Tempo di addestramento: 8.74 secondi


In [11]:
# Fai le predizioni sul set di test
predictions = pipeline_model.transform(test_data)

# Visualizza le prime 10 predizioni
predictions.select("prediction", "Depression").show(10)

+----------+----------+
|prediction|Depression|
+----------+----------+
|       1.0|         0|
|       0.0|         0|
|       0.0|         0|
|       0.0|         0|
|       1.0|         0|
|       0.0|         0|
|       1.0|         1|
|       0.0|         0|
|       0.0|         0|
|       0.0|         0|
+----------+----------+
only showing top 10 rows



In [12]:
# Valutazione del modello utilizzando l'AUC
evaluator = BinaryClassificationEvaluator(labelCol="Depression")
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc}")

# Calcolare altre metriche come l'accuratezza
t_test = predictions.select('Depression').toPandas()
t_hat = predictions.select('prediction').toPandas()
print(f"\nAccuracy score on the test set: {accuracy_score(t_test, t_hat)}")
print(f"\nClassification Report:\n{classification_report(t_test, t_hat)}")

AUC: 0.9039813955598632

Accuracy score on the test set: 0.8406504065040651

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.77      0.80      2264
           1       0.85      0.89      0.87      3271

    accuracy                           0.84      5535
   macro avg       0.84      0.83      0.83      5535
weighted avg       0.84      0.84      0.84      5535

