<center>
<h1 style="font-weight:bold;color:#a64d79">Student mental health:<br>Stress, factors and coping strategies</h1>
<p>Hadoop Project | Group 11 - 3IABD2</p>
</center>

<h2 style="font-weight:bold;color:#c27ba0">Objectives</h2>

<p>This project aims to identify the causes of stress and coping strategies through the analysis of large-scale data, using Big Data tools and machine learning techniques.</p>

<hr style="border: none; height: 0.5px; background-color: #aaa;" />

In [106]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, stddev, min, max, col, when, count
from pyspark.sql.types import IntegerType

<h2 style="font-weight:bold;color:#c27ba0">Dataframe creation</h2>

In [107]:
# Create the Spark session for this analysis, or reuse the one that is already running.
spark = (
    SparkSession.builder
    .appName("SMHAnalysis")
    .getOrCreate()
)

In [108]:
# Load the two CSV files from HDFS; the first line of each file is used as the
# header and Spark infers the column types.
df_sample = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://localhost:9000/data_input/dataset-1.csv")
)
df_esgi = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://localhost:9000/data_input/dataset-2.csv")
)

# Remove leading/trailing whitespace from the column names of both frames.
df_sample = df_sample.toDF(*map(str.strip, df_sample.columns))
df_esgi = df_esgi.toDF(*map(str.strip, df_esgi.columns))

# Stack the two datasets, matching columns by name rather than by position.
df_all = df_sample.unionByName(df_esgi)

# Show the first 5 rows of the combined DataFrame.
rule = "------------------------------------------------------------------------"
print(rule)
print("Preview of the first few rows :")
print(rule + "\n")
df_all.show(5)

------------------------------------------------------------------------
Preview of the first few rows :
------------------------------------------------------------------------

+-----------+---+------+--------------------------+--------------------+----------------------------------+--------------------------------+----------------------------------+--------------+----------------+-------------+-------------------+-------------------+---------------------+------------+------------------------+---------------------+----------------------------+-----------------+-------------+
| Student ID|Age|Gender|Academic Performance (GPA)|Study Hours Per Week|Social Media Usage (Hours per day)|Sleep Duration (Hours per night)|Physical Exercise (Hours per week)|Family Support|Financial Stress|Peer Pressure|Relationship Stress|Mental Stress Level|Counseling Attendance|Diet Quality|Stress Coping Mechanisms|Cognitive Distortions|Family Mental Health History|Medical Condition|Substance Use|
+----------

<h2 style="font-weight:bold;color:#c27ba0">Preprocessing and Exploratory Data Analysis</h2>

In [109]:
# Report the size of each DataFrame: row counts per source and for the union,
# then the number of columns of the combined frame.
rule = "------------------------------------------------------------------------"
print(rule)
print("Dimensions of dataframes :")
print(rule + "\n")

print("Rows :")
for label, frame in (
    ("Sample dataset", df_sample),
    ("ESGI dataset", df_esgi),
    ("TOTAL ROWS", df_all),
):
    # Each count() triggers a Spark job on the corresponding frame.
    print(f"- {label} : {frame.count()} rows")

print("\nColumns : ")
column_total = len(df_all.columns)
print(f"- TOTAL COLUMNS : {column_total} columns")

------------------------------------------------------------------------
Dimensions of dataframes :
------------------------------------------------------------------------

Rows :
- Sample dataset : 760 rows
- ESGI dataset : 174 rows
- TOTAL ROWS : 934 rows

Columns : 
- TOTAL COLUMNS : 20 columns


In [110]:
# Columns details: name, Spark type, number of null and non-null values.
rule = "-----------------------------------------------------------------------------------"
print(rule)
print("Columns details :")
print(rule + "\n")

# The TYPE header uses the same width (15) as the dtype cell printed in each row,
# so the header stays aligned with the values underneath it.
print(f"{'NAME':<40} {'TYPE':<15} {'NULL VALUES':<15} {'NON-NULL VALUES':<10}")

# A single aggregation returns the total row count followed by one null count
# per column, instead of launching two filter+count jobs for every column.
column_stats = df_all.select(
    count("*"),
    *[count(when(col(name).isNull(), 1)) for name in df_all.columns]
).first()
total_rows, null_counts = column_stats[0], column_stats[1:]

for (column, dtype), null_values in zip(df_all.dtypes, null_counts):
    non_null_values = total_rows - null_values
    print(f"{column:<40} {dtype:<15} {null_values:<15} {non_null_values:<10}")

-----------------------------------------------------------------------------------
Columns details :
-----------------------------------------------------------------------------------

NAME                                     TYPE       NULL VALUES     NON-NULL VALUES
Student ID                               string          0               934       
Age                                      int             0               934       
Gender                                   string          0               934       
Academic Performance (GPA)               int             0               934       
Study Hours Per Week                     int             0               934       
Social Media Usage (Hours per day)       int             0               934       
Sleep Duration (Hours per night)         int             0               934       
Physical Exercise (Hours per week)       int             0               934       
Family Support                           int             

In [111]:

# Remove the student identifier column from the three DataFrames.
# The name is spelled exactly as in the schema ("Student ID"): DataFrame.drop is a
# no-op when the name does not resolve, so the previous spelling "Student Id" only
# worked while Spark resolved column names case-insensitively.
ID_COLUMN = "Student ID"
df_sample = df_sample.drop(ID_COLUMN)
df_esgi = df_esgi.drop(ID_COLUMN)
df_all = df_all.drop(ID_COLUMN)

In [115]:

# Converting string columns to numeric types 

# Columns with YES/NO values
def yes_no_to_int(df, columns):
    """Encode "Yes"/"No" columns of a Spark DataFrame as integers.

    Args:
        df: DataFrame holding the columns to encode.
        columns: names of the columns to process.

    Returns:
        The DataFrame in which every listed column that was not already of
        IntegerType has been replaced by an integer column: 1 where the value
        equals "Yes", 0 where it equals "No". No fallback branch is defined,
        so any other value ends up as null.
    """
    for name in columns:
        # Already-integer columns are skipped, which makes a second call harmless.
        if type(df.schema[name].dataType) == IntegerType:
            continue
        encoded = when(col(name) == "Yes", 1).when(col(name) == "No", 0)
        df = df.withColumn(name, encoded.cast(IntegerType()))
    return df


# Columns with other values
def other_value_to_int(df, columns):
    """Encode gender-like string columns of a Spark DataFrame as integers.

    The encoding follows the convention documented in this notebook:
    "Male" -> 0, "Female" -> 1, anything else -> 2. The previous implementation
    assigned Male -> 1 and Female -> 0, contradicting that documented convention.

    Args:
        df: DataFrame holding the columns to encode.
        columns: names of the columns to process.

    Returns:
        The DataFrame in which every listed column that was not already of
        IntegerType has been replaced by its integer code. Because of the
        fallback branch, null values are also encoded as 2.
    """
    for column in columns:
        # Already-integer columns are skipped, so re-running the cell does not
        # re-encode (and thereby destroy) a column converted earlier.
        if not isinstance(df.schema[column].dataType, IntegerType):
            df = df.withColumn(
                column,
                when(col(column) == "Male", 0)
                .when(col(column) == "Female", 1)
                .otherwise(2)
                .cast(IntegerType())
            )
    return df


# Yes/No columns, encoded as YES(1) / NO(0):
# Counseling Attendance, Family Mental Health History, Medical Condition
string_columns = ["Counseling Attendance", "Family Mental Health History", "Medical Condition"]

# Apply the Yes/No encoding to the two source frames and to their union.
df_sample, df_esgi, df_all = (
    yes_no_to_int(frame, string_columns) for frame in (df_sample, df_esgi, df_all)
)

# Gender is replaced by the integer codes assigned in other_value_to_int.
df_sample, df_esgi, df_all = (
    other_value_to_int(frame, ["Gender"]) for frame in (df_sample, df_esgi, df_all)
)

# Preview the first five rows after conversion.
df_all.head(5)


[Row(Age=22, Gender=0, Academic Performance (GPA)=2, Study Hours Per Week=9, Social Media Usage (Hours per day)=2, Sleep Duration (Hours per night)=12, Physical Exercise (Hours per week)=2, Family Support=1, Financial Stress=1, Peer Pressure=3, Relationship Stress=5, Mental Stress Level=9, Counseling Attendance=0, Diet Quality=1, Stress Coping Mechanisms='Walking or Nature Walks', Cognitive Distortions=4, Family Mental Health History=0, Medical Condition=1, Substance Use=1),
 Row(Age=25, Gender=0, Academic Performance (GPA)=0, Study Hours Per Week=28, Social Media Usage (Hours per day)=0, Sleep Duration (Hours per night)=6, Physical Exercise (Hours per week)=0, Family Support=1, Financial Stress=1, Peer Pressure=1, Relationship Stress=2, Mental Stress Level=9, Counseling Attendance=1, Diet Quality=3, Stress Coping Mechanisms='Meditation', Cognitive Distortions=2, Family Mental Health History=1, Medical Condition=0, Substance Use=1),
 Row(Age=24, Gender=0, Academic Performance (GPA)=0, 

In [None]:

# Converting the remaining string columns to numeric types.
#
# df_all.dtypes yields (name, type-name) pairs, so the type can be compared with
# the plain string "string". The previous test compared a DataType object with a
# str, which never matched, so no column was ever converted.
#
# A column is cast only when every non-null value parses as a number; otherwise
# the cast would silently turn its text values into nulls.
for column, dtype in df_all.dtypes:
    if dtype != "string":
        continue
    as_double = col(column).cast("double")
    try:
        # Run the check inside the try block: Spark evaluates lazily, so a cast
        # failure can only surface when an action such as count() executes.
        has_unparsable_value = (
            df_all
            .filter(col(column).isNotNull() & as_double.isNull())
            .limit(1)
            .count()
        ) > 0
    except Exception as e:
        print(f"Error converting column {column}: {e}")
        continue
    if has_unparsable_value:
        print(f"Skipping column {column}: it contains non-numeric values")
    else:
        df_all = df_all.withColumn(column, as_double)


# Encoding notes:
# - Gender                        integer codes assigned by other_value_to_int
# - Counseling Attendance         yes(1), no(0)
# - Family Mental Health History  yes(1), no(0)
# - Medical Condition             yes(1), no(0)
# - Stress Coping Mechanisms      TODO: not encoded yet; planned codes are
#                                 exercise(0), meditation(1), therapy(2), none(3)
#                                 (list all distinct values first)

print(f"\nColumns : ")
print(f"- TOTAL COLUMNS : {len(df_all.columns)} columns")

In [None]:

# Column names used repeatedly in this cell.
stress = "Mental Stress Level"
sleep = "Sleep Duration (Hours per night)"
social = "Social Media Usage (Hours per day)"
gpa = "Academic Performance (GPA)"

# Mean, standard deviation, min and max of the stress score, plus the means of
# sleep duration, social media usage and GPA, in a single one-row table.
summary = df_all.select(
    avg(stress).alias("Moyenne Stress"),
    stddev(stress).alias("Écart-type Stress"),
    min(stress).alias("Min Stress"),
    max(stress).alias("Max Stress"),
    avg(sleep).alias("Moyenne Sommeil"),
    avg(social).alias("Moyenne Réseaux"),
    avg(gpa).alias("Moyenne GPA"),
)
summary.show()

# Number of rows per gender value.
by_gender = df_all.groupBy("Gender")
by_gender.count().show()

# Pairwise correlations computed by Spark (DataFrame.stat.corr).
for label, left, right in (
    ("Corrélation Stress ↔ Sommeil :", stress, sleep),
    ("Corrélation Stress ↔ Réseaux Sociaux :", stress, social),
    ("Corrélation GPA ↔ Sommeil :", gpa, sleep),
):
    print(label, df_all.stat.corr(left, right))

# Average stress score per gender value.
by_gender.agg(avg(stress).alias("Stress moyen")).show()

+-----------------+------------------+----------+----------+-----------------+------------------+-----------------+
|   Moyenne Stress| Écart-type Stress|Min Stress|Max Stress|  Moyenne Sommeil|   Moyenne Réseaux|      Moyenne GPA|
+-----------------+------------------+----------+----------+-----------------+------------------+-----------------+
|5.535331905781584|2.8254573176226927|         1|        10|8.028907922912206|4.1713062098501075|2.032119914346895|
+-----------------+------------------+----------+----------+-----------------+------------------+-----------------+

+-----------+-----+
|     Gender|count|
+-----------+-----+
|Genderqueer|    7|
|    Agender|   14|
|     Female|  407|
| Polygender|   12|
|   Bigender|   16|
| Non-binary|   10|
|       Male|  430|
|Genderfluid|   12|
|      Autre|   26|
+-----------+-----+

Corrélation Stress ↔ Sommeil : 0.029504724894151107
Corrélation Stress ↔ Réseaux Sociaux : 0.01628495726134644
Corrélation GPA ↔ Sommeil : -0.0116906639137663

In [None]:
# Collect the combined Spark DataFrame into pandas for the plotting cells below.
# The original referenced `df`, a name that is never defined in this notebook
# (NameError on a fresh kernel); df_all is the combined frame analysed above.
# NOTE(review): df_all has been rewritten by the conversion cell above (Gender and
# the Yes/No columns are integer codes there) — confirm that the pandas cells are
# meant to work on the encoded values rather than on the raw labels.
pdf = df_all.toPandas()


In [None]:
# Print the output of DataFrame.describe() for the pandas frame.
print("Statistiques descriptives des variables numériques")
numeric_summary = pdf.describe()
print(numeric_summary)

In [None]:
# For every object-typed (text) column: number of distinct values and the first
# five of them.
print("Valeurs uniques par colonne catégorielle")
cat_cols = pdf.select_dtypes(include="object").columns

# The loop variable is deliberately not called `col`: that name is the function
# imported from pyspark.sql.functions, and rebinding it here would break every
# Spark cell that is (re-)run after this one.
for column in cat_cols:
    print(f"- {column} ({pdf[column].nunique()} valeurs uniques) : {pdf[column].unique()[:5]}")

In [None]:
# Frequency of each stress score, listed in ascending score order.
print("Distribution de 'Mental Stress Level'")
stress_counts = pdf["Mental Stress Level"].value_counts()
print(stress_counts.sort_index())

In [None]:
# Bar chart: number of students for each mental stress score.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=pdf, x="Mental Stress Level", palette="coolwarm", ax=ax)
ax.set_title("Distribution du niveau de stress mental")
ax.set_xlabel("Score de stress (1 à 10)")
ax.set_ylabel("Nombre d'étudiants")
fig.tight_layout()
plt.show()

In [None]:
# Bar chart: number of rows per gender value, most frequent value first.
gender_order = pdf["Gender"].value_counts().index
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=pdf, x="Gender", order=gender_order, ax=ax)
ax.set_title("Répartition des genres")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Scatter plot of family support against stress, with a fitted regression line
# drawn on the same axes, followed by the correlation of the two columns.
x_col, y_col = "Family Support", "Mental Stress Level"

fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=pdf, x=x_col, y=y_col, color="mediumseagreen", ax=ax)
sns.regplot(data=pdf, x=x_col, y=y_col, scatter=False, color="darkgreen", ax=ax)
ax.set_title("Soutien familial vs Stress mental")
ax.set_xlabel("Soutien familial (1 = faible, 5 = fort)")
ax.set_ylabel("Niveau de stress mental")
fig.tight_layout()
plt.show()

correlation = pdf[x_col].corr(pdf[y_col])
print(f"Corrélation entre le soutien familial et le stress : {correlation:.2f}")

In [None]:
# Scatter plot of sleep duration against GPA, with a fitted regression line
# drawn on the same axes, followed by the correlation of the two columns.
x_col = "Sleep Duration (Hours per night)"
y_col = "Academic Performance (GPA)"

fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=pdf, x=x_col, y=y_col, color="mediumseagreen", ax=ax)
sns.regplot(data=pdf, x=x_col, y=y_col, scatter=False, color="darkgreen", ax=ax)
ax.set_title("Sommeil vs Performance académique (GPA)")
ax.set_xlabel("Durée de sommeil (heures/nuit)")
ax.set_ylabel("GPA (Performance académique)")
fig.tight_layout()
plt.show()

correlation = pdf[x_col].corr(pdf[y_col])
print(f"Corrélation Sommeil ↔ GPA : {correlation:.2f}")

In [None]:
# Scatter plot of weekly physical exercise against stress, with a fitted
# regression line on the same axes, followed by the correlation of the two columns.
x_col = "Physical Exercise (Hours per week)"
y_col = "Mental Stress Level"

fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=pdf, x=x_col, y=y_col, color="mediumseagreen", ax=ax)
sns.regplot(data=pdf, x=x_col, y=y_col, scatter=False, color="darkgreen", ax=ax)
ax.set_title("Activité physique vs Stress mental")
ax.set_xlabel("Activité physique (heures/semaine)")
ax.set_ylabel("Niveau de stress mental")
fig.tight_layout()
plt.show()

correlation = pdf[x_col].corr(pdf[y_col])
print(f"Corrélation Activité physique ↔ Stress mental : {correlation:.2f}")


In [None]:
# Scatter plot of daily social media usage against stress, with a fitted
# regression line on the same axes, followed by the correlation of the two columns.
x_col = "Social Media Usage (Hours per day)"
y_col = "Mental Stress Level"

fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=pdf, x=x_col, y=y_col, color="mediumseagreen", ax=ax)
sns.regplot(data=pdf, x=x_col, y=y_col, scatter=False, color="darkgreen", ax=ax)
ax.set_title("Temps passé sur les réseaux sociaux vs Stress mental")
ax.set_xlabel("Usage des réseaux (heures/jour)")
ax.set_ylabel("Niveau de stress mental")
fig.tight_layout()
plt.show()

correlation = pdf[x_col].corr(pdf[y_col])
print(f"Corrélation Réseaux Sociaux ↔ Stress mental : {correlation:.2f}")

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
# correlation_matrix is kept as a notebook-level variable: a later cell reads it.
correlation_matrix = pdf.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    ax=ax,
)
ax.set_title("Matrice de corrélation entre variables")
fig.tight_layout()
plt.show()

In [None]:
# Correlation of every numeric variable with the stress score (the score's own
# entry is removed), drawn as a horizontal bar chart sorted in ascending order.
cor_stress = correlation_matrix["Mental Stress Level"].drop("Mental Stress Level")

fig, ax = plt.subplots(figsize=(8, 5))
cor_stress.sort_values().plot.barh(ax=ax, color="mediumseagreen")
ax.set_title("Corrélation avec le stress mental")
ax.set_xlabel("Coefficient de corrélation")
fig.tight_layout()
plt.show()

In [None]:
# Contingency table: coping strategy (rows) by gender (columns).
coping_table = pd.crosstab(pdf["Stress Coping Mechanisms"], pdf["Gender"])
print(coping_table)

# Grouped bar chart comparing the "Female" and "Male" columns of that table.
# NOTE(review): this assumes pdf["Gender"] still holds the labels "Female" and
# "Male" — confirm, since the conversion cell above rewrites Gender in df_all as
# integer codes.
strategies = coping_table.index.tolist()
x = np.arange(len(strategies))
width = 0.35  # bar width; each pair of bars is centred on its tick

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(x - width/2, coping_table["Female"], width, label='Femmes', color='mediumseagreen')
ax.bar(x + width/2, coping_table["Male"], width, label='Hommes', color='darkgreen')

ax.set_xticks(x)
ax.set_xticklabels(strategies, rotation=45, ha='right')
ax.set_title("Stratégies de coping par genre")
ax.set_xlabel("Stratégie de coping")
ax.set_ylabel("Nombre de participants")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# NOTE(review): this cell repeats the earlier "Sommeil vs Performance académique"
# analysis (same columns, same figure); here the correlation is printed before
# the plot. Consider keeping only one of the two cells.
x_col = "Sleep Duration (Hours per night)"
y_col = "Academic Performance (GPA)"

correlation = pdf[x_col].corr(pdf[y_col])
print(f"Corrélation Sommeil ↔ GPA : {correlation:.2f}")

# Scatter plot with a fitted regression line drawn on the same axes.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=pdf, x=x_col, y=y_col, color="mediumseagreen", ax=ax)
sns.regplot(data=pdf, x=x_col, y=y_col, scatter=False, color="darkgreen", ax=ax)
ax.set_title("Sommeil vs Performance académique (GPA)")
ax.set_xlabel("Durée de sommeil (heures/nuit)")
ax.set_ylabel("GPA (Performance académique)")
fig.tight_layout()
plt.show()