In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable
import scipy.stats as stats
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score
import warnings  
import math

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

# Apply seaborn's "whitegrid" style to the plots that follow.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

📌
Before diving into analysis, it’s essential to understand the structure, shape, and quality of the data.

In [27]:
# Load the dataset
import os

# Resolve the project root relative to the kernel's working directory
# (three levels up from where the notebook runs).
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../../'))

# Absolute path to the raw CSV inside the project tree.
csv_path = os.path.join(project_root, 'Data', 'raw', 'student_depression_dataset.csv')

# Read the file exactly once. A missing file is fatal for every later cell, so
# re-raise with the resolved path instead of printing a message and carrying on
# with `df` / `df_copy` undefined.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError as exc:
    raise FileNotFoundError(f"CSV not found! Check path: {csv_path}") from exc

# `df` is kept as the untouched raw frame; the cleaning steps operate on `df_copy`.
df_copy = df.copy()

In [None]:
df_copy.head()  # preview the first 5 rows of the working copy
    

In [29]:
# Dimensions of the working copy as (rows, columns)
df_copy.shape

(27901, 18)

In [None]:
# Column names, dtypes and non-null counts of the working copy
df_copy.info()

💬 Interpretation:

The dataset consists of various features such as Age, Gender, Academic Pressure, Study Satisfaction, Financial Stress, etc.

The goal is to prepare and clean these fields for meaningful analysis.

In [31]:
# Cast every text (object-dtype) column to the `category` dtype, leaving out
# 'Financial Stress' because it is parsed as a number just below.
text_columns = [
    name
    for name in df_copy.select_dtypes(include='object').columns
    if name != 'Financial Stress'
]
for name in text_columns:
    df_copy[name] = df_copy[name].astype('category')

# Parse 'Financial Stress' as numeric; entries that cannot be parsed become NaN
df_copy['Financial Stress'] = pd.to_numeric(df_copy['Financial Stress'], errors='coerce')


In [None]:
# Per column, count how many entries compare equal to 0
(df_copy == 0).sum() 

In [None]:
# Print the distinct values found in each column of the working copy
for column_name in df_copy.columns:
    distinct_values = df_copy[column_name].unique()
    print(f"{column_name}: {distinct_values}")

In [34]:
# Treat the catch-all 'Others' label as missing in these two columns.
for col in ('Sleep Duration', 'Dietary Habits'):
    df_copy[col] = df_copy[col].replace('Others', np.nan)

# No '?' -> NaN replacement is needed for 'Financial Stress': the earlier
# pd.to_numeric(..., errors='coerce') already turned every non-numeric entry
# into NaN, so the column is numeric and cannot contain the string '?'.

In [None]:
# Missing values per column in the working copy. The raw `df` would not show
# the NaNs introduced into `df_copy` by the coercion/replacement steps.
df_copy.isnull().sum()

In [36]:
# Ensure 'Financial Stress' is numeric (idempotent if it already is), then fill
# its missing values with the column median
df_copy['Financial Stress'] = pd.to_numeric(df_copy['Financial Stress'], errors='coerce')
df_copy['Financial Stress'] = df_copy['Financial Stress'].fillna(df_copy['Financial Stress'].median())


# Column lists by dtype. The text columns were cast to 'category' earlier, so
# 'category' has to be included here or categorical_col comes back empty.
categorical_col = df_copy.select_dtypes(include=['object', 'category']).columns.to_list()
numerical_col = df_copy.select_dtypes(include=['int64','float64']).columns.to_list()

# Excluding the 'id' column
numerical_col = [col for col in numerical_col if col not in ['id']]

In [37]:
# Encode the two Yes/No answers as 1/0.
# NOTE: not idempotent — running this cell a second time maps the 1/0 values
# (which are not keys of binary_map) to NaN.
binary_map = {'Yes': 1, 'No': 0}
for flag_col in ('Have you ever had suicidal thoughts ?', 'Family History of Mental Illness'):
    df_copy[flag_col] = df_copy[flag_col].map(binary_map)

In [None]:
# Re-check dtypes and non-null counts of the working copy
df_copy.info()

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric
# (categorical) columns alongside the numeric ones.
df_copy.describe(include='all')


In [None]:


def value_counts_pretty(series, column_name="Value"):
    """Build a PrettyTable of value frequencies for ``series``.

    Args:
        series: pandas Series whose values are tallied; missing values are
            counted too and shown as "NaN/Missing".
        column_name: Header used for the value column of the table.

    Returns:
        PrettyTable with the columns ``column_name``, "Count" and "Percentage",
        one row per distinct value in ``value_counts`` order (most frequent first).
    """
    # Tally once and derive the shares from that same result. Counts and
    # percentages then line up by position, so no label lookup is needed
    # (which also avoids indexing a Series by a NaN key).
    value_counts = series.value_counts(dropna=False)
    percentages = value_counts / value_counts.sum() * 100

    table = PrettyTable()
    table.field_names = [column_name, "Count", "Percentage"]
    table.align[column_name] = "c"
    table.align["Count"] = "r"
    table.align["Percentage"] = "r"

    for (value, count), pct in zip(value_counts.items(), percentages):
        display_value = "NaN/Missing" if pd.isna(value) else str(value)
        table.add_row([display_value, count, f"{pct:.2f}%"])
    return table

# Columns to tabulate. Each name appears once so no table is printed twice.
colsJob = [
    'Work Pressure', 'Job Satisfaction', 'Gender', 'Age', 'City', 'Profession',
    'Academic Pressure', 'CGPA', 'Sleep Duration', 'Degree', 'Dietary Habits',
    'Study Satisfaction', 'Have you ever had suicidal thoughts ?',
    'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness',
    'Depression',
]

for col in colsJob:
    print(f"\n Value counts for '{col}':")
    print(value_counts_pretty(df_copy[col], col))

In [None]:
def value_counts_pretty(series, column_name="Value", threshold_percentage=90):
    """Build a frequency table for ``series`` if one value dominates it.

    NOTE: this definition replaces any earlier ``value_counts_pretty`` in the
    kernel; pass ``threshold_percentage=None`` to get the unfiltered table.

    Args:
        series: pandas Series whose values are tallied; missing values are
            counted too and shown as "NaN/Missing".
        column_name: Header used for the value column of the table.
        threshold_percentage: Minimum share (in percent) that at least one
            value must reach for a table to be produced. ``None`` disables
            the check.

    Returns:
        PrettyTable with the columns ``column_name``, "Count" and "Percentage",
        or ``None`` when no value reaches ``threshold_percentage``.
    """
    # Tally once; shares are derived from the same result so both stay
    # aligned by position and no label (or NaN-key) lookup is required.
    value_counts = series.value_counts(dropna=False)
    percentages = value_counts / value_counts.sum() * 100

    # Bail out when no single value is dominant enough.
    if threshold_percentage is not None and not (percentages >= threshold_percentage).any():
        return None

    table = PrettyTable()
    table.field_names = [column_name, "Count", "Percentage"]
    table.align[column_name] = "c"
    table.align["Count"] = "r"
    table.align["Percentage"] = "r"

    for (value, count), pct in zip(value_counts.items(), percentages):
        display_value = "NaN/Missing" if pd.isna(value) else str(value)
        table.add_row([display_value, count, f"{pct:.2f}%"])
    return table

# Columns to screen for a single dominant value
colsJob = ['Work Pressure', 'Job Satisfaction','Gender' ,'Age' ,'City' ,'Profession' ,'Academic Pressure' ,
           'CGPA' ,'Sleep Duration' ,'Dietary Habits' ,'Study Satisfaction' ,'Have you ever had suicidal thoughts ?',
           'Work/Study Hours','Financial Stress','Family History of Mental Illness','Depression']

for col in colsJob:
    table_output = value_counts_pretty(df_copy[col], col)
    # value_counts_pretty signals "no value meets the threshold" with None;
    # test for that explicitly rather than relying on the table's truthiness.
    if table_output is not None:
        print(f"\n Value counts for '{col}':")
        print(table_output)

🔍 Key Insights 
1. Work Pressure
Observation: 99.99% report 0.0; very low variance.

2. Job Satisfaction
Observation: 99.97% report 0.0; very skewed.

3. Gender
Observation: 'Male' is the dominant category.


In [None]:

# Keep only the rows where every numerical feature is present
plot_data = df_copy[numerical_col].dropna()

# Grid geometry: two histograms per row
ncols = 2
nrows = math.ceil(len(numerical_col) / ncols)

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 5 * nrows))
axes = axes.flatten()

# The two fill colours alternate from panel to panel
colors = ['skyblue', 'salmon']

# One histogram (with KDE overlay) per numerical feature
for i, (ax, feature) in enumerate(zip(axes, numerical_col)):
    sns.histplot(data=plot_data, x=feature, bins=25, kde=True, ax=ax,
                 color=colors[i % len(colors)])
    ax.set_title(f'Distribution of {feature}', fontsize=14)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.grid(True)

# Remove the panels that received no feature
for spare_ax in axes[len(numerical_col):]:
    fig.delaxes(spare_ax)

# Overall title; the rect leaves headroom so it does not overlap the panels
plt.suptitle("Distribution of Numerical Features", fontsize=20)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


1. Work Pressure
Insight: Extremely right-skewed; almost all values are 0.0, with very few > 0.
Not likely useful in numeric form.

2. Job Satisfaction
Insight: Almost everyone reports 0.0, indicating extreme class imbalance.


In [None]:
# Columns currently stored with the 'category' dtype in the working copy
categorical_col = df_copy.select_dtypes(include='category').columns.tolist()

# Grid geometry: two count plots per row
n_cols = 2
n_rows = math.ceil(len(categorical_col) / n_cols)

plt.figure(figsize=(14, 5 * n_rows))

for i, col in enumerate(categorical_col, 1):
    plt.subplot(n_rows, n_cols, i)
    # Plot from df_copy — the frame the column list was derived from — so the
    # bars reflect the cleaned values rather than the raw `df`.
    sns.countplot(data=df_copy, x=col, color='skyblue')
    plt.title(f'{col} Distribution')
    plt.xticks(rotation=45)

# Lay the figure out once, after every panel has been drawn
plt.tight_layout()
plt.show()


1. Profession
Strong class imbalance → reduce to “Student” vs “Other”.
Or group rare professions into "Other" category.

2. Dietary Habits
Insight: Majority are moderate or unhealthy.
Might correlate with depression or stress levels.

3. Degree
Insight: Very high cardinality; many unique degree types have only small counts. Reduce to broader categories (e.g., “Undergraduate”, “Postgraduate”).


4. Suicidal Thoughts
Insight: About 65% said 'Yes' to having had suicidal thoughts.
Strong mental health indicator → important feature.

5. Family History of Mental Illness
Insight: Roughly balanced between Yes (1) and No (0).
Useful feature; keep as binary (0/1).


In [None]:
# Number of respondents per city
city_counts = df_copy['City'].value_counts()

# Wide canvas so that every city label fits on the x-axis
plt.figure(figsize=(18, 8))
city_ax = sns.barplot(x=city_counts.index, y=city_counts.values, palette='viridis')

city_ax.set_title("City-wise Distribution (All Cities)", fontsize=16)
city_ax.set_xlabel("City", fontsize=12)
city_ax.set_ylabel("Count", fontsize=12)
plt.xticks(rotation=90)  # vertical labels for readability
plt.tight_layout()
plt.show()

City Distribution Insights

High Cardinality: The dataset contains many unique city names, which can cause sparse data issues and potential overfitting in models if used directly.

Top Cities: Indore, Surat, Hyderabad, and Pune have the highest respondent counts, each exceeding 1000 entries.

Data Quality Issues: Several city entries appear to be errors or mislabeled values (e.g., 'Less Delhi', '3.0', 'City', 'M.Com'), indicating that data cleaning is needed.

In [None]:


# Box plot of each numerical feature split by the Depression label.
# Uses df_copy rather than the raw `df`: in the raw frame 'Financial Stress'
# is still unparsed text, and numerical_col was derived from df_copy.
for feature in numerical_col:
    if feature == 'Depression':
        continue  # the target plotted against itself carries no information
    plt.figure(figsize=(8, 5))
    sns.boxplot(x='Depression', y=feature, data=df_copy)
    plt.title(f'{feature} vs Depression')
    plt.xlabel('Depression Level')
    plt.ylabel(feature)
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:

# Distinct values of the Depression label (not used further in this cell)
depression_levels = df_copy['Depression'].unique()

# One histogram panel per Depression value for each numerical feature
for feature in numerical_col:
    if feature == 'Depression':
        continue  # faceting the target by itself shows nothing useful
    g = sns.displot(data=df_copy, x=feature, col='Depression', col_wrap=4, bins=10, kde=False, height=4, aspect=1.2)
    g.fig.suptitle(f'{feature} Histogram by Depression Level', y=1.05)
    plt.tight_layout()
    plt.show()


In [None]:


# Count plot of each categorical column, with bars split by the Depression label.
# Best effort: a column that cannot be drawn is reported and skipped.
for cat_feature in categorical_col:
    try:
        grid = sns.catplot(
            data=df_copy,
            x=cat_feature,
            hue='Depression',
            kind='count',
            palette='Set2',
            height=4,
            aspect=1.5,
        )
        grid.fig.suptitle(f'{cat_feature} Distribution by Depression Level', y=1.03)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    except Exception as err:
        print(f"Could not plot {cat_feature}: {err}")


In [None]:

def crosstab_pretty(df_copy, row_col, col_col, normalize='all'):
    """
    Tabulate every pairing of two categorical columns with its count and share.

    Args:
        df_copy (DataFrame): The dataset.
        row_col (str): Column whose values form the first table column (e.g., Profession).
        col_col (str): Column whose values form the second table column (e.g., Degree).
        normalize (str): 'index' for row-wise %, 'columns' for column-wise %;
            any other value (default 'all') gives % of the grand total.

    Returns:
        PrettyTable: one row per (row_col value, col_col value) pair with the
        columns row_col, col_col, "Count" and "Percentage".
    """
    row_values, col_values = df_copy[row_col], df_copy[col_col]

    # Anything other than 'index' / 'columns' falls back to whole-table shares
    mode = normalize if normalize in ('index', 'columns') else 'all'

    count_table = pd.crosstab(row_values, col_values)
    pct_table = pd.crosstab(row_values, col_values, normalize=mode) * 100

    # Label columns are left-aligned, numeric columns right-aligned
    table = PrettyTable()
    table.field_names = [row_col, col_col, "Count", "Percentage"]
    for field, side in ((row_col, "l"), (col_col, "l"), ("Count", "r"), ("Percentage", "r")):
        table.align[field] = side

    # Emit the pairs row by row, columns varying fastest
    pairs = [(r, c) for r in count_table.index for c in count_table.columns]
    for r, c in pairs:
        table.add_row([r, c, count_table.loc[r, c], f"{pct_table.loc[r, c]:.2f}%"])

    return table

# Profession x Degree table with row-wise percentages.
# Note: this call passes the raw `df`, not the working copy `df_copy`.
print(crosstab_pretty(df, 'Profession', 'Degree', normalize='index'))  # Row-wise %


### Degree Distribution Insights

- The distribution of degree types is **highly skewed**:
  - The top 5 degrees (`Class 12`, `B.Ed`, `B.Com`, `B.Arch`, `BCA`) represent approximately **44%** of the dataset.
  - The bottom 10 degrees account for **less than 15%**, with several appearing in **less than 1%** of records.

- This reflects a **long-tail distribution**, where many degrees are rare and sparsely represented.

- Including all degree categories as features (e.g., via one-hot encoding) would introduce **high dimensionality and sparsity**, which can degrade performance for models like linear regression or logistic regression.

- **Recommendation**: Group rare degrees into an `"Other Degree"` category to simplify modeling and reduce noise.


### Feature Elimination Consideration

Based on the current data:

- **City**: Contains many invalid or inconsistent entries, making it an unreliable feature. It is likely a candidate for removal.
- **Profession**: Over 99% of respondents are labeled as "Student", offering little to no variance. This feature may not contribute meaningfully to model training.
- **Work Pressure**: About 99% of entries are 0.0 ("No work pressure"), so the feature has almost no variance. It may be reasonable to drop this feature.
- **Job Satisfaction**: Similarly, around 99% of entries are 0.0 ("No satisfaction"), indicating minimal variation and potential redundancy.

These features can be considered for exclusion during feature selection or model optimization.
