In [None]:
#IMPORT LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from a CSV file (the path is relative to the working directory)
mental_data = pd.read_csv('Mental Health Dataset.csv')

# All later cleaning is applied to `df`, so `mental_data` keeps the rows exactly as loaded
df = mental_data.copy()  # make a copy of the dataset
df.head()

In [None]:
# Print column names, dtypes and non-null counts of the freshly loaded frame
df.info()

# DATA CLEANING

In [None]:
## missing values
# Show every row that has a missing value in at least one column
df[df.isnull().any(axis=1)]

In [None]:
# Drop rows with missing values in 'self_employed' column
# (missing values in any other column are left untouched by this call)
df = df.dropna(subset=['self_employed'])

# Number of missing values remaining in each column
print(df.isnull().sum())

In [None]:
# Re-check dtypes and non-null counts after dropping rows with a missing 'self_employed'
df.info()

In [None]:
# Check for Percentage of Duplicated rows
# (`duplicated()` flags each repeat of an earlier row; the mean of that boolean mask is the duplicate share)
duplicates = df.duplicated().mean()
print(f" Percentage of duplicates: {duplicates: .1%}")
print()  # blank line

# Drop duplicated rows, keeping the first occurrence of each (pandas default)
df = df.drop_duplicates()
# Measure again to confirm that no duplicated rows remain
duplicated = df.duplicated().mean()
print(f" Percentage of duplicated: {duplicated: .1%}")

In [None]:
# Dropping the Timestamp column


In [None]:
# Remove the Timestamp column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns=['Timestamp'], errors='ignore')

In [None]:
# Display the frame (Jupyter shows a truncated preview for long frames)
df

In [None]:
# List the column names currently present in df
df.columns

In [None]:
# Summary statistics per column (pandas picks the statistics according to each column's dtype)
df.describe()

In [None]:
# Check for Percentage of Duplicated rows
# NOTE(review): this cell only reports the share of duplicated rows; nothing is dropped here.
# Rows that previously differed only in a since-removed column would now count as
# duplicates -- confirm whether they should be removed before modelling.
duplicates = df.duplicated().mean()
print(f" Percentage of duplicates: {duplicates: .1%}")
print()  # blank line



In [None]:
# Display the current state of df (Jupyter shows a truncated preview for long frames)
df

In [None]:
# Remove the mental_health_interview column. Reassigning (instead of inplace=True)
# together with errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises a KeyError.
df = df.drop(columns=['mental_health_interview'], errors='ignore')

In [None]:
def countsplot(category, data = None):
  """
  Draws a count plot for one categorical column and returns its frequency table.

  Args:
      category (str): Name of the categorical column to plot and summarise.
      data (pandas.DataFrame, optional): Frame that holds the column. When
          omitted, the notebook-level `df` is used, which is what this
          function always did before the parameter existed.

  Returns:
      pandas.DataFrame: One row per distinct value of the column, indexed by
          that value, with two string columns: "Count" (formatted with
          thousands separators) and "Proportion" (formatted as a percentage
          with one decimal place).
  """

  # Fall back to the global frame so existing calls like countsplot("Gender") keep working
  frame = df if data is None else data

  # Create count plot with dynamic title
  sns.countplot(data = frame, x = category)
  plt.ylabel("Count")
  plt.title(f"{category} Distribution")
  plt.show()

  # Calculate counts and percentages (value_counts ignores missing values by default)
  category_counts = frame[category].value_counts()
  category_percentages = frame[category].value_counts(normalize = True)

  # Combine counts and percentages into a DataFrame; both Series share the same index
  stats_df = pd.DataFrame({"Count": category_counts, "Proportion": category_percentages})
  # Formatting turns both columns into display strings, not numbers
  stats_df["Count"] = stats_df["Count"].apply("{:,}".format)
  stats_df["Proportion"] = stats_df["Proportion"].apply("{:.1%}".format)

  return stats_df


In [None]:
# Plot and tabulate the distribution of the Gender column
countsplot("Gender")

In [None]:
# Plot and tabulate the distribution of the Coping_Struggles column
countsplot("Coping_Struggles")

In [None]:
#call function 


In [None]:
# Count how often each Occupation occurs within each Gender, largest counts first
occupation_counts = df.groupby("Gender")["Occupation"].value_counts()
gender_data = occupation_counts.reset_index(name = "Count")
gender_data = gender_data.sort_values(by = "Count", ascending = False)

# Grouped bar chart: Occupation on the x-axis, one bar colour per Gender
sns.barplot(x = "Occupation", y = "Count", hue = "Gender", data = gender_data)
plt.title("Occupation by Gender")
plt.show()
gender_data

# DATA PREPROCESSING

In [22]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [23]:
# Fit a separate LabelEncoder for every column and keep each fitted encoder.
# Re-fitting one shared encoder on every column would leave it holding only the
# classes of the last column, so the integer codes could never be mapped back.
# With this dict, label_encoders[column].inverse_transform(codes) recovers the labels.
label_encoders = {}
encoded_columns = {}
for column in df.columns:
    le = LabelEncoder()
    encoded_columns[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Rebuild the frame on the original index so rows stay aligned with `df`
encoded_df = pd.DataFrame(encoded_columns, index = df.index)

encoded_df.head()

Unnamed: 0,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,care_options
3,0,34,1,0,1,1,0,2,1,2,2,0,1,2,2
4,0,34,1,0,1,1,0,2,1,2,2,0,1,2,2
5,0,25,1,0,0,1,0,2,1,2,2,0,1,2,1
6,0,0,1,0,1,1,0,2,1,2,2,0,1,2,1
7,0,34,1,0,0,0,0,2,1,2,2,0,1,2,0


In [24]:
# Target: the encoded Mood_Swings column; features: every other encoded column
y = encoded_df["Mood_Swings"]
X = encoded_df.drop(columns = ["Mood_Swings"])

y

3         2
4         2
5         2
6         2
7         2
         ..
292359    1
292360    1
292361    1
292362    1
292363    1
Name: Mood_Swings, Length: 284858, dtype: int32

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ
# between the train and test partitions -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Shapes of the feature partitions, then of the target partitions
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(227886, 14) (56972, 14)
(227886,) (56972,)


In [None]:
# Import libraries for model evaluation
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_curve,
    auc,
    RocCurveDisplay,
)

In [None]:
# Import libraries for machine learning
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
def pipeline_classification(pipelines, n_splits = 10, random_state = 42):
  """
  Fits each pipeline, scores it with shuffled k-fold cross-validation on the
  training data, predicts on the test set, and draws a box plot of the
  per-fold accuracies.

  Reads the notebook-level `X_train`, `y_train` and `X_test`.

  Args:
      pipelines (list): List of (name, pipeline) tuples. Each pipeline must
          support `fit`, `predict` and `predict_proba`.
      n_splits (int, optional): Number of cross-validation folds. Defaults to 10.
      random_state (int or None, optional): Seed for shuffling the folds, so the
          cross-validation scores are repeatable between runs. Pass None for
          an unseeded shuffle. Defaults to 42.

  Returns:
      tuple: (results_df, model_predictions, model_probabilities) where
          results_df is a pandas.DataFrame with the columns 'Model',
          'Mean Accuracy' and 'Standard Deviation' (one row per pipeline,
          computed over the cross-validation folds), model_predictions maps
          each model name to its `predict(X_test)` output, and
          model_probabilities maps each model name to its
          `predict_proba(X_test)` output.
  """

  # Initialize containers to store results
  cv_results = []
  model_names = []
  model_predictions = {}
  model_probabilities = {}

  # One seeded splitter shared by all models, so every model is scored on the same folds
  kfold = KFold(n_splits = n_splits, shuffle = True, random_state = random_state)

  for name, pipe in pipelines:
      # Fit the pipeline to the full training data; this fitted pipeline is used for the test-set predictions
      pipe.fit(X_train, y_train)

      # Cross-validated accuracy on the training data (one score per fold)
      cv_results.append(cross_val_score(pipe, X_train, y_train, cv = kfold, scoring = 'accuracy', n_jobs = -1))
      model_names.append(name)

      # Make predictions on the test set and store them
      model_predictions[name] = pipe.predict(X_test)
      # Calculate probabilities on the test set and store them
      model_probabilities[name] = pipe.predict_proba(X_test)

  # Plot the per-fold accuracies, one box per model
  plt.boxplot(cv_results)
  plt.xticks(range(1, len(model_names) + 1), model_names)
  plt.title('Algorithm Comparison')
  plt.xlabel('Model')
  plt.ylabel('Accuracy')
  plt.show()

  # Convert results to a DataFrame
  results_df = pd.DataFrame({
      'Model': model_names,
      'Mean Accuracy': [scores.mean() for scores in cv_results],
      'Standard Deviation': [scores.std() for scores in cv_results]
  })

  return results_df, model_predictions, model_probabilities

In [None]:
# Define the list of pipelines
# A fixed random_state on each estimator makes the fitted trees, and therefore the
# reported scores and predictions, repeatable between runs.
pipelines = [
    ('DT', Pipeline([('scaler', MinMaxScaler()), ('DT', DecisionTreeClassifier(random_state = 42))])),
    ('RF', Pipeline([('scaler', MinMaxScaler()), ('RF', RandomForestClassifier(random_state = 42))])),
  ]

results_df, model_predictions, model_probabilities = pipeline_classification(pipelines)
results_df

In [None]:
def get_model_scores(models, predictions, y_test, average = None):
  """
  Calculates accuracy, precision, recall, and F1 scores for each model.

  Args:
      models: Iterable of model names (used only as labels in the 'Model' column).
      predictions: Either an iterable of test-set predictions in the same order
          as `models`, or a dict mapping each entry of `models` to its predictions.
      y_test: True labels for the test set.
      average (str or None, optional): Averaging mode passed to precision_score,
          recall_score and f1_score (for example "macro", "micro" or "weighted").
          None yields one score per class, so those cells hold arrays.
          The string "None" is accepted as an alias for None because it was
          the previous default value.

  Returns:
      A Pandas DataFrame with the columns 'Model', 'Accuracy', 'Precision',
      'Recall' and 'F1 Score', one row per model, values rounded to 3 decimals.

  Raises:
      ValueError: If the number of models and predictions differ.
      KeyError: If `predictions` is a dict that lacks an entry of `models`.
  """

  # scikit-learn accepts the None object, not the string "None"
  if isinstance(average, str) and average == "None":
    average = None

  model_names = list(models)
  if isinstance(predictions, dict):
    # Look predictions up by name; zipping a dict would pair names with its keys
    model_outputs = [predictions[model_name] for model_name in model_names]
  else:
    model_outputs = list(predictions)

  # zip() would silently drop the surplus entries, so fail loudly instead
  if len(model_names) != len(model_outputs):
    raise ValueError(
        f"Got {len(model_names)} models but {len(model_outputs)} sets of predictions"
    )

  # np.round handles both scalar scores and the per-class arrays returned for average=None
  scores = [{
      'Model': model_name,
      'Accuracy': np.round(accuracy_score(y_test, y_pred), 3),
      'Precision': np.round(precision_score(y_test, y_pred, average = average), 3),
      'Recall': np.round(recall_score(y_test, y_pred, average = average), 3),
      'F1 Score': np.round(f1_score(y_test, y_pred, average = average), 3)
  } for model_name, y_pred in zip(model_names, model_outputs)]

  scores_df = pd.DataFrame(scores)

  return scores_df

In [1]:
# `model_predictions` (returned by pipeline_classification) maps each model name to its
# test-set predictions; pass the names and the prediction arrays in matching order.
scores = get_model_scores(
    list(model_predictions.keys()),
    list(model_predictions.values()),
    y_test,
    average = "macro",
)

scores

NameError: name 'get_model_scores' is not defined