In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd


In [3]:
# Load the dataset
df = pd.read_csv('health_recommendation_combinations_with_severity_tips.csv')


In [4]:
print("Dataset Preview:")
print(df.head())

Dataset Preview:
  Age Group Sleep Hours Work Hours Physical Activities  \
0     15-19          <6         <6                walk   
1     15-19          <6         <6                walk   
2     15-19          <6         <6                walk   
3     15-19          <6         <6                walk   
4     15-19          <6         <6                walk   

                     Mental State Severity  \
0  Relationship and Family Issues     High   
1  Relationship and Family Issues   Medium   
2  Relationship and Family Issues      Low   
3                 Family Dynamics     High   
4                 Family Dynamics   Medium   

                                     Recommendations  
0  ['Communicate openly with family members.', 'C...  
1  ['Communicate openly with family members.', 'C...  
2  ['Communicate openly with family members.', 'C...  
3  ['Plan family activities to strengthen bonds.'...  
4  ['Plan family activities to strengthen bonds.'...  


In [5]:
# Check for missing values in the dataset
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
Age Group              0
Sleep Hours            0
Work Hours             0
Physical Activities    0
Mental State           0
Severity               0
Recommendations        0
dtype: int64


In [6]:
df["Age Group"].unique()

array(['15-19', '20-24', '25-30'], dtype=object)

In [7]:
df["Mental State"].unique(),

(array(['Relationship and Family Issues', 'Family Dynamics',
        'Sexual and Reproduction Issues', 'Anxiety', 'Hormonal Changes',
        'Body Image', 'Eating Disorder', 'Depression', 'Suicidal Thoughts'],
       dtype=object),)

In [8]:
df["Physical Activities"].unique(),

(array(['walk', 'run', 'yoga', 'gym', 'no-exercise'], dtype=object),)

In [9]:
df["Recommendations"].unique(),

(array(["['Communicate openly with family members.', 'Consider family counseling.', 'Set clear boundaries in relationships.', 'Practice active listening in conversations.', 'Spend quality time with loved ones.']",
        "['Plan family activities to strengthen bonds.', 'Encourage family meetings to discuss issues.', 'Respect individual opinions within the family.', 'Teach children about empathy and understanding.', 'Develop a routine to spend time together.']",
        "['Consult a qualified medical professional.', 'Educate yourself on reproductive health.', 'Practice safe sex methods.', 'Seek support groups for sexual health issues.', 'Discuss concerns openly with a partner.']",
        "['Practice deep breathing exercises daily.', 'Engage in mindfulness meditation.', 'Exercise regularly to reduce stress.', 'Avoid excessive caffeine or sugar intake.', 'Develop a consistent sleep schedule.']",
        "['Maintain a balanced diet rich in nutrients.', 'Exercise regularly to manage hormo

In [10]:
df["Severity"].unique(),

(array(['High', 'Medium', 'Low'], dtype=object),)

In [11]:
df["Sleep Hours"].unique(),

(array(['<6', '6-8', '>8'], dtype=object),)

In [12]:
df["Work Hours"].unique(),

(array(['<6', '6-8', '>8'], dtype=object),)

In [13]:
# Get basic information about the dataset (e.g., data types, number of entries)
print("\nDataset Information:")
df.info()




Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3645 entries, 0 to 3644
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age Group            3645 non-null   object
 1   Sleep Hours          3645 non-null   object
 2   Work Hours           3645 non-null   object
 3   Physical Activities  3645 non-null   object
 4   Mental State         3645 non-null   object
 5   Severity             3645 non-null   object
 6   Recommendations      3645 non-null   object
dtypes: object(7)
memory usage: 199.5+ KB


In [14]:

# Drop any rows with missing values (if any exist)
df.dropna(inplace=True)

In [15]:
# Encode categorical features
label_encoders = {}
for col in df.columns[:-1]:  # Exclude target column 'Recommendations'
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le  # Save the encoder for future decoding


In [16]:
# Encode the target variable (Recommendations)
y_encoder = LabelEncoder()
df['Recommendations'] = y_encoder.fit_transform(df['Recommendations'])


In [17]:
# Define features (X) and target labels (y)
X = df.drop(columns=['Recommendations'])
y = df['Recommendations']


In [18]:
X

Unnamed: 0,Age Group,Sleep Hours,Work Hours,Physical Activities,Mental State,Severity
0,0,1,1,3,6,0
1,0,1,1,3,6,2
2,0,1,1,3,6,1
3,0,1,1,3,4,0
4,0,1,1,3,4,2
...,...,...,...,...,...,...
3640,2,2,2,1,2,2
3641,2,2,2,1,2,1
3642,2,2,2,1,8,0
3643,2,2,2,1,8,2


In [19]:
y

0        7
1        7
2        7
3       23
4       23
        ..
3640    31
3641    31
3642    15
3643    15
3644    15
Name: Recommendations, Length: 3645, dtype: int64

In [20]:
print(X[:10])
print("-------------------")
print(y[:10])

   Age Group  Sleep Hours  Work Hours  Physical Activities  Mental State  \
0          0            1           1                    3             6   
1          0            1           1                    3             6   
2          0            1           1                    3             6   
3          0            1           1                    3             4   
4          0            1           1                    3             4   
5          0            1           1                    3             4   
6          0            1           1                    3             7   
7          0            1           1                    3             7   
8          0            1           1                    3             7   
9          0            1           1                    3             0   

   Severity  
0         0  
1         2  
2         1  
3         0  
4         2  
5         1  
6         0  
7         2  
8         1  
9         0  
---------

In [21]:
original_labels = y_encoder.classes_
print(original_labels)

["['Avoid comparing yourself to others on social media.', 'Focus on the strengths and capabilities of your body.', 'Wear clothes that make you feel confident.', 'Surround yourself with positive and supportive people.', 'Engage in physical activities you enjoy.', 'Focus on maintaining healthy sleep patterns.', 'Engage in light physical activities like walking.', 'Spend more time in nature to rejuvenate your mind.', 'Read books or listen to calming music.', 'Practice gratitude and mindfulness exercises.']"
 "['Avoid comparing yourself to others on social media.', 'Focus on the strengths and capabilities of your body.', 'Wear clothes that make you feel confident.', 'Surround yourself with positive and supportive people.', 'Engage in physical activities you enjoy.', 'Immediately consult a mental health professional.', 'Reduce work hours to avoid burnout.', 'Adopt a sleep hygiene routine to improve rest.', 'Avoid making critical decisions under stress.', 'Seek help from a trusted crisis sup

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3645 entries, 0 to 3644
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Age Group            3645 non-null   int64
 1   Sleep Hours          3645 non-null   int64
 2   Work Hours           3645 non-null   int64
 3   Physical Activities  3645 non-null   int64
 4   Mental State         3645 non-null   int64
 5   Severity             3645 non-null   int64
 6   Recommendations      3645 non-null   int64
dtypes: int64(7)
memory usage: 199.5 KB


In [23]:
df["Age Group"].unique(),


(array([0, 1, 2]),)

In [24]:
df["Mental State"].unique(),


(array([6, 4, 7, 0, 5, 1, 3, 2, 8]),)

In [25]:
df["Physical Activities"].unique(),


(array([3, 2, 4, 0, 1]),)

In [26]:
df["Recommendations"].unique(),


(array([ 7, 23, 11, 27, 19,  3, 35, 31, 15,  5, 21,  9, 25, 17,  1, 33, 29,
        13,  6, 22, 10, 26, 18,  2, 34, 30, 14,  4, 20,  8, 24, 16,  0, 32,
        28, 12]),)

In [27]:
df["Severity"].unique(),


(array([0, 2, 1]),)

In [28]:
df["Sleep Hours"].unique(),


(array([1, 0, 2]),)

In [29]:
df["Work Hours"].unique(),

(array([1, 0, 2]),)

In [30]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [31]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [32]:
# Save the encoders and scaler for later use in Flask app
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('y_encoder.pkl', 'wb') as f:
    pickle.dump(y_encoder, f)

In [33]:
# Model training
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(solver='saga', max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}


In [34]:
# Evaluate models
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Model Accuracy: {accuracy:.2f}")

Random Forest Model Accuracy: 0.98
Logistic Regression Model Accuracy: 0.89
Decision Tree Model Accuracy: 1.00


In [35]:
# Save the best model (Decision Tree as an example)
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(models["Decision Tree"], f)

In [None]:
# Load the trained model
import numpy as np

with open("decision_tree_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Define a sample input **without column names**
sample_input = np.array([[1, 1, 2, 2, 4, 0]])  # Use NumPy array instead of DataFrame

# Make a prediction
prediction = model.predict(sample_input)
decoded_predictions = y_encoder.inverse_transform(prediction)

for i in range():  # Show first 5 predictions
    print(f"Decoded: {decoded_predictions[i]}")

Decoded: ['Contact a crisis hotline immediately.', 'Talk to a trusted family member or friend.', 'Remove access to harmful objects or substances.', 'Seek professional mental health support.', 'Engage in mindfulness to calm intrusive thoughts.']


IndexError: index 1 is out of bounds for axis 0 with size 1