In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle


In [37]:
# Read the health-recommendation CSV (path is relative to the working directory)
dataset_path = 'health_recommendation_combinations_with_severity_tips.csv'
df = pd.read_csv(dataset_path)


In [38]:
# Print a heading followed by the first five rows of the frame
print("Dataset Preview:", df.head(), sep="\n")

Dataset Preview:
  Age Group Sleep Hours Work Hours Physical Activities  \
0     15-19          <6         <6                walk   
1     15-19          <6         <6                walk   
2     15-19          <6         <6                walk   
3     15-19          <6         <6                walk   
4     15-19          <6         <6                walk   

                     Mental State Severity  \
0  Relationship and Family Issues     High   
1  Relationship and Family Issues   Medium   
2  Relationship and Family Issues      Low   
3                 Family Dynamics     High   
4                 Family Dynamics   Medium   

                                     Recommendations  
0  ['Communicate openly with family members.', 'C...  
1  ['Communicate openly with family members.', 'C...  
2  ['Communicate openly with family members.', 'C...  
3  ['Plan family activities to strengthen bonds.'...  
4  ['Plan family activities to strengthen bonds.'...  


In [39]:
# Count the null entries in each column and print the per-column totals
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)


Missing Values:
Age Group              0
Sleep Hours            0
Work Hours             0
Physical Activities    0
Mental State           0
Severity               0
Recommendations        0
dtype: int64


In [40]:
# Summarise the frame: column dtypes, non-null counts and memory usage
info_heading = "\nDataset Information:"
print(info_heading)
df.info()




Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3645 entries, 0 to 3644
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Age Group            3645 non-null   object
 1   Sleep Hours          3645 non-null   object
 2   Work Hours           3645 non-null   object
 3   Physical Activities  3645 non-null   object
 4   Mental State         3645 non-null   object
 5   Severity             3645 non-null   object
 6   Recommendations      3645 non-null   object
dtypes: object(7)
memory usage: 199.5+ KB


In [41]:

# Remove every row that contains at least one missing value.
# The frame is modified in place, so `df` keeps referring to the same object.
df.dropna(axis=0, how='any', inplace=True)

In [42]:
# Encode categorical features
# Every non-numeric feature column is replaced in place by integer codes.
# The fitted encoder for each column is stored in `label_encoders`
# (column name -> LabelEncoder) so new inputs can be transformed the same way
# and codes can be decoded later.
label_encoders = {}
# Exclude the target by name rather than by position, so the loop stays
# correct if the column order of the CSV changes.
feature_columns = df.columns.drop('Recommendations')
for col in feature_columns:
    # Test for "not numeric" instead of dtype == 'object': text columns can
    # also arrive as pandas string or category dtype, and those would be
    # skipped by an object-only check and reach the scaler unencoded.
    if not pd.api.types.is_numeric_dtype(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le  # Save the encoder for future decoding


In [43]:
# Separate the target column (y) from the remaining feature columns (X).
# NOTE: X and y are assigned again in the train/test split cell further down,
# after the target column has been label-encoded.
y = df['Recommendations']
X = df.drop(columns='Recommendations')


In [44]:
# Encode the target variable (Recommendations)
# The fitted encoder is kept in `target_encoder` so that predicted class codes
# can be mapped back to the original recommendation text with
# target_encoder.inverse_transform(...).
# NOTE: re-running this cell on an already-encoded column refits the encoder on
# the integer codes and loses the text mapping; re-run from the load cell.
target_encoder = LabelEncoder()
df['Recommendations'] = target_encoder.fit_transform(df['Recommendations'])


In [45]:
# Separate the encoded target from the features, then hold out 20% of the
# rows for testing. The fixed random_state makes the split reproducible.
y = df['Recommendations']
X = df.drop(columns='Recommendations')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [46]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [47]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Every estimator receives the same random_state for reproducibility.
models = dict([
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Logistic Regression", LogisticRegression(solver='saga', max_iter=1000, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
])


In [48]:
# Fit each candidate on the scaled training split, predict the scaled test
# split, and print the resulting accuracy to two decimal places.
for name in models:
    model = models[name]
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"{name} Model Accuracy: {accuracy:.2f}")



Random Forest Model Accuracy: 0.98
Logistic Regression Model Accuracy: 0.89
Decision Tree Model Accuracy: 1.00
