In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the survey responses from the Kaggle input directory.
csv_path = '/kaggle/input/lifestyle-and-wellbeing-data/Wellbeing_and_lifestyle_data.csv'
data = pd.read_csv(csv_path)
print(data.shape)
data.head()

# Preprocessing

In [None]:
# Encode the categorical columns as integers and clean DAILY_STRESS.
data['GENDER'] = data['GENDER'].map({'Female': 0, 'Male': 1})

# Keep only the rows whose DAILY_STRESS value consists of digits.
# Casting to str first keeps the check from raising on missing values (NaN)
# or on values that are already integers; for strings it is the same test.
stress_is_numeric = data['DAILY_STRESS'].astype(str).str.isnumeric()
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning).
data = data[stress_is_numeric].copy()
data['DAILY_STRESS'] = data['DAILY_STRESS'].astype(int)

data['AGE'] = data['AGE'].map({'Less than 20': 0, '21 to 35': 1, '36 to 50': 2, '51 or more': 3})
data = data.drop(columns=['Timestamp'])

In [None]:
import matplotlib.pyplot as plot
# Bar chart of how many rows fall into each BMI_RANGE value.
bmi_counts = data['BMI_RANGE'].value_counts()
bmi_counts.plot(kind='bar')

**1 - proper weight
2 - overweight**

**We can see that the population consists mostly of people who have a proper weight**

**Now let's look at the differences between categories**

In [None]:
import seaborn as sns

# Row counts per BMI_RANGE, split by AGE (hue) with one panel per GENDER.
ax = sns.catplot(
    data=data,
    x="BMI_RANGE",
    hue="AGE",
    col="GENDER",
    kind="count",
)

**Age — Less than 20: 0, 21 to 35: 1, 36 to 50: 2, 51 or more: 3**

**Gender — Female: 0, Male: 1**

**We see that there are more overweight women than overweight men in the population.
Also, overweight tends to change more with age in the male population than in the female population.**

**A big change can be seen in men between the 21 to 35 age group and the groups aged 36 and up (probably because this is the age after marriage, when good food awaits him at home)** :D



# Now let's see data about daily stress 

In [None]:
# Mean DAILY_STRESS for each GENDER value, drawn as a bar chart.
x = (
    data.groupby('GENDER')['DAILY_STRESS']
    .mean()
    .to_frame('Daily Stress Mean')
)
x.plot(kind='bar')

**We see that, on average, women report more daily stress than men**

quote from report "Some 200,000 men reported work-related stress averaged over the past three years compared to 272,000 women, according to the HSE’s figures. This means women were 1.4 times more likely to suffer from stress, anxiety and depression"

In [None]:
# Row counts per TIME_FOR_PASSION value, one panel per GENDER.
ax = sns.catplot(
    data=data,
    x="TIME_FOR_PASSION",
    col="GENDER",
    kind="count",
)

**We can see here that women take less time for their passions during the day**

**Maybe this affects their daily stress**

In [None]:
# Row counts per DAILY_MEDITATION value, one panel per GENDER.
ax = sns.catplot(
    data=data,
    x="DAILY_MEDITATION",
    col="GENDER",
    kind="count",
)

**And women do daily meditation more than men**

In [None]:
# Number of missing values in each column, then the per-column summary.
missing_per_column = data.isna().sum()
print(missing_per_column)

data.info()


# Find correlation between features

**I took this chart from another notebook to show the overall correlation between features and how each feature affects the others.**
Here is the link: https://www.kaggle.com/fatoubd/3-things-to-do-for-a-work-life-balance#4.-Evaluate-the-results

In [None]:
# Pairwise correlations between the numeric columns only, so that a
# non-numeric column cannot make DataFrame.corr() raise.
corrmat = data.select_dtypes(include='number').corr()

# Clustered heatmap of the correlation matrix with enlarged tick labels.
map_corr = sns.clustermap(corrmat, cmap="YlGnBu", linewidths=0.1)
heatmap_ax = map_corr.ax_heatmap
plot.setp(heatmap_ax.yaxis.get_majorticklabels(), rotation=0)
heatmap_ax.set_xticklabels(heatmap_ax.get_xmajorticklabels(), fontsize=16)
heatmap_ax.set_yticklabels(heatmap_ax.get_ymajorticklabels(), fontsize=16)

# Save through the grid object so the clustermap's own figure is written,
# rather than whichever figure happens to be current in pyplot.
map_corr.savefig('heatmap.png', dpi=300, bbox_inches='tight')

It is interesting to see that the highest correlations include:

Time for passion- Flow

Personal Awards - Achievement

'Time for passion' is the time we dedicate to what we enjoy doing. It can be work, hobbies or volunteering. It is important to have time for passion in our busy lives because it is highly correlated with 'flow', which is described in the survey as follows: "Flow is defined as the mental state, in which you are fully immersed in performing an activity. You then experience a feeling of energized focus, full involvement, and enjoyment in the process of this activity." Mihaly Csikszentmihalyi described it as "the secret to happiness".

According to the correlation map, people who do what they are passionate about have more personal awards and achievements in their lives, and this also correlates with completed to-dos. Overall, we can say that they are more successful.

In [None]:
# Target column for the classifiers in the following cells.
to_pred = data['BMI_RANGE']

# Compute the correlation matrix once (numeric columns only) and reuse it
# for both reports instead of recomputing it for every print.
corr_matrix = data.select_dtypes(include='number').corr()

print('BMI corr: ')
print(corr_matrix['BMI_RANGE'].sort_values(ascending=False))

print('Stress corr: ')
print(corr_matrix['DAILY_STRESS'].sort_values(ascending=False))



# Drop the column to predict and check the remaining data

In [None]:
# Feature frame: every column of `data` except the BMI_RANGE target.
data_for_prediction = data.drop('BMI_RANGE', axis=1)
data_for_prediction.info()

# Number of feature columns that remain.
print(data_for_prediction.shape[1])


# Random forest classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: random forest trained on all features, scored with accuracy.
# A fixed random_state makes both the split and the forest reproducible
# across runs; stratify keeps the BMI_RANGE class ratio the same in the
# train and test parts, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    data_for_prediction, to_pred, test_size=0.2, random_state=42, stratify=to_pred
)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
score = accuracy_score(y_test, prediction)
score

I changed the max depth many times and it was better without

# Select 9 best features

In [None]:
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest, SelectFpr
# Keep the 9 features with the highest ANOVA F-score against the target.
kbest_selector = SelectKBest(score_func=f_classif, k=9)
data_new = kbest_selector.fit_transform(data_for_prediction, to_pred)

In [None]:
# Random forest on the 9 best features, scored with accuracy.
# Split first and fit the feature selector on the training part only, so the
# test rows do not influence which features are chosen (no leakage).
# A fixed random_state makes the split and the forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_for_prediction, to_pred, test_size=0.2, random_state=42, stratify=to_pred
)
selector = SelectKBest(f_classif, k=9).fit(X_train, y_train)
model = RandomForestClassifier(random_state=42)
model.fit(selector.transform(X_train), y_train)
prediction = model.predict(selector.transform(X_test))
score = accuracy_score(y_test, prediction)
score

It seems that this does not improve the model accuracy. There are 9 features that correlate with BMI.

# Take the 9 features with the most relevant correlation

In [None]:
# Hand-picked feature columns used for the next model.
corr_feature_names = [
    'AGE',
    'DAILY_STRESS',
    'DONATION',
    'DAILY_SHOUTING',
    'LOST_VACATION',
    'SUPPORTING_OTHERS',
    'SOCIAL_NETWORK',
    'PERSONAL_AWARDS',
    'FLOW',
]
data_after_corr = data[corr_feature_names]

In [None]:
# Random forest on the hand-picked feature columns, scored with accuracy.
# A fixed random_state makes the split and the forest reproducible;
# stratify keeps the BMI_RANGE class ratio equal in train and test.
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    data_after_corr, to_pred, test_size=0.2, random_state=42, stratify=to_pred
)
model = RandomForestClassifier(random_state=42)
model.fit(new_X_train, new_y_train)
new_prediction_corr = model.predict(new_X_test)
score = accuracy_score(new_y_test, new_prediction_corr)
score

# Predict with a different scoring function

In [None]:
from sklearn.metrics import f1_score
# Random forest on all features, scored with the F1 score.
# A fixed random_state makes the split and the forest reproducible;
# stratify keeps the BMI_RANGE class ratio equal in train and test.
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    data_for_prediction, to_pred, test_size=0.2, random_state=42, stratify=to_pred
)
model = RandomForestClassifier(random_state=42)
model.fit(new_X_train, new_y_train)
new_prediction_corr = model.predict(new_X_test)
# pos_label=1 is the library default; it is spelled out because the labels
# here are 1 and 2, so the reported F1 is the one for BMI_RANGE == 1
# (the proper-weight class according to the notebook's legend).
score = f1_score(new_y_test, new_prediction_corr, pos_label=1, zero_division=1)
score

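We see that this score is higher than the plain accuracy above. Note, however, that `f1_score` computes the F1 score, i.e. the harmonic mean of precision and recall, $F_1 = 2\cdot\frac{\text{precision}\cdot\text{recall}}{\text{precision}+\text{recall}}$, so the two numbers are not directly comparable.
A related metric for imbalanced datasets is balanced accuracy, $\frac{1}{2}\left(\frac{TP}{TP+FN}+\frac{TN}{TN+FP}\right)$, where TP, TN, FP, and FN refer to true positives, true negatives, false positives, and false negatives, respectively.
Such metrics may be useful here because there are more people (rows) with proper weight than with overweight.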

# Decision Tree classification

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth-limited decision tree on all features, scored with the F1 score.
# A fixed random_state makes the split and the tree reproducible;
# stratify keeps the BMI_RANGE class ratio equal in train and test.
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    data_for_prediction, to_pred, test_size=0.2, random_state=42, stratify=to_pred
)
x = DecisionTreeClassifier(max_depth=4, random_state=42)
x.fit(new_X_train, new_y_train)
pred = x.predict(new_X_test)
score = f1_score(new_y_test, pred, zero_division=1)
score

RandomForest is better