In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
# Read the preprocessed dataset from the working directory.
data = pd.read_csv('preprocessed_data.csv')

# Every column listed here is standardized in place below.
numerical_columns = [
    'FRUITS_VEGGIES', 'DAILY_STRESS', 'PLACES_VISITED', 'CORE_CIRCLE',
    'SUPPORTING_OTHERS', 'SOCIAL_NETWORK', 'ACHIEVEMENT', 'DONATION',
    'BMI_RANGE', 'TODO_COMPLETED', 'FLOW', 'DAILY_STEPS', 'LIVE_VISION',
    'SLEEP_HOURS', 'LOST_VACATION', 'DAILY_SHOUTING', 'SUFFICIENT_INCOME',
    'PERSONAL_AWARDS', 'TIME_FOR_PASSION', 'WEEKLY_MEDITATION', 'GENDER',
    'WORK_LIFE_BALANCE_SCORE',
    'AGE_21 to 35', 'AGE_36 to 50', 'AGE_51 or more', 'AGE_Less than 20',
]

# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split, and the list above includes the target WORK_LIFE_BALANCE_SCORE. Any
# metric or coefficient computed from `data` afterwards is therefore expressed
# in standardized units, and test rows contribute to the scaling statistics.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_columns])
data[numerical_columns] = scaled_values

In [3]:
# Predictors are three columns of `data`; the target is the balance score.
feature_columns = ['CORE_CIRCLE', 'SOCIAL_NETWORK', 'SUPPORTING_OTHERS']
X = data[feature_columns]
y = data['WORK_LIFE_BALANCE_SCORE']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training portion (fit() returns the
# estimator itself, so `model` is the fitted LinearRegression).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics.
for label, value in (('Mean squared error:', mse), ('R-squared:', r2)):
    print(label, value)

Mean squared error: 0.5370179784754726
R-squared: 0.4521635144886853


In [4]:
# Show the fitted weights of `model`, one per column of X in order:
# CORE_CIRCLE, SOCIAL_NETWORK, SUPPORTING_OTHERS.
# NOTE(review): `model` is rebound to an XGBoost regressor in a later cell, so
# this cell only shows the linear-regression weights when run before that one.
coefficients = model.coef_
print("Coefficients:", coefficients)

Coefficients: [0.31760954 0.19606225 0.38049037]


<!-- The coefficients you've obtained represent the relationship between each social factor and the work-life balance score. Here's a brief interpretation of each coefficient:

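CORE_CIRCLE: The coefficient of 0.3176 indicates that for each one-standard-deviation increase in core circle size, the work-life balance score is expected to increase by approximately 0.3176 standard deviations, holding the other two variables constant. (The units are standard deviations because both the predictors and the score were standardized before fitting.)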

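SOCIAL_NETWORK: The coefficient of 0.1961 shows that for each one-standard-deviation increase in social network size, the work-life balance score is expected to increase by approximately 0.1961 standard deviations, holding the other two variables constant. (The units are standard deviations because both the predictors and the score were standardized before fitting.)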

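SUPPORTING_OTHERS: The coefficient of 0.3805 suggests that for each one-standard-deviation increase in supporting others, the work-life balance score is expected to increase by approximately 0.3805 standard deviations, holding the other two variables constant. (The units are standard deviations because both the predictors and the score were standardized before fitting.)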

These results indicate that all three social factors have a positive relationship with the work-life balance score. -->

In [7]:
# Rebuild the 80/20 split with the same seed, so the forest is trained and
# scored on the same rows as any other model using this split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 100-tree random forest; the seed makes the fitted ensemble reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model = rf_model.fit(X_train, y_train)  # fit() returns the same estimator

# Score the held-out rows.
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics.
for label, value in (('Mean squared error:', mse), ('R-squared:', r2)):
    print(label, value)

Mean squared error: 0.5752760342246299
R-squared: 0.4131347302688074


In [9]:
# Importance scores reported by the fitted random forest, one per column of X.
importances = rf_model.feature_importances_

# Pair each score with its column name and rank from most to least important.
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print(feature_importances)

             Feature  Importance
2  SUPPORTING_OTHERS    0.547151
0        CORE_CIRCLE    0.298213
1     SOCIAL_NETWORK    0.154636


In [11]:
# Rebuild the 80/20 split with the same seed used elsewhere in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted trees with squared-error loss: 100 shallow (depth-3) trees
# with a 0.1 learning rate.
# NOTE(review): this rebinds `model`, which an earlier cell uses for the fitted
# LinearRegression; cells that read `model` depend on execution order.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics.
for label, value in (("Mean squared error:", mse), ("R-squared:", r2)):
    print(label, value)

Mean squared error: 0.530556391994573
R-squared: 0.45875527299660857


In [12]:
# Importance scores reported by whatever estimator `model` currently refers to
# (the XGBoost regressor when the notebook is run top to bottom).
importances = model.feature_importances_

# Pair each score with its column name and rank from most to least important.
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print(feature_importances)

             Feature  Importance
2  SUPPORTING_OTHERS    0.641574
0        CORE_CIRCLE    0.236800
1     SOCIAL_NETWORK    0.121626


<!-- All three models agree on the ranking: SUPPORTING_OTHERS is the strongest predictor of the work-life balance score, followed by CORE_CIRCLE and then SOCIAL_NETWORK. -->