# Imports

In [None]:
import pandas as pd
import statsmodels.api as sm
from scipy import stats
import itertools
from statsmodels.formula.api import ols

# Data Preprocessing for Statistical Analysis

In [None]:
url = 'https://docs.google.com/spreadsheets/d/1gIxdXSMysCDWhsoB65k2T6YuW4eKZefjCap3nMCWiXM/export?format=csv'
df = pd.read_csv(url)
df.tail()

Unnamed: 0,Timestamp,Column 1,Age,Work/Study Hours per Day,"Overall Productivity Rating\nOn a scale of 1 to 5, how would you rate your daily productivity?","How consistent is your productivity throughout the day?\nOn a scale of 1 to 5, how would you rate your productivity consistency?",Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Early Morning],Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Morning],Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Afternoon],Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Evening],Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Late Night],"Frequency of Caffeinated Beverage Consumption\nHow many cups (6 - 8 oz) of caffeinated beverages (coffee, tea, energy drinks) do you consume in a day?\n(Enter a valid number between 0 to 10)",At what time(s) of day do you usually consume caffeinated beverages? \n(Select all that apply),Daily Hours of Sleep\nHow many hours do you sleep daily? \n(Enter a valid number between 0 to 16),"Do you have a consistent sleep schedule \n(i.e., going to bed and waking up at the same times each day)?","Frequency of Physical Exercise \nOn a scale of 0 to 7, how often do you engage in physical exercise per week?","Energy level throughout the day\nDuring the day do you consistently notice fluctuations in your energy levels, such as times of higher or lower energy?",What time of day do you usually engage in physical activity?,Email Address
39,12/2/2024 13:24:48,I Accept,21,4,4,4,3,5,4,4,2,1.0,"Morning, Evening",7,4,4,4 - High energy for most of the day,Morning,
40,12/2/2024 13:37:37,I Accept,25,3,3,3,4,4,4,2,2,1.0,Morning,7,4,6,3 - Energy constant/neutral at any given time ...,Morning,
41,12/2/2024 15:55:23,I Accept,24,3,2,3,3,3,2,2,1,1.0,Morning,7,2,1,2 - Low energy for most of the day,Morning,
42,12/2/2024 18:29:28,I Accept,26,8,4,3,5,5,3,3,5,1.0,Morning,7,3,7,3 - Energy constant/neutral at any given time ...,"Early Morning, Morning",
43,12/3/2024 23:56:45,I Accept,24,10,4,3,4,5,3,2,4,3.0,Evening,8,3,3,3 - Energy constant/neutral at any given time ...,"Early Morning, Evening",


# Clean data

In [None]:
df.columns

Index(['Timestamp', 'Column 1', 'Age', 'Work/Study Hours per Day',
       'Overall Productivity Rating\nOn a scale of 1 to 5, how would you rate your daily productivity?',
       'How consistent is your productivity throughout the day?\nOn a scale of 1 to 5, how would you rate your productivity consistency?',
       'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Early Morning]',
       'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Morning]',
       'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Afternoon]',
       'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Evening]',
       'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Prod

In [None]:
def clean_data(df):
    # Make a copy of the dataframe
    df_clean = df.copy()

    new_columns = {
        'Timestamp': 'timestamp',
        'Column 1': 'consent',
        'Age': 'age',
        'Work/Study Hours per Day': 'work_study_hours',
        'Overall Productivity Rating\nOn a scale of 1 to 5, how would you rate your daily productivity?': 'daily_productivity',
        'How consistent is your productivity throughout the day?\nOn a scale of 1 to 5, how would you rate your productivity consistency?': 'productivity_consistency',
        'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Early Morning]': 'early_morning_prod',
        'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Morning]': 'morning_prod',
        'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Afternoon]': 'afternoon_prod',
        'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Evening]': 'evening_prod',
        'Productivity levels throughout the day\nOn the scale of 1 to 5 rate yourself\n1 - Least Productive\n5 - Most Productive [Late Night]': 'late_night_prod',
        'Frequency of Caffeinated Beverage Consumption\nHow many cups (6 - 8 oz) of caffeinated beverages (coffee, tea, energy drinks) do you consume in a day?\n(Enter a valid number between 0 to 10)': 'caffeine_consumption',
        'At what time(s) of day do you usually consume caffeinated beverages? \n(Select all that apply)': 'caffeine_timing',
        'Daily Hours of Sleep\nHow many hours do you sleep daily? \n(Enter a valid number between 0 to 16)': 'sleep_hours',
        'Do you have a consistent sleep schedule \n(i.e., going to bed and waking up at the same times each day)?': 'consistent_sleep',
        'Frequency of Physical Exercise \nOn a scale of 0 to 7, how often do you engage in physical exercise per week?': 'exercise_freq',
        'Energy level throughout the day\nDuring the day do you consistently notice fluctuations in your energy levels, such as times of higher or lower energy?  ': 'energy_fluctuations',
        'What time of day do you usually engage in physical activity?': 'exercise_timing',
        'Email Address': 'email'
    }

    df_clean.rename(columns=new_columns, inplace=True)
    df_clean.drop(columns=["timestamp", "consent", "email"], inplace=True)

    # Handle missing or invalid values
    df_clean['age'] = pd.to_numeric(df_clean['age'], errors='coerce')
    df_clean['work_study_hours'] = pd.to_numeric(df_clean['work_study_hours'], errors='coerce')
    df_clean['daily_productivity'] = pd.to_numeric(df_clean['daily_productivity'], errors='coerce')
    df_clean['productivity_consistency'] = pd.to_numeric(df_clean['productivity_consistency'], errors='coerce')
    df_clean['sleep_hours'] = pd.to_numeric(df_clean['sleep_hours'], errors='coerce')
    df_clean['caffeine_consumption'] = pd.to_numeric(df_clean['caffeine_consumption'], errors='coerce')
    df_clean['exercise_freq'] = pd.to_numeric(df_clean['exercise_freq'], errors='coerce')

    # Clean Energy Level - extract numeric value
    df_clean['energy_fluctuations'] = df_clean['energy_fluctuations'].str.extract('(\d+)').astype(int)

    # Create dummy variables for time-based categories
    caffeine_dummies = df_clean['caffeine_timing'].str.get_dummies(sep=', ')
    exercise_dummies = df_clean['exercise_timing'].str.get_dummies(sep=', ')

    # Standardize column names: lowercase and prefix with context
    caffeine_dummies.columns = [f'caffeine_{col.lower().replace(" ", "_")}' for col in caffeine_dummies.columns]
    exercise_dummies.columns = [f'exercise_{col.lower().replace(" ", "_")}' for col in exercise_dummies.columns]

    # Concatenate dummy variables and drop original columns
    df_clean = pd.concat([df_clean, caffeine_dummies, exercise_dummies], axis=1)
    df_clean.drop(columns=['caffeine_timing', 'exercise_timing'], inplace=True)

    return df_clean

df_clean = clean_data(df)
df_clean.head()

Unnamed: 0,age,work_study_hours,daily_productivity,productivity_consistency,early_morning_prod,morning_prod,afternoon_prod,evening_prod,late_night_prod,caffeine_consumption,...,caffeine_afternoon,caffeine_evening,caffeine_late_night,caffeine_morning,caffeine_never,exercise_afternoon,exercise_early_morning,exercise_evening,exercise_morning,exercise_never
0,40,12,4,4,4,4,4,3,2,3.0,...,0,0,0,1,0,1,0,0,0,0
1,24,1,1,2,1,1,1,1,1,0.0,...,0,0,0,0,1,0,0,1,0,0
2,23,10,4,4,4,5,2,5,2,1.5,...,0,1,0,1,0,1,0,0,0,0
3,24,8,4,3,2,3,5,4,5,0.0,...,0,0,0,0,1,1,0,1,0,0
4,23,8,5,5,5,5,5,5,5,1.0,...,0,1,0,1,0,0,1,1,1,0


In [None]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       44 non-null     int64  
 1   work_study_hours          44 non-null     int64  
 2   daily_productivity        44 non-null     int64  
 3   productivity_consistency  44 non-null     int64  
 4   early_morning_prod        44 non-null     int64  
 5   morning_prod              44 non-null     int64  
 6   afternoon_prod            44 non-null     int64  
 7   evening_prod              44 non-null     int64  
 8   late_night_prod           44 non-null     int64  
 9   caffeine_consumption      44 non-null     float64
 10  sleep_hours               44 non-null     int64  
 11  consistent_sleep          44 non-null     int64  
 12  exercise_freq             44 non-null     int64  
 13  energy_fluctuations       44 non-null     int64  
 14  caffeine_aft

### **Column Classification**
|Columns | Type |
|-|-|
|`age`| Appears numeric but is often treated as a discrete variable.|
|`daily_productivity`, `productivity_consistency`, `early_morning_prod`, `morning_prod`, `afternoon_prod`, `evening_prod`, `late_night_prod`| Ordinal data (Likert scale)|
|`work_study_hours`, `caffeine_consumption`, `sleep_hours`, `consistent_sleep`, `exercise_freq`, `energy_fluctuations`| Discrete numeric|
|`caffeine_*`, `exercise_*`| Binary discrete variables (One-hot encoded)|


# Analysis

In [None]:
sleep_vars = ['sleep_hours', 'consistent_sleep']
exercise_vars = ['exercise_freq', 'exercise_afternoon', 'exercise_early_morning', 'exercise_evening', 'exercise_morning', 'exercise_never']
caffeine_vars = ['caffeine_consumption', 'caffeine_afternoon', 'caffeine_evening', 'caffeine_late_night', 'caffeine_morning', 'caffeine_never']
productivity_vars_out = ['daily_productivity', 'productivity_consistency']
prod_vars = ['early_morning_prod', 'morning_prod', 'afternoon_prod', 'evening_prod', 'late_night_prod']

In [None]:
df_clean.columns

Index(['age', 'work_study_hours', 'daily_productivity',
       'productivity_consistency', 'early_morning_prod', 'morning_prod',
       'afternoon_prod', 'evening_prod', 'late_night_prod',
       'caffeine_consumption', 'sleep_hours', 'consistent_sleep',
       'exercise_freq', 'energy_fluctuations', 'caffeine_afternoon',
       'caffeine_evening', 'caffeine_late_night', 'caffeine_morning',
       'caffeine_never', 'exercise_afternoon', 'exercise_early_morning',
       'exercise_evening', 'exercise_morning', 'exercise_never'],
      dtype='object')

## Multiple Linear Regression Analysis

In [None]:
numerical_columns = df_clean.select_dtypes(include=['int64']).columns
numerical_columns

Index(['age', 'work_study_hours', 'daily_productivity',
       'productivity_consistency', 'early_morning_prod', 'morning_prod',
       'afternoon_prod', 'evening_prod', 'late_night_prod', 'sleep_hours',
       'consistent_sleep', 'exercise_freq', 'energy_fluctuations',
       'caffeine_afternoon', 'caffeine_evening', 'caffeine_late_night',
       'caffeine_morning', 'caffeine_never', 'exercise_afternoon',
       'exercise_early_morning', 'exercise_evening', 'exercise_morning',
       'exercise_never'],
      dtype='object')

In [None]:
df_numerical = df_clean[numerical_columns]
df_numerical.head()

Unnamed: 0,age,work_study_hours,daily_productivity,productivity_consistency,early_morning_prod,morning_prod,afternoon_prod,evening_prod,late_night_prod,sleep_hours,...,caffeine_afternoon,caffeine_evening,caffeine_late_night,caffeine_morning,caffeine_never,exercise_afternoon,exercise_early_morning,exercise_evening,exercise_morning,exercise_never
0,40,12,4,4,4,4,4,3,2,7,...,0,0,0,1,0,1,0,0,0,0
1,24,1,1,2,1,1,1,1,1,10,...,0,0,0,0,1,0,0,1,0,0
2,23,10,4,4,4,5,2,5,2,7,...,0,1,0,1,0,1,0,0,0,0
3,24,8,4,3,2,3,5,4,5,9,...,0,0,0,0,1,1,0,1,0,0
4,23,8,5,5,5,5,5,5,5,7,...,0,1,0,1,0,0,1,1,1,0


In [None]:
# DV - daily_productivity, and productivity_consistency
X = df_numerical.drop(columns=['daily_productivity', 'productivity_consistency', 'caffeine_afternoon',
       'caffeine_evening', 'caffeine_late_night', 'caffeine_morning',
       'caffeine_never', 'exercise_afternoon', 'exercise_early_morning',
       'exercise_evening', 'exercise_morning', 'exercise_never', 'early_morning_prod', 'morning_prod', 'afternoon_prod', 'evening_prod', 'late_night_prod'])  # Features
y1 = df_numerical['daily_productivity']
y2 = df_numerical['productivity_consistency']

# Add constant for statsmodels
X = sm.add_constant(X)

model1 = sm.OLS(y1, X).fit()
model2 = sm.OLS(y2, X).fit()

summary1 = model1.summary()
summary2 = model2.summary()

# Without Scaling

### y1 column is 'productivity_rating'

In [None]:
print(summary1)

                            OLS Regression Results                            
Dep. Variable:     daily_productivity   R-squared:                       0.597
Model:                            OLS   Adj. R-squared:                  0.531
Method:                 Least Squares   F-statistic:                     9.120
Date:                Sat, 07 Dec 2024   Prob (F-statistic):           3.87e-06
Time:                        04:33:21   Log-Likelihood:                -31.643
No. Observations:                  44   AIC:                             77.29
Df Residuals:                      37   BIC:                             89.77
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   4.1996    

**Key Highlights of the OLS Regression Analysis**

- **Model Summary**: The multiple linear regression model explained approximately 59.7% of the variance in daily productivity (R-squared = 0.597). The adjusted R-squared value of 0.531 indicates that the model accounts for over 53% of the variability in daily productivity after adjusting for the number of predictors. The F-statistic for the model was significant, F(6, 37) = 9.120, p < 0.001, indicating that the model as a whole was statistically significant.

- **Significant Predictors**:
  - **Work/Study Hours**: A significant positive relationship was found between work/study hours and daily productivity (B = 0.075, p = 0.011). This suggests that increased work/study hours are associated with higher productivity.
  - **Sleep Hours**: A negative and significant relationship was observed between sleep hours and daily productivity (B = -0.3088, p = 0.002). This indicates that fewer sleep hours are associated with lower productivity.
  - **Energy Fluctuations**: The variable of energy fluctuations showed a significant positive effect on daily productivity (B = 0.3736, p = 0.013), implying that greater energy stability is linked to higher productivity.

- **Non-Significant Predictors**:
  - **Age**: Age did not significantly predict daily productivity (B = -0.0311, p = 0.131).
  - **Consistent Sleep**: The effect of consistent sleep was not significant (B = 0.1639, p = 0.078).
  - **Exercise Frequency**: The frequency of exercise was also not a significant predictor of daily productivity (B = 0.0175, p = 0.729).

- **Additional Information**:
  - **Diagnostic Statistics**: The Durbin-Watson statistic of 2.352 suggests there is no significant autocorrelation in the residuals. The skewness value of -0.306 and kurtosis of 2.162 indicate that the residuals are approximately normal.
  - **AIC and BIC**: The model's AIC (77.29) and BIC (89.77) were noted, which provide further insight into the model's fit relative to its complexity.

### y2 column is 'productivity_consistency'

In [None]:
print(summary2)

                               OLS Regression Results                               
Dep. Variable:     productivity_consistency   R-squared:                       0.541
Model:                                  OLS   Adj. R-squared:                  0.466
Method:                       Least Squares   F-statistic:                     7.260
Date:                      Sat, 07 Dec 2024   Prob (F-statistic):           3.57e-05
Time:                              04:33:21   Log-Likelihood:                -38.231
No. Observations:                        44   AIC:                             90.46
Df Residuals:                            37   BIC:                             103.0
Df Model:                                 6                                         
Covariance Type:                  nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------

**Key Highlights of the OLS Regression Analysis**

- **Model Overview**: The multiple linear regression model accounted for approximately 54.1% of the variance in productivity consistency (R-squared = 0.541). The adjusted R-squared value of 0.466 suggests that around 46.6% of the variability in productivity consistency is explained by the predictors when adjusting for the number of variables in the model. The F-statistic was significant, F(6, 37) = 7.260, p < 0.001, indicating that the model was a good fit for the data.

- **Significant Predictors**:
  - **Energy Fluctuations**: This variable had a strong positive impact on productivity consistency (B = 0.8288, p < 0.001), suggesting that greater energy stability is associated with higher productivity consistency.

- **Marginally Significant Predictor**:
  - **Exercise Frequency**: The frequency of exercise approached significance with a positive relationship to productivity consistency (B = 0.1109, p = 0.064), indicating a potential link between exercise and productivity consistency, though not statistically significant at the standard 0.05 level.

- **Non-Significant Predictors**:
  - **Age**: Age did not significantly predict productivity consistency (B = -0.0242, p = 0.308).
  - **Work/Study Hours**: The effect of work/study hours was not significant (B = -0.0133, p = 0.684).
  - **Sleep Hours**: Sleep hours did not show a significant relationship with productivity consistency (B = -0.1769, p = 0.112).
  - **Consistent Sleep**: Consistent sleep was not a significant predictor (B = -0.0356, p = 0.737).

- **Additional Information**:
  - **Diagnostic Statistics**: The Durbin-Watson statistic of 2.410 indicates no significant autocorrelation in the residuals. The skewness value of 0.394 and kurtosis of 3.200 suggest that the distribution of residuals is approximately normal.
  - **AIC and BIC**: The AIC value of 90.46 and the BIC value of 103.0 provide further metrics on the model's fit and complexity.

# With Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X.drop('const', axis=1))
X_scaled = sm.add_constant(X_scaled)

### y1 column is 'productivity_rating'

In [None]:
model_scaled_1 = sm.OLS(y1, X_scaled).fit()
print(model_scaled_1.summary())

                            OLS Regression Results                            
Dep. Variable:     daily_productivity   R-squared:                       0.597
Model:                            OLS   Adj. R-squared:                  0.531
Method:                 Least Squares   F-statistic:                     9.120
Date:                Sat, 07 Dec 2024   Prob (F-statistic):           3.87e-06
Time:                        04:33:21   Log-Likelihood:                -31.643
No. Observations:                  44   AIC:                             77.29
Df Residuals:                      37   BIC:                             89.77
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.6677      0.302      8.844      0.0

### y2 column is 'productivity_consistency'

In [None]:
model_scaled_2 = sm.OLS(y2, X_scaled).fit()
print(model_scaled_2.summary())

                               OLS Regression Results                               
Dep. Variable:     productivity_consistency   R-squared:                       0.541
Model:                                  OLS   Adj. R-squared:                  0.466
Method:                       Least Squares   F-statistic:                     7.260
Date:                      Sat, 07 Dec 2024   Prob (F-statistic):           3.57e-05
Time:                              04:33:21   Log-Likelihood:                -38.231
No. Observations:                        44   AIC:                             90.46
Df Residuals:                            37   BIC:                             103.0
Df Model:                                 6                                         
Covariance Type:                  nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

## Correlation Analysis

In [None]:
# X = df_numerical.drop(columns=['daily_productivity', 'productivity_consistency', 'caffeine_afternoon',
#        'caffeine_evening', 'caffeine_late_night', 'caffeine_morning',
#        'caffeine_never', 'exercise_afternoon', 'exercise_early_morning',
#        'exercise_evening', 'exercise_morning', 'exercise_never'])
X_y = df_numerical.drop(columns=['caffeine_afternoon',
       'caffeine_evening', 'caffeine_late_night', 'caffeine_morning',
       'caffeine_never', 'exercise_afternoon', 'exercise_early_morning',
       'exercise_evening', 'exercise_morning', 'exercise_never', 'early_morning_prod', 'morning_prod', 'afternoon_prod', 'evening_prod', 'late_night_prod'])
X_y_scaled = scaler.fit_transform(X_y)

# y = df_numerical[['daily_productivity', 'productivity_consistency']]


# df_scaled = pd.concat([y, pd.DataFrame(X_scaled, columns=X.columns)], axis=1)
# df_scaled

# correlation_matrix = df_scaled.corr().round(3)
correlation_matrix = pd.DataFrame(X_y_scaled, columns=X_y.columns).corr().round(3)
correlation_matrix

Unnamed: 0,age,work_study_hours,daily_productivity,productivity_consistency,sleep_hours,consistent_sleep,exercise_freq,energy_fluctuations
age,1.0,0.401,0.096,0.015,-0.175,0.075,0.137,0.151
work_study_hours,0.401,1.0,0.529,0.233,-0.317,0.106,0.27,0.33
daily_productivity,0.096,0.529,1.0,0.574,-0.436,0.375,0.281,0.543
productivity_consistency,0.015,0.233,0.574,1.0,-0.193,0.311,0.361,0.674
sleep_hours,-0.175,-0.317,-0.436,-0.193,1.0,0.107,-0.02,-0.045
consistent_sleep,0.075,0.106,0.375,0.311,0.107,1.0,0.361,0.463
exercise_freq,0.137,0.27,0.281,0.361,-0.02,0.361,1.0,0.256
energy_fluctuations,0.151,0.33,0.543,0.674,-0.045,0.463,0.256,1.0


In [None]:
import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns.tolist(),
    y=correlation_matrix.index.tolist(),
    colorscale='Teal',
    showscale=True
)

fig.update_layout(
    title='Correlation Matrix of Productivity Factors',
    margin=dict(t=200),  #t-top
    # width=1500,
    # height=1000
)

fig.update_traces(showscale=True, colorbar=dict(thickness=20, len=0.8))
fig.update_layout(template="plotly_dark")
fig.show()

The provided correlation matrix shows the pairwise correlations between different variables in your dataset. Here's a summary of key findings:

### Key Highlights:
- **Strong Positive Correlations**:
  - **Daily Productivity and Productivity Consistency**: There is a moderate to strong positive correlation (r = 0.574), indicating that as daily productivity increases, productivity consistency tends to increase as well.
  - **Energy Fluctuations and Productivity Consistency**: A strong positive correlation (r = 0.674) suggests that higher energy fluctuations are associated with greater productivity consistency.
  - **Energy Fluctuations and Daily Productivity**: A moderate correlation (r = 0.543) indicates that higher energy fluctuations are related to higher daily productivity.
  - **Work/Study Hours and Daily Productivity**: A positive correlation (r = 0.529) suggests that more work/study hours are associated with higher daily productivity.

- **Moderate to Weak Correlations**:
  - **Exercise Frequency and Productivity Consistency**: A moderate positive correlation (r = 0.361) indicates a relationship between exercise frequency and productivity consistency.
  - **Consistent Sleep and Daily Productivity**: A positive correlation (r = 0.375) implies that consistent sleep is associated with higher daily productivity.
  - **Consistent Sleep and Productivity Consistency**: A positive correlation (r = 0.311) shows a relationship between consistent sleep and productivity consistency.
  - **Age and Work/Study Hours**: A positive correlation (r = 0.401) indicates that older participants may tend to have more work/study hours.

- **Weak or Non-Significant Correlations**:
  - **Age and Daily Productivity**: A very weak positive correlation (r = 0.096) suggests little to no association between age and daily productivity.
  - **Sleep Hours and Productivity Variables**: Sleep hours generally show weak or negative correlations with daily productivity and productivity consistency, with the strongest being r = -0.436 for daily productivity.
  - **Exercise Frequency and Other Variables**: The correlation between exercise frequency and most other variables is relatively low, with the highest being r = 0.361 with consistent sleep.

### Interpretation:
- Variables like **energy fluctuations** have significant relationships with both daily productivity and productivity consistency, indicating that maintaining stable energy levels might be beneficial for consistent productivity.
- **Daily productivity** and **productivity consistency** show a moderate to strong relationship, which highlights the importance of maintaining a steady output.
- **Sleep hours** appear to have a negative correlation with productivity measures, suggesting that more sleep might not always correlate positively with higher productivity or consistency.
- **Exercise frequency** shows a weaker relationship overall with productivity measures but still holds some connection to consistent sleep.

This correlation matrix helps inform further analysis, such as regression or more complex modeling, to explore causal relationships and predict outcomes based on these variables.

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Create a pairplot
# sns.pairplot(pd.DataFrame(X_y_scaled, columns=X_y.columns), kind='reg', corner=True)

# # Show the plot
# plt.show()


## 1-way ANOVA

In [None]:
# Conduct one-way ANOVA for all caffeine-related dummy variables against daily_productivity
for col in df_clean.filter(like='caffeine_').columns:
    f_val, p_val = stats.f_oneway(df_clean[df_clean[col] == 1]['daily_productivity'],
                                  df_clean[df_clean[col] == 0]['daily_productivity'])
    print(f"One-way ANOVA for {col} vs daily_productivity: F={f_val:.2f}, p={p_val:.4f}")

# Conduct one-way ANOVA for all exercise-related dummy variables against daily_productivity
for col in df_clean.filter(like='exercise_').columns:
    f_val, p_val = stats.f_oneway(df_clean[df_clean[col] == 1]['daily_productivity'],
                                  df_clean[df_clean[col] == 0]['daily_productivity'])
    print(f"One-way ANOVA for {col} vs daily_productivity: F={f_val:.2f}, p={p_val:.4f}")



print("\n", "#"*90, "\n")

# Conduct one-way ANOVA for all caffeine-related dummy variables against productivity_consistency
for col in df_clean.filter(like='caffeine_').columns:
    f_val, p_val = stats.f_oneway(df_clean[df_clean[col] == 1]['productivity_consistency'],
                                  df_clean[df_clean[col] == 0]['productivity_consistency'])
    print(f"One-way ANOVA for {col} vs productivity_consistency: F={f_val:.2f}, p={p_val:.4f}")

# Conduct one-way ANOVA for all exercise-related dummy variables against productivity_consistency
for col in df_clean.filter(like='exercise_').columns:
    f_val, p_val = stats.f_oneway(df_clean[df_clean[col] == 1]['productivity_consistency'],
                                  df_clean[df_clean[col] == 0]['productivity_consistency'])
    print(f"One-way ANOVA for {col} vs productivity_consistency: F={f_val:.2f}, p={p_val:.4f}")



# # Initialize lists to store results
# results = []

# # Conduct one-way ANOVA for all caffeine-related dummy variables against daily_productivity
# for col in df_clean.filter(like='caffeine_').columns:
#     f_val, p_val = stats.f_oneway(df_clean[df_clean[col] == 1]['daily_productivity'],
#                                   df_clean[df_clean[col] == 0]['daily_productivity'])
#     results.append({'Variable': col, 'Dependent Variable': 'daily_productivity', 'F-value': f_val, 'p-value': p_val})

# # Conduct one-way ANOVA for all caffeine-related dummy variables against productivity_consistency
# for col in df_clean.filter(like='caffeine_').columns:
#     f_val, p_val = stats.f_oneway(df_clean[df_clean[col] == 1]['productivity_consistency'],
#                                   df_clean[df_clean[col] == 0]['productivity_consistency'])
#     results.append({'Variable': col, 'Dependent Variable': 'productivity_consistency', 'F-value': f_val, 'p-value': p_val})

# # Conduct one-way ANOVA for all exercise-related dummy variables against daily_productivity
# for col in df_clean.filter(like='exercise_').columns:
#     f_val, p_val = stats.f_oneway(df_clean[df_clean[col] == 1]['daily_productivity'],
#                                   df_clean[df_clean[col] == 0]['daily_productivity'])
#     results.append({'Variable': col, 'Dependent Variable': 'daily_productivity', 'F-value': f_val, 'p-value': p_val})

# # Conduct one-way ANOVA for all exercise-related dummy variables against productivity_consistency
# for col in df_clean.filter(like='exercise_').columns:
#     f_val, p_val = stats.f_oneway(df_clean[df_clean[col] == 1]['productivity_consistency'],
#                                   df_clean[df_clean[col] == 0]['productivity_consistency'])
#     results.append({'Variable': col, 'Dependent Variable': 'productivity_consistency', 'F-value': f_val, 'p-value': p_val})

# # Convert results to a DataFrame and display
# results_df = pd.DataFrame(results)
# print("\nOne-way ANOVA Results:")
# results_df

One-way ANOVA for caffeine_consumption vs daily_productivity: F=0.88, p=0.3554
One-way ANOVA for caffeine_afternoon vs daily_productivity: F=1.90, p=0.1752
One-way ANOVA for caffeine_evening vs daily_productivity: F=4.94, p=0.0318
One-way ANOVA for caffeine_late_night vs daily_productivity: F=1.06, p=0.3081
One-way ANOVA for caffeine_morning vs daily_productivity: F=1.32, p=0.2576
One-way ANOVA for caffeine_never vs daily_productivity: F=0.81, p=0.3741
One-way ANOVA for exercise_freq vs daily_productivity: F=6.89, p=0.0393
One-way ANOVA for exercise_afternoon vs daily_productivity: F=0.49, p=0.4862
One-way ANOVA for exercise_early_morning vs daily_productivity: F=0.45, p=0.5069
One-way ANOVA for exercise_evening vs daily_productivity: F=0.12, p=0.7323
One-way ANOVA for exercise_morning vs daily_productivity: F=0.77, p=0.3867
One-way ANOVA for exercise_never vs daily_productivity: F=0.58, p=0.4512

 ########################################################################################

In [None]:
# Bin consistent_sleep into 3 categories
df_clean['consistent_sleep_bins'] = pd.cut(df_clean['consistent_sleep'], bins=[0, 2, 4, 5], labels=['1-2', '3-4', '5'])

# Re-check distributions
print("Updated Distribution of consistent_sleep_bins:")
print(df_clean['consistent_sleep_bins'].value_counts(dropna=False))

# Conduct one-way ANOVA for consistent_sleep_bins vs daily_productivity
f_val, p_val = stats.f_oneway(*[group['daily_productivity'] for name, group in df_clean.groupby('consistent_sleep_bins')])
print(f"One-way ANOVA for consistent_sleep_bins vs daily_productivity: F={f_val:.2f}, p={p_val:.4f}")

# Conduct one-way ANOVA for consistent_sleep_bins vs productivity_consistency
f_val, p_val = stats.f_oneway(*[group['productivity_consistency'] for name, group in df_clean.groupby('consistent_sleep_bins')])
print(f"One-way ANOVA for consistent_sleep_bins vs productivity_consistency: F={f_val:.2f}, p={p_val:.4f}")

# # Bin sleep_hours into 4 categories
# df_clean['sleep_hours_bins'] = pd.cut(df_clean['sleep_hours'], bins=[0, 4, 8, 12, 16], labels=['0-4', '5-8', '9-12', '13-16'])

# # Conduct one-way ANOVA for sleep_hours_bins vs daily_productivity
# f_val, p_val = stats.f_oneway(*[group['daily_productivity'] for name, group in df_clean.groupby('sleep_hours_bins')])
# print(f"One-way ANOVA for sleep_hours_bins vs daily_productivity: F={f_val:.2f}, p={p_val:.4f}")

# # Conduct one-way ANOVA for sleep_hours_bins vs productivity_consistency
# f_val, p_val = stats.f_oneway(*[group['productivity_consistency'] for name, group in df_clean.groupby('sleep_hours_bins')])
# print(f"One-way ANOVA for sleep_hours_bins vs productivity_consistency: F={f_val:.2f}, p={p_val:.4f}")

# Merge bins to avoid empty groups
df_clean['sleep_hours_bins'] = pd.cut(df_clean['sleep_hours'], bins=[0, 6, 8, 16], labels=['0-6', '6-9', '10-16'])

# Re-check distributions
print("Updated Distribution of sleep_hours_bins:")
print(df_clean['sleep_hours_bins'].value_counts(dropna=False))

# Conduct one-way ANOVA for updated bins
try:
    f_val, p_val = stats.f_oneway(*[group['daily_productivity'] for _, group in df_clean.groupby('sleep_hours_bins', observed=False)])
    print(f"One-way ANOVA for updated sleep_hours_bins vs daily_productivity: F={f_val:.2f}, p={p_val:.4f}")
except ValueError as e:
    print("Error with updated sleep_hours_bins vs daily_productivity:", e)

try:
    f_val, p_val = stats.f_oneway(*[group['productivity_consistency'] for _, group in df_clean.groupby('sleep_hours_bins', observed=False)])
    print(f"One-way ANOVA for updated sleep_hours_bins vs productivity_consistency: F={f_val:.2f}, p={p_val:.4f}")
except ValueError as e:
    print("Error with updated sleep_hours_bins vs productivity_consistency:", e)



Updated Distribution of consistent_sleep_bins:
consistent_sleep_bins
3-4    28
1-2    13
5       3
Name: count, dtype: int64
One-way ANOVA for consistent_sleep_bins vs daily_productivity: F=2.23, p=0.1203
One-way ANOVA for consistent_sleep_bins vs productivity_consistency: F=1.98, p=0.1514
Updated Distribution of sleep_hours_bins:
sleep_hours_bins
6-9      31
0-6       9
10-16     4
Name: count, dtype: int64
One-way ANOVA for updated sleep_hours_bins vs daily_productivity: F=4.24, p=0.0212
One-way ANOVA for updated sleep_hours_bins vs productivity_consistency: F=0.49, p=0.6171








### 1. **One-way ANOVA for Daily Productivity**
- **Caffeine Consumption**:
  - **F = 0.88**, **p = 0.3554**: No significant relationship between overall caffeine consumption and daily productivity (p > 0.05).
- **Caffeine by Time of Day**:
  - **Afternoon**: **F = 1.90**, **p = 0.1752**: No significant effect on productivity.
  - **Evening**: **F = 4.94**, **p = 0.0318**: Significant relationship; caffeine consumption in the evening appears to influence daily productivity (p < 0.05).
  - **Late Night**: **F = 1.06**, **p = 0.3081**: No significant effect.
  - **Morning**: **F = 1.32**, **p = 0.2576**: No significant effect.
  - **Never**: **F = 0.81**, **p = 0.3741**: No significant effect.

- **Exercise Frequency**:
  - **F = 6.89**, **p = 0.0393**: Significant effect; exercise frequency has a notable impact on daily productivity (p < 0.05).
- **Exercise by Time of Day**:
  - **Afternoon**: **F = 0.49**, **p = 0.4862**: No significant effect.
  - **Early Morning**: **F = 0.45**, **p = 0.5069**: No significant effect.
  - **Evening**: **F = 0.12**, **p = 0.7323**: No significant effect.
  - **Morning**: **F = 0.77**, **p = 0.3867**: No significant effect.
  - **Never**: **F = 0.58**, **p = 0.4512**: No significant effect.

### 2. **One-way ANOVA for Productivity Consistency**
- **Caffeine Consumption**:
  - **F = 0.31**, **p = 0.5813**: No significant relationship between overall caffeine consumption and productivity consistency.
- **Caffeine by Time of Day**:
  - **Afternoon**: **F = 2.46**, **p = 0.1244**: No significant effect.
  - **Evening**: **F = 0.00**, **p = 0.9795**: No significant effect.
  - **Late Night**: **F = 0.13**, **p = 0.7211**: No significant effect.
  - **Morning**: **F = 0.27**, **p = 0.6053**: No significant effect.
  - **Never**: **F = 0.79**, **p = 0.3779**: No significant effect.

- **Exercise Frequency**:
  - **F = 0.47**, **p = 0.5191**: No significant effect.
- **Exercise by Time of Day**:
  - **Afternoon**: **F = 2.50**, **p = 0.1215**: No significant effect.
  - **Early Morning**: **F = 1.07**, **p = 0.3078**: No significant effect.
  - **Evening**: **F = 0.08**, **p = 0.7774**: No significant effect.
  - **Morning**: **F = 0.35**, **p = 0.5574**: No significant effect.
  - **Never**: **F = 0.98**, **p = 0.3278**: No significant effect.

### **Summary of Findings**
- **Daily Productivity**:
  - Evening caffeine consumption and exercise frequency are significant predictors. Increased caffeine intake in the evening and more frequent exercise seem to contribute to higher daily productivity.
- **Productivity Consistency**:
  - None of the tested variables (caffeine or exercise frequency) significantly impact productivity consistency. This suggests that while daily productivity can be influenced by caffeine and exercise, these factors do not seem to consistently affect productivity over time.

### **Conclusions**
- If you are aiming to improve **daily productivity**, focusing on **evening caffeine consumption** and maintaining an **exercise routine** are key factors.
- For **productivity consistency**, other variables not tested here might play a more substantial role, as no significant impacts were found in this analysis.

### **Analysis of Updated Results**

#### **Consistent Sleep Bins**
1. **Distribution**:  
   - **3-4 hours**: 28 participants  
   - **1-2 hours**: 13 participants  
   - **5 hours**: 3 participants  

   - **ANOVA Results**:
     - **Daily Productivity**: \( F = 2.23, p = 0.1203 \)  
     - **Productivity Consistency**: \( F = 1.98, p = 0.1514 \)  

   **Interpretation**:  
   The ANOVA for consistent sleep bins is not statistically significant for either dependent variable (\( p > 0.05 \)), indicating no strong evidence that sleep consistency (as categorized) has a substantial impact on daily productivity or productivity consistency.

#### **Sleep Hours Bins**
2. **Distribution**:  
   - **6-9 hours**: 31 participants  
   - **0-6 hours**: 9 participants  
   - **10-16 hours**: 4 participants  

   - **ANOVA Results**:
     - **Daily Productivity**: \( F = 4.24, p = 0.0212 \)  
     - **Productivity Consistency**: \( F = 0.49, p = 0.6171 \)  

   **Interpretation**:  
   - The effect of sleep hours is **statistically significant** on **daily productivity** (\( p < 0.05 \)), suggesting that productivity levels differ across sleep bins.  
   - However, there is **no significant effect** of sleep hours on **productivity consistency** (\( p > 0.05 \)).

---

### **Key Insights**
1. **Daily Productivity**:
   - Sleep hours (categorized into bins) significantly influence daily productivity. Participants sleeping 6-9 hours are likely to have higher productivity compared to those in other bins.
   - Consistent sleep hours (as categorized) do not appear to have a significant effect.

2. **Productivity Consistency**:
   - Neither sleep hours nor consistent sleep bins significantly affect productivity consistency in this analysis.

## 2-way ANOVA

In [None]:
df_clean.columns

Index(['age', 'work_study_hours', 'daily_productivity',
       'productivity_consistency', 'early_morning_prod', 'morning_prod',
       'afternoon_prod', 'evening_prod', 'late_night_prod',
       'caffeine_consumption', 'sleep_hours', 'consistent_sleep',
       'exercise_freq', 'energy_fluctuations', 'caffeine_afternoon',
       'caffeine_evening', 'caffeine_late_night', 'caffeine_morning',
       'caffeine_never', 'exercise_afternoon', 'exercise_early_morning',
       'exercise_evening', 'exercise_morning', 'exercise_never',
       'daily_productivity_cat', 'productivity_consistency_cat',
       'sleep_hours_bins', 'consistent_sleep_bins'],
      dtype='object')

In [None]:
# Ensure that categorical columns are properly encoded
caffeine_vars = df_clean.filter(like='caffeine_').columns
exercise_vars = df_clean.filter(like='exercise_').columns

# Convert caffeine and exercise columns to categorical data types if not already
df_clean[caffeine_vars] = df_clean[caffeine_vars].astype('category')
df_clean[exercise_vars] = df_clean[exercise_vars].astype('category')

# Two-way ANOVA for daily_productivity
formula_daily_productivity = 'daily_productivity ~ C(sleep_hours) + C(consistent_sleep) + C(caffeine_consumption)\
 + C(sleep_hours):C(consistent_sleep)\
 + C(caffeine_afternoon) + C(caffeine_evening) + C(caffeine_late_night) + C(caffeine_morning) + C(caffeine_never) \
 + C(exercise_afternoon) + C(exercise_early_morning) + C(exercise_evening) + C(exercise_morning) + C(exercise_never) +\
  C(caffeine_afternoon):C(exercise_afternoon) + C(caffeine_evening):C(exercise_evening) + C(caffeine_morning):C(exercise_morning) + C(caffeine_never):C(exercise_never)'
model_daily_productivity = ols(formula_daily_productivity, data=df_clean).fit()
anova_table_daily_productivity = sm.stats.anova_lm(model_daily_productivity, type=2)
print("\nTwo-way ANOVA for caffeine and exercise on daily_productivity:")
print(anova_table_daily_productivity)

# Two-way ANOVA for productivity_consistency
formula_productivity_consistency = 'productivity_consistency ~ C(sleep_hours) + C(consistent_sleep) + C(caffeine_consumption) \
+ C(sleep_hours):C(consistent_sleep)\
+ C(caffeine_afternoon) + C(caffeine_evening) + C(caffeine_late_night) + C(caffeine_morning) + C(caffeine_never) \
+ C(exercise_afternoon) + C(exercise_early_morning) + C(exercise_evening) + C(exercise_morning) + C(exercise_never) \
+ C(caffeine_afternoon):C(exercise_afternoon) + C(caffeine_evening):C(exercise_evening) + C(caffeine_morning):C(exercise_morning) + C(caffeine_never):C(exercise_never)'
model_productivity_consistency = ols(formula_productivity_consistency, data=df_clean).fit()
anova_table_productivity_consistency = sm.stats.anova_lm(model_productivity_consistency, type=2)
print("\nTwo-way ANOVA for caffeine and exercise on productivity_consistency:")
print(anova_table_productivity_consistency)


Two-way ANOVA for caffeine and exercise on daily_productivity:
                                               df    sum_sq   mean_sq  \
C(sleep_hours)                                4.0  7.858081  1.964520   
C(consistent_sleep)                           4.0  3.150347  0.787587   
C(caffeine_consumption)                       5.0  2.690913  0.538183   
C(caffeine_afternoon)                         1.0  0.184462  0.184462   
C(caffeine_evening)                           1.0  3.125912  3.125912   
C(caffeine_late_night)                        1.0  0.041441  0.041441   
C(caffeine_morning)                           1.0  0.082426  0.082426   
C(caffeine_never)                             1.0  0.001945  0.001945   
C(exercise_afternoon)                         1.0  0.077613  0.077613   
C(exercise_early_morning)                     1.0  0.037871  0.037871   
C(exercise_evening)                           1.0  0.378057  0.378057   
C(exercise_morning)                           1.0  0.205710 

### For **Daily Productivity**:
- **Main Effects**:
  - **Caffeine**:
    - **Evening caffeine** has a significant effect (F = 4.59, p = 0.040) on daily productivity, suggesting that caffeine intake in the evening is associated with changes in productivity.
    - Other caffeine types (e.g., morning, afternoon, late-night, and never) did not show significant effects (p > 0.05).
  - **Exercise**:
    - **Exercise frequency** (e.g., afternoon, early morning, evening, morning, and never) did not significantly impact daily productivity (all p-values > 0.05).

- **Interaction Effects**:
  - None of the interaction terms between caffeine and exercise showed significant effects on daily productivity (all p-values > 0.05). For instance, the interaction between caffeine intake in the afternoon and exercise in the afternoon (F = 1.78, p = 0.192) was not significant.

### For **Productivity Consistency**:
- **Main Effects**:
  - **Caffeine**:
    - **Caffeine intake in the afternoon** showed a trend toward significance (F = 2.19, p = 0.149), but it did not reach the standard level of significance (p < 0.05). Other types of caffeine (e.g., evening, late-night, morning, and never) did not significantly affect productivity consistency (p > 0.05).
  - **Exercise**:
    - **Exercise in the afternoon** showed a trend toward significance (F = 1.81, p = 0.188), suggesting a potential effect on productivity consistency. Other exercise times (early morning, evening, morning, and never) did not show significant effects (p > 0.05).

- **Interaction Effects**:
  - The interaction terms for caffeine and exercise (e.g., afternoon, evening, morning, and never) were not significant (all p-values > 0.05). This suggests that the combination of specific times for caffeine intake and exercise does not affect productivity consistency.

### General Interpretation:
- **Caffeine** appears to play a more prominent role in impacting **daily productivity**, particularly in the evening, while the effect on **productivity consistency** is not strong or significant.
- **Exercise** did not show significant effects on either **daily productivity** or **productivity consistency**.
- There are no significant **interaction effects** between caffeine and exercise, indicating that the combination of these factors does not contribute to productivity outcomes in a way that differs from their individual effects.

### Conclusion:
The analysis suggests that while **evening caffeine intake** has a significant effect on **daily productivity**, the impact of **exercise** on both productivity metrics is not statistically significant. The combination of caffeine and exercise does not have a noteworthy interactive effect on productivity outcomes.

## Chi-square contingency

In [None]:
# List of caffeine and exercise variables
caffeine_vars = df_clean.filter(like='caffeine_').columns
exercise_vars = df_clean.filter(like='exercise_').columns

# DataFrame to store results
results = []

# Function to run chi-square tests and store results
def chi_square_test(data, var1, var2):
    # Create a contingency table
    contingency_table = pd.crosstab(data[var1], data[var2])
    # Run chi-square test
    chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)
    return chi2_stat, p_value, dof, expected

# Run chi-square tests for all combinations and store in the results DataFrame
for col1 in caffeine_vars:
    for col2 in exercise_vars:
        chi2_stat, p_value, dof, expected = chi_square_test(df_clean, col1, col2)
        results.append({
            'Caffeine Variable': col1,
            'Exercise Variable': col2,
            'Chi2 Stat': round(chi2_stat, 2),
            'p-value': round(p_value, 4),
            'Degrees of Freedom': dof
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results).drop_duplicates().reset_index(drop=True)

# Display the results as a table
print("Chi-Square Test Results:")
results_df

Chi-Square Test Results:


Unnamed: 0,Caffeine Variable,Exercise Variable,Chi2 Stat,p-value,Degrees of Freedom
0,caffeine_consumption,exercise_freq,22.81,0.9439,35
1,caffeine_consumption,exercise_afternoon,12.72,0.0261,5
2,caffeine_consumption,exercise_early_morning,1.71,0.8871,5
3,caffeine_consumption,exercise_evening,5.34,0.3758,5
4,caffeine_consumption,exercise_morning,2.39,0.793,5
5,caffeine_consumption,exercise_never,0.91,0.9697,5
6,caffeine_afternoon,exercise_freq,6.52,0.4804,7
7,caffeine_afternoon,exercise_afternoon,0.06,0.8013,1
8,caffeine_afternoon,exercise_early_morning,0.0,1.0,1
9,caffeine_afternoon,exercise_evening,0.05,0.8284,1


In [None]:
# Assuming daily_productivity and productivity_consistency are categorized as 'low', 'medium', 'high'

# Convert daily_productivity and productivity_consistency to categorical variables (optional)
df_clean['daily_productivity_cat'] = pd.qcut(df_clean['daily_productivity'], q=3, labels=['low', 'medium', 'high'], duplicates='drop')
df_clean['productivity_consistency_cat'] = pd.cut(df_clean['productivity_consistency'], bins=3, labels=['low', 'medium', 'high'])

# List of variables to test against
productivity_vars = ['daily_productivity_cat', 'productivity_consistency_cat']

# Run chi-square tests for each productivity variable with caffeine and exercise variables
results = []

for prod_var in productivity_vars:
    for col1 in caffeine_vars:
        for col2 in exercise_vars:
            chi2_stat, p_value, dof, expected = chi_square_test(df_clean, prod_var, col1)
            results.append({
                'Productivity Variable': prod_var,
                'Variable': col1,
                'Chi2 Stat': round(chi2_stat, 2),
                'p-value': round(p_value, 4),
                'Degrees of Freedom': dof
            })
            chi2_stat, p_value, dof, expected = chi_square_test(df_clean, prod_var, col2)
            results.append({
                'Productivity Variable': prod_var,
                'Variable': col2,
                'Chi2 Stat': round(chi2_stat, 2),
                'p-value': round(p_value, 4),
                'Degrees of Freedom': dof
            })

# Convert results to a DataFrame
results_df = pd.DataFrame(results).drop_duplicates().reset_index(drop=True)

# Display the results
print("Chi-Square Test Results Related to Productivity Measures:")
results_df

Chi-Square Test Results Related to Productivity Measures:


Unnamed: 0,Productivity Variable,Variable,Chi2 Stat,p-value,Degrees of Freedom
0,daily_productivity_cat,caffeine_consumption,9.38,0.4962,10
1,daily_productivity_cat,exercise_freq,27.86,0.0149,14
2,daily_productivity_cat,exercise_afternoon,1.14,0.5657,2
3,daily_productivity_cat,exercise_early_morning,1.59,0.4525,2
4,daily_productivity_cat,exercise_evening,0.88,0.6435,2
5,daily_productivity_cat,exercise_morning,4.16,0.1251,2
6,daily_productivity_cat,exercise_never,0.51,0.7769,2
7,daily_productivity_cat,caffeine_afternoon,6.18,0.0455,2
8,daily_productivity_cat,caffeine_evening,4.54,0.1034,2
9,daily_productivity_cat,caffeine_late_night,0.53,0.7679,2


**Key Highlights of the Chi-Square Test Results**

- **Significant Associations**: The analysis revealed that exercise frequency was significantly associated with daily productivity categories, χ²(14) = 27.86, p = 0.0149. Additionally, caffeine consumption in the afternoon was significantly associated with daily productivity, χ²(2) = 6.18, p = 0.0455. An association was also found between afternoon exercise and productivity consistency, χ²(2) = 6.54, p = 0.0381.

- **Non-Significant Associations**: The relationship between caffeine consumption (general) and daily productivity was not significant, χ²(10) = 9.38, p = 0.4962. Similarly, variables such as early morning, evening, and morning exercise did not show significant associations with daily productivity or productivity consistency (p-values ranging from 0.1251 to 0.8861). Caffeine consumption at times other than afternoon also did not show significant associations with productivity measures (p-values from 0.1034 to 0.8089).

- **Interpretation**: The significant associations suggest that specific factors like exercise frequency and afternoon caffeine consumption may influence productivity measures and should be taken into account in related studies or productivity interventions. Non-significant results indicate that other factors, such as general caffeine consumption or exercise at different times, may not be as impactful or may require further exploration with larger sample sizes or additional methods.

# Overall Summary
1. **Regression Analysis**: The OLS regression for daily productivity showed a significant model (R² = 0.597, p < 0.001), indicating that factors like work/study hours and sleep hours significantly impact productivity.

2. **Regression Results for Productivity Consistency**: The OLS regression for productivity consistency had an R² of 0.541 (p < 0.001), highlighting energy fluctuations as a significant positive predictor.

3. **Chi-Square Analysis**: Chi-square tests revealed significant associations between exercise frequency and daily productivity (p = 0.0149), suggesting that higher exercise frequency correlates with increased productivity.

4. **Chi-Square Test for Caffeine Consumption**: The test for caffeine consumption and daily productivity was non-significant (p = 0.4962), implying no strong association between caffeine consumption and daily productivity.

5. **Energy Fluctuations**: The analysis indicated that energy fluctuations were positively correlated with both daily productivity (r = 0.543) and productivity consistency (r = 0.674), suggesting their critical role in productivity outcomes.

6. **Sleep Variables**: Sleep hours showed a negative correlation with daily productivity (r = -0.436), highlighting the potential impact of sleep quality on productivity levels.

7. **Exercise Frequency**: While exercise frequency correlated positively with productivity consistency (r = 0.361), it did not emerge as a significant predictor in regression models for daily productivity or productivity consistency.

8. **Age Factor**: Age demonstrated weak correlations with productivity variables, indicating that age might have a minimal direct influence on productivity and consistency (r < 0.2).

9. **Consistent Sleep**: The consistency of sleep was associated with higher daily productivity (r = 0.375) and productivity consistency (r = 0.311), suggesting its importance in sustaining productivity levels.

10. **Significant Predictors**: Work/study hours and energy fluctuations were identified as significant predictors in both regression models, emphasizing the importance of managing work hours and energy for enhancing productivity and consistency.

# Power Analysis

In [None]:
from statsmodels.stats.power import FTestPower

# Inputs
effect_size = 1.482  # From f^2 calculation
alpha = 0.05
sample_size = 37
num_predictors = 6  # Number of predictors

# Power calculation
power_analysis = FTestPower()
power = power_analysis.solve_power(effect_size=effect_size, df_num=num_predictors, df_denom=sample_size - num_predictors - 1, alpha=alpha)
print(f"Power for regression model: {power:.2f}")


Power for regression model: 0.55


# THE END