In [None]:
import pandas as pd
from datetime import datetime
import numpy as np


In [None]:
def convert_date(date_str):
    """Reformat a ``'%a %b %d %H:%M:%S %z %Y'`` timestamp as ``'%Y-%m-%d %H:%M:%S'``.

    The argument is passed through ``str()`` before parsing, so non-string
    inputs are accepted. The UTC offset is parsed but is not part of the
    output string; the wall-clock fields are emitted unchanged.

    Returns:
        The reformatted string, or ``np.nan`` when parsing or formatting
        raises ``ValueError`` or ``TypeError``.
    """
    source_format = '%a %b %d %H:%M:%S %z %Y'
    target_format = '%Y-%m-%d %H:%M:%S'
    try:
        return datetime.strptime(str(date_str), source_format).strftime(target_format)
    except (ValueError, TypeError):
        return np.nan

def convert_date_df(df):
    """Derive ``date``, ``year`` and ``month`` columns from ``created_at``.

    Each ``created_at`` value is run through ``convert_date``; rows for which
    it returns NaN are dropped. The surviving ``date`` values are converted
    with ``pd.to_datetime`` and their year and month are exposed as separate
    columns.

    The input frame is not modified: all work happens on a new frame, which
    also avoids pandas' SettingWithCopyWarning when assigning columns onto
    the filtered result.

    Args:
        df: DataFrame with a ``created_at`` column.

    Returns:
        A new DataFrame with the unparseable rows removed and the ``date``,
        ``year`` and ``month`` columns set.
    """
    parsed = df.assign(date=df['created_at'].apply(convert_date))
    # convert_date signals an unparseable timestamp with NaN; discard those rows.
    parsed = parsed.dropna(subset=['date']).copy()
    parsed['date'] = pd.to_datetime(parsed['date'])

    # Create 'year' and 'month' columns
    parsed['year'] = parsed['date'].dt.year
    parsed['month'] = parsed['date'].dt.month
    return parsed


def filter_emotion(df):
    """Keep ``screen_name``, the six ``*_pys`` emotion columns and ``date``.

    The selection is copied before ``date`` is converted with
    ``pd.to_datetime``, so the assignment targets an independent frame
    (no SettingWithCopyWarning) and the caller's frame is left untouched.

    Args:
        df: DataFrame containing at least the eight selected columns.

    Returns:
        A new DataFrame with exactly those columns, ``date`` as datetime.
    """
    wanted = ['screen_name', 'joy_pys', 'sadness_pys', 'anger_pys',
              'surprise_pys', 'disgust_pys', 'fear_pys', 'date']
    selected = df[wanted].copy()
    selected['date'] = pd.to_datetime(selected['date'])
    return selected

def filter_emotion_all(df):
    """Keep ``screen_name``, the six un-suffixed emotion columns and ``date``.

    The selection is copied before ``date`` is converted with
    ``pd.to_datetime``, so the assignment targets an independent frame
    (no SettingWithCopyWarning) and the caller's frame is left untouched.

    Args:
        df: DataFrame containing at least the eight selected columns.

    Returns:
        A new DataFrame with exactly those columns, ``date`` as datetime.
    """
    wanted = ['screen_name', 'joy', 'sadness', 'anger',
              'surprise', 'disgust', 'fear', 'date']
    selected = df[wanted].copy()
    selected['date'] = pd.to_datetime(selected['date'])
    return selected
def avg_emotion_per_user(df):
    """Average the value columns per month, giving every user equal weight.

    Two-stage mean:
      1. for each ``screen_name``, resample to month-end bins and take the
         mean of that user's rows in each bin;
      2. for each month, take the mean of those per-user monthly means.

    The frequency is monthly (not weekly). The input frame is not modified:
    the datetime conversion and the re-indexing happen on a new frame, so the
    function can be called repeatedly on the same ``df``.

    Args:
        df: DataFrame with ``screen_name`` and ``date`` columns; the remaining
            columns are assumed to be numeric (as produced by the
            ``filter_emotion*`` helpers) - TODO confirm for other callers.

    Returns:
        DataFrame indexed by month-end ``date`` holding the cross-user mean.
    """
    indexed = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
    per_user = indexed.groupby('screen_name')

    # Newer pandas spells month-end 'ME' and deprecates 'M'; older releases
    # only know 'M' and reject 'ME' with a ValueError when the alias is parsed.
    try:
        monthly_resampler = per_user.resample('ME')
    except ValueError:
        monthly_resampler = per_user.resample('M')
    monthly_avg_per_user = monthly_resampler.mean()

    # Collapse the (screen_name, date) index to date by averaging over users.
    overall_monthly_avg = monthly_avg_per_user.groupby(level='date').mean()

    return overall_monthly_avg


def replace_column_emotion(df):
    """Remove the literal text ``_pys`` from every column label of ``df``.

    The labels are replaced on the frame that was passed in, and that same
    frame object is returned.
    """
    stripped_labels = df.columns.str.replace('_pys', '', regex=False)
    df.columns = stripped_labels
    return df


# Save Data

In [None]:
# Persist both summary tables as pickle files inside the Cohen_d/ directory.
# NOTE(review): neither frame is defined in any cell visible above this one,
# and the following cell reloads them from differently named CSV files -
# confirm the intended cell order before relying on Restart & Run All.
hcp_stats.to_pickle(path='Cohen_d/hcp_stats_cohend.pkl')

general_stats.to_pickle(path='Cohen_d/nonhcp_cohend.pkl')


# Load Data

In [None]:

def _restore_interval_columns(stats_df):
    """Turn text-encoded interval cells back into tuples of floats.

    CSV has no tuple type, so ``(lower, upper)`` cells come back from
    ``pd.read_csv`` as strings such as ``"(0.12, 0.34)"``. The cells that
    follow index these values (``row['joy_95ci'][0]``) and unpack them into
    two bounds, which only works on real pairs.

    Only columns whose name ends in ``_95ci`` or ``_cohen_d_ci`` are touched,
    and only string cells are parsed; a cell that cannot be parsed raises
    ``ValueError`` instead of being passed on silently.
    """
    def _to_bounds(cell):
        if not isinstance(cell, str):
            return cell
        # Accept "(a, b)", "[a, b]" and whitespace-separated "[a b]" layouts.
        return tuple(float(part) for part in cell.strip('()[] ').replace(',', ' ').split())

    for column in stats_df.columns:
        if str(column).endswith(('_95ci', '_cohen_d_ci')):
            stats_df[column] = stats_df[column].map(_to_bounds)
    return stats_df


hcp_stats = _restore_interval_columns(pd.read_csv("hcp_cohend.csv"))
general_stats = _restore_interval_columns(pd.read_csv("nonhcp_cohend.csv"))


In [None]:
# Formatting output to match the style of the image
def format_output(stats_df, group_name):
    """Print the 95% confidence interval of each emotion for every row.

    For each row a heading naming ``group_name`` and the row's ``phase`` is
    printed, followed by one line per emotion and a blank line. All bounds
    are printed with three decimals (the Anger upper bound used to be
    printed with two).

    Args:
        stats_df: DataFrame with a ``phase`` column and, for each of joy,
            sadness, anger, surprise, disgust and fear, an ``<emotion>_95ci``
            column whose cells are indexable ``(lower, upper)`` pairs.
        group_name: Label inserted into the heading.
    """
    emotion_order = ('joy', 'sadness', 'anger', 'surprise', 'disgust', 'fear')
    for _, row in stats_df.iterrows():
        print(f"The top 5 themes for {group_name} in {row['phase']} were:")
        for emotion in emotion_order:
            interval = row[f'{emotion}_95ci']
            print(f"- {emotion.capitalize()}: 95% CI [{interval[0]:.3f}, {interval[1]:.3f}]")
        # Blank separator line between phases.
        print()

# Print the confidence-interval report, first for the HCP table and then for
# the general-population table.
format_output(stats_df=hcp_stats, group_name='HCP Population')
format_output(stats_df=general_stats, group_name='General Population')


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import t

def calculate_cohen_d(mean1, mean2, std1, std2, n1, n2):
    """Return Cohen's d between two groups described by summary statistics.

    ``d = (mean1 - mean2) / s_p`` with the pooled standard deviation
    ``s_p = sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))``.

    For scalar inputs the effect size is undefined, and ``np.nan`` is
    returned instead of raising or yielding inf, when either group is empty,
    when ``n1 + n2 - 2 <= 0``, or when the pooled standard deviation is zero.
    Array-like inputs skip these guards and are evaluated element-wise.

    Args:
        mean1, mean2: Group means.
        std1, std2: Group standard deviations.
        n1, n2: Group sizes.

    Returns:
        Cohen's d, or ``np.nan`` when it is undefined for scalar inputs.
    """
    degrees_of_freedom = n1 + n2 - 2
    sizes_are_scalar = np.ndim(n1) == 0 and np.ndim(n2) == 0
    if sizes_are_scalar and (n1 < 1 or n2 < 1 or degrees_of_freedom <= 0):
        return np.nan

    pooled_std = np.sqrt(((n1 - 1) * std1 ** 2 + (n2 - 1) * std2 ** 2) / degrees_of_freedom)
    # Two constant groups: no spread to standardise against.
    if np.ndim(pooled_std) == 0 and pooled_std == 0:
        return np.nan

    return (mean1 - mean2) / pooled_std

# Rows of the baseline phase; every other phase is compared against them.
BASELINE_PHASE = 'Before COVID Phases'
hcp_before_data = hcp_stats[hcp_stats['phase'] == BASELINE_PHASE]
general_before_data = general_stats[general_stats['phase'] == BASELINE_PHASE]

# List of emotions to compute Cohen's d
emotions = ['sadness', 'anger', 'surprise', 'disgust', 'fear', 'joy']


def _add_cohen_d_vs_baseline(stats_df, baseline_df, emotion_names, baseline_phase=BASELINE_PHASE):
    """Add one ``<emotion>_cohen_d`` column per emotion to ``stats_df`` in place.

    Every row is compared with the baseline via ``calculate_cohen_d``: the
    row's own mean / std / ``unique_users`` form group 1; the mean of the
    baseline means, the mean of the baseline stds and the summed baseline
    ``unique_users`` form group 2. Rows whose ``phase`` equals
    ``baseline_phase`` receive NaN.

    The baseline statistics do not depend on the row, so they are computed
    once per emotion rather than once per row.
    """
    baseline_n = baseline_df['unique_users'].sum()
    for emotion in emotion_names:
        baseline_mean = baseline_df[f'{emotion}_mean'].mean()
        baseline_std = baseline_df[f'{emotion}_std'].mean()
        rows = zip(
            stats_df['phase'],
            stats_df[f'{emotion}_mean'],
            stats_df[f'{emotion}_std'],
            stats_df['unique_users'],
        )
        stats_df[f'{emotion}_cohen_d'] = [
            np.nan if phase == baseline_phase else calculate_cohen_d(
                mean, baseline_mean, std, baseline_std, n_users, baseline_n
            )
            for phase, mean, std, n_users in rows
        ]
    return stats_df


# Compute Cohen's d for HCP populations
_add_cohen_d_vs_baseline(hcp_stats, hcp_before_data, emotions)

# Compute Cohen's d for General populations
_add_cohen_d_vs_baseline(general_stats, general_before_data, emotions)

def format_output(stats_df, group_name):
    """Print each emotion's 95% CI together with its Cohen's d, row by row.

    This definition replaces the earlier ``format_output`` of the same name.
    The emotions are taken from the module-level ``emotions`` list; for each
    one the row must provide ``<emotion>_cohen_d`` and a two-element
    ``<emotion>_95ci``. A blank line is printed after every row.

    Args:
        stats_df: DataFrame with a ``phase`` column plus the columns above.
        group_name: Label inserted into the heading line.
    """
    for _, phase_row in stats_df.iterrows():
        print(f"The top 5 themes for {group_name} in {phase_row['phase']} were:")
        for name in emotions:
            effect_size = phase_row[f'{name}_cohen_d']
            low, high = phase_row[f'{name}_95ci']
            print(f"- {name.capitalize()}: 95% CI [{low:.3f}, {high:.3f}], Cohen's d: {effect_size:.3f}")
        print()

# Print the CI + Cohen's d report, first for the HCP table and then for the
# general-population table.
format_output(stats_df=hcp_stats, group_name='HCP Population')
format_output(stats_df=general_stats, group_name='General Population')


In [None]:
def calculate_cohen_d_and_ci(mean1, mean2, std1, std2, n1, n2, alpha=0.05):
    """Return Cohen's d and a ``(1 - alpha)`` confidence interval around it.

    ``d`` is the mean difference divided by the pooled standard deviation.
    Its standard error is taken as
    ``sqrt((n1 + n2) / (n1 * n2) + d**2 / (2 * (n1 + n2)))`` and the interval
    is ``d -/+ t_crit * se`` with ``t_crit`` from the t distribution with
    ``n1 + n2 - 2`` degrees of freedom.

    For scalar inputs the result is undefined, and
    ``(np.nan, (np.nan, np.nan))`` is returned instead of raising, when
    either group is empty, when ``n1 + n2 - 2 <= 0``, or when the pooled
    standard deviation is zero. Array-like inputs skip these guards.

    Args:
        mean1, mean2: Group means.
        std1, std2: Group standard deviations.
        n1, n2: Group sizes.
        alpha: Two-sided significance level; 0.05 gives a 95% interval.

    Returns:
        ``(d, (ci_lower, ci_upper))``.
    """
    undefined = (np.nan, (np.nan, np.nan))

    degrees_of_freedom = n1 + n2 - 2
    sizes_are_scalar = np.ndim(n1) == 0 and np.ndim(n2) == 0
    if sizes_are_scalar and (n1 < 1 or n2 < 1 or degrees_of_freedom <= 0):
        return undefined

    # Calculate Cohen's d
    pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / degrees_of_freedom)
    if np.ndim(pooled_std) == 0 and pooled_std == 0:
        return undefined
    d = (mean1 - mean2) / pooled_std

    # Calculate standard error
    se_d = np.sqrt((n1 + n2) / (n1 * n2) + d**2 / (2 * (n1 + n2)))

    # Critical value from t-distribution
    t_crit = t.ppf(1 - alpha / 2, df=degrees_of_freedom)

    # Calculate confidence interval
    ci_lower = d - t_crit * se_d
    ci_upper = d + t_crit * se_d

    return d, (ci_lower, ci_upper)

def _add_cohen_d_and_ci_vs_baseline(stats_df, baseline_df, emotion_names,
                                    baseline_phase='Before COVID Phases'):
    """Add ``<emotion>_cohen_d`` and ``<emotion>_cohen_d_ci`` columns in place.

    Every row is compared with the baseline via ``calculate_cohen_d_and_ci``:
    the row's own mean / std / ``unique_users`` form group 1; the mean of the
    baseline means, the mean of the baseline stds and the summed baseline
    ``unique_users`` form group 2. Rows whose ``phase`` equals
    ``baseline_phase`` receive NaN and ``(NaN, NaN)``.

    The effect size and its interval come from a single call per row, and
    the row-independent baseline statistics are computed once per emotion.
    """
    baseline_n = baseline_df['unique_users'].sum()
    for emotion in emotion_names:
        baseline_mean = baseline_df[f'{emotion}_mean'].mean()
        baseline_std = baseline_df[f'{emotion}_std'].mean()

        effect_sizes = []
        intervals = []
        rows = zip(
            stats_df['phase'],
            stats_df[f'{emotion}_mean'],
            stats_df[f'{emotion}_std'],
            stats_df['unique_users'],
        )
        for phase, mean, std, n_users in rows:
            if phase == baseline_phase:
                d_value, interval = np.nan, (np.nan, np.nan)
            else:
                d_value, interval = calculate_cohen_d_and_ci(
                    mean, baseline_mean, std, baseline_std, n_users, baseline_n
                )
            effect_sizes.append(d_value)
            intervals.append(interval)

        stats_df[f'{emotion}_cohen_d'] = effect_sizes
        # An object-dtype Series keeps each (lower, upper) pair as one cell.
        stats_df[f'{emotion}_cohen_d_ci'] = pd.Series(intervals, index=stats_df.index, dtype=object)
    return stats_df


# Cohen's d and CI of every phase against the baseline, HCP population
_add_cohen_d_and_ci_vs_baseline(hcp_stats, hcp_before_data, emotions)

#########################################################################################################

# Cohen's d and CI of every phase against the baseline, general population
_add_cohen_d_and_ci_vs_baseline(general_stats, general_before_data, emotions)



def _print_cohen_d_with_ci(stats_df):
    """Print Cohen's d and its 95% CI for each emotion, one section per row.

    The emotions come from the module-level ``emotions`` list; each row must
    provide ``<emotion>_cohen_d`` and a two-element ``<emotion>_cohen_d_ci``.
    """
    for _, phase_row in stats_df.iterrows():
        print(f"In {phase_row['phase']}:")
        for name in emotions:
            effect_size = phase_row[f'{name}_cohen_d']
            low, high = phase_row[f'{name}_cohen_d_ci']
            print(f"- {name.capitalize()}: Cohen's d = {effect_size:.3f}, 95% CI = [{low:.3f}, {high:.3f}]")
        print()


# Print results
_print_cohen_d_with_ci(hcp_stats)


print("------------------------------")
print("---------------genral---------------")
_print_cohen_d_with_ci(general_stats)


In [None]:
# Compute Cohen's d and CI between HCP and General populations for each phase and emotion
emotions = ['sadness', 'anger', 'surprise', 'disgust', 'fear', 'joy']
cohen_d_comparison = {}

# Only phases present in the HCP table are visited; a phase with no rows in
# the general table is skipped.
for phase in hcp_stats['phase'].unique():
    # The rows and sample sizes depend on the phase only, so they are
    # selected once per phase rather than once per emotion.
    hcp_data = hcp_stats[hcp_stats['phase'] == phase]
    general_data = general_stats[general_stats['phase'] == phase]
    if hcp_data.empty or general_data.empty:
        continue

    hcp_n = hcp_data['unique_users'].sum()
    general_n = general_data['unique_users'].sum()

    for emotion in emotions:
        # Means and stds are averaged over the phase's rows before comparing.
        cohen_d_value, ci = calculate_cohen_d_and_ci(
            hcp_data[f'{emotion}_mean'].mean(),
            general_data[f'{emotion}_mean'].mean(),
            hcp_data[f'{emotion}_std'].mean(),
            general_data[f'{emotion}_std'].mean(),
            hcp_n,
            general_n
        )
        cohen_d_comparison[(phase, emotion)] = (cohen_d_value, ci)

def format_output_comparison_with_ci(group_name_1, group_name_2, cohen_d_comparison):
    """Print the between-group Cohen's d and its 95% CI for each entry.

    Args:
        group_name_1: Label of the first group in the printed comparison.
        group_name_2: Label of the second group in the printed comparison.
        cohen_d_comparison: Mapping of ``(phase, emotion)`` to
            ``(cohen_d, (ci_lower, ci_upper))``.

    Each entry produces a heading line, a result line and a blank line.
    """
    for key, result in cohen_d_comparison.items():
        phase, emotion = key
        effect_size, bounds = result
        low, high = bounds
        print(f"In {phase}, for {emotion.capitalize()}:")
        print(f"- Cohen's d ({group_name_1} vs {group_name_2}): {effect_size:.3f}, 95% CI: [{low:.3f}, {high:.3f}]")
        print()

# Print the HCP-vs-general effect sizes collected in cohen_d_comparison.
format_output_comparison_with_ci(
    group_name_1='HCP Population',
    group_name_2='General Population',
    cohen_d_comparison=cohen_d_comparison,
)


In [None]:
import pandas as pd

# Gather every effect size into one long-format table (one row per phase,
# emotion and group) and export it to Excel. Rows are built as records so the
# six fields of a row cannot drift out of sync.
RESULT_COLUMNS = ["Phase", "Emotion", "Group", "Cohen's d", "CI Lower", "CI Upper"]


def _within_group_records(stats_df, group_label):
    """Return one record per (row, emotion) of ``stats_df``, tagged with ``group_label``.

    Reads ``phase``, ``<emotion>_cohen_d`` and the two-element
    ``<emotion>_cohen_d_ci`` for each name in the module-level ``emotions``.
    """
    records = []
    for _, row in stats_df.iterrows():
        for emotion in emotions:
            ci_lower, ci_upper = row[f'{emotion}_cohen_d_ci']
            records.append({
                "Phase": row['phase'],
                "Emotion": emotion,
                "Group": group_label,
                "Cohen's d": row[f'{emotion}_cohen_d'],
                "CI Lower": ci_lower,
                "CI Upper": ci_upper,
            })
    return records


# Row order: HCP rows, then general-population rows, then the between-group
# comparison.
records = _within_group_records(hcp_stats, 'HCP Population')
records += _within_group_records(general_stats, 'General Population')
records += [
    {
        "Phase": phase,
        "Emotion": emotion,
        "Group": 'HCP vs General Population',
        "Cohen's d": cohen_d_value,
        "CI Lower": ci_lower,
        "CI Upper": ci_upper,
    }
    for (phase, emotion), (cohen_d_value, (ci_lower, ci_upper)) in cohen_d_comparison.items()
]

# Create a DataFrame; passing the columns keeps their order even with no records.
df = pd.DataFrame(records, columns=RESULT_COLUMNS)

# Save to Excel
output_file_path = "cohen_d_full_results.xlsx"
df.to_excel(output_file_path, index=False)

output_file_path


In [None]:
# Compute Cohen's d and CI between HCP and General populations for each phase and emotion
# NOTE(review): this repeats the comparison of an earlier cell verbatim and
# rebuilds the same dictionary; consider deleting one of the two copies.
emotions = ['sadness', 'anger', 'surprise', 'disgust', 'fear', 'joy']
cohen_d_comparison = {}

# Only phases present in the HCP table are visited; a phase with no rows in
# the general table is skipped.
for phase in hcp_stats['phase'].unique():
    # The rows and sample sizes depend on the phase only, so they are
    # selected once per phase rather than once per emotion.
    hcp_data = hcp_stats[hcp_stats['phase'] == phase]
    general_data = general_stats[general_stats['phase'] == phase]
    if hcp_data.empty or general_data.empty:
        continue

    hcp_n = hcp_data['unique_users'].sum()
    general_n = general_data['unique_users'].sum()

    for emotion in emotions:
        # Means and stds are averaged over the phase's rows before comparing.
        cohen_d_value, ci = calculate_cohen_d_and_ci(
            hcp_data[f'{emotion}_mean'].mean(),
            general_data[f'{emotion}_mean'].mean(),
            hcp_data[f'{emotion}_std'].mean(),
            general_data[f'{emotion}_std'].mean(),
            hcp_n,
            general_n
        )
        cohen_d_comparison[(phase, emotion)] = (cohen_d_value, ci)

def format_output_comparison_with_ci(group_name_1, group_name_2, cohen_d_comparison):
    """Print the between-group Cohen's d and its 95% CI for each entry.

    This repeats, unchanged in behavior, the definition of the same name
    given in an earlier cell.

    Args:
        group_name_1: Label of the first group in the printed comparison.
        group_name_2: Label of the second group in the printed comparison.
        cohen_d_comparison: Mapping of ``(phase, emotion)`` to
            ``(cohen_d, (ci_lower, ci_upper))``.

    Each entry produces a heading line, a result line and a blank line.
    """
    for key, result in cohen_d_comparison.items():
        phase, emotion = key
        effect_size, bounds = result
        low, high = bounds
        print(f"In {phase}, for {emotion.capitalize()}:")
        print(f"- Cohen's d ({group_name_1} vs {group_name_2}): {effect_size:.3f}, 95% CI: [{low:.3f}, {high:.3f}]")
        print()

# Print the HCP-vs-general effect sizes collected in cohen_d_comparison.
format_output_comparison_with_ci(
    group_name_1='HCP Population',
    group_name_2='General Population',
    cohen_d_comparison=cohen_d_comparison,
)
