In [23]:
#Load necessary libraries...
import graphviz
import semopy as sem
import pandas as pd
import numpy as np
from semopy import Model, Optimizer
import semopy.plot as semplot
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [24]:
# Load your data
df = pd.read_excel(r"C:\Users\pc726\OneDrive\Desktop\Final\survey_data.xlsx")

In [25]:
# Check the columns of the dataframe
df.columns

Index(['Timestamp', '1. Gender', '2. Age Group', '3. Qualification',
       '4. Professional Year ',
       '5. How would you rate your digital literacy skills?',
       '6. Which social media platforms do you use for academic purposes? (Select all that apply)',
       '7. How frequently do you use social media for academic purposes?',
       '8. What type of academic content do you access on social media? (Select all that apply)',
       '9. Do you think social media helps improve your academic performance?',
       '10. How does social media benefit your medical education? (Select all that apply)',
       '11. Do you think social media is more effective than traditional learning methods (books, lectures)?',
       '12. Why do you prefer social media over traditional sources? (Select all that apply)',
       '13. What motivates you to share academic content on social media? (Select all that apply)',
       '14. What challenges do you face while using social media for medical education

In [26]:
# ----------------------------
# DATA PREPROCESSING
# ----------------------------

# 1. SocialMediaUse construct
# Count platforms used
df['platform_count'] = df['6. Which social media platforms do you use for academic purposes? (Select all that apply)'].apply(
    lambda x: len(str(x).split(',')) if pd.notna(x) else 0
)

# Frequency of use encoding
freq_map = {'1-2 Hours': 1, '2-3 Hours': 2, '3-4 Hours': 3, 'More than four': 4}
df['frequency_encoded'] = df['7. How frequently do you use social media for academic purposes?'].map(freq_map)

# 2. DigitalLiteracy construct
digital_map = {'Beginner': 1, 'Intermediate': 2, 'Advanced': 3}
df['digital_literacy'] = df['5. How would you rate your digital literacy skills?'].map(digital_map)

# 3. AcademicBenefit construct
benefit_text = '10. How does social media benefit your medical education? (Select all that apply)'
df['helps_exam'] = df[benefit_text].apply(lambda x: 1 if 'Helps in exam preparation' in str(x) else 0)
df['expert_access'] = df[benefit_text].apply(lambda x: 1 if 'Provides access to expert knowledge' in str(x) else 0)
df['enhances_collab'] = df[benefit_text].apply(lambda x: 1 if 'Enhances collaboration with peers' in str(x) else 0)

# 4. MisinformationAwareness construct
df['aware_misinfo'] = df['15. Are you aware for misinformation and disinformation on social media? '].apply(
    lambda x: 1 if 'Yes' in str(x) else 0
)
df['active_verify'] = df['15. Are you aware for misinformation and disinformation on social media? '].apply(
    lambda x: 1 if 'actively verify' in str(x).lower() else 0
)

# 5. VerificationBehavior construct
verify_text = '16. How do you verify the credibility of academic content on social media? (select all that employ) '
df['checks_source'] = df[verify_text].apply(lambda x: 1 if 'Checking the source' in str(x) else 0)
df['compare_multiple'] = df[verify_text].apply(lambda x: 1 if 'Comparison information from multiple sources' in str(x) else 0)
df['peer_reviewed'] = df[verify_text].apply(lambda x: 1 if 'Looking for peer-reviewed refrences' in str(x) else 0)

# 6. TrainingInterest construct
training_map = {'Yes': 1, 'Maybe': 0.5, 'No': 0}
df['training_interest'] = df['18. Would you be interested in a training program on using social media for academic and medical learning?'].map(training_map)

# Create final dataset for SEM
sem_data = df[[
    'platform_count', 'frequency_encoded',
    'digital_literacy',
    'helps_exam', 'expert_access', 'enhances_collab',
    'aware_misinfo', 'active_verify',
    'checks_source', 'compare_multiple', 'peer_reviewed',
    'training_interest'
]].copy()

# Handle missing values
sem_data = sem_data.dropna()

# Standardize continuous variables (optional but recommended)
scaler = StandardScaler()
continuous_vars = ['platform_count', 'frequency_encoded', 'digital_literacy', 'training_interest']
sem_data[continuous_vars] = scaler.fit_transform(sem_data[continuous_vars])

print(f"Final dataset shape: {sem_data.shape}")
print("\nSample of preprocessed data:")
print(sem_data.head())

#save the preprocessed data to a new Excel file
sem_data.to_excel(r"C:\Users\pc726\OneDrive\Desktop\Final\sem_data.xlsx", index=False)



Final dataset shape: (427, 12)

Sample of preprocessed data:
   platform_count  frequency_encoded  digital_literacy  helps_exam  \
0       -0.599937           0.724091         -0.163155           0   
1       -0.599937          -0.204398         -0.163155           1   
2       -1.221717          -0.204398          1.457014           0   
3       -1.221717          -0.204398         -0.163155           1   
4       -1.221717          -0.204398         -0.163155           0   

   expert_access  enhances_collab  aware_misinfo  active_verify  \
0              1                0              1              0   
1              1                0              1              1   
2              0                0              1              1   
3              1                1              1              0   
4              1                0              1              1   

   checks_source  compare_multiple  peer_reviewed  training_interest  
0              1                 0          

In [27]:
# STRUCTURAL EQUATION MODELING
model_spec = """
    # ===== MEASUREMENT MODEL =====
    SocialMediaUse =~ platform_count + frequency_encoded
    DigitalLiteracy =~ digital_literacy
    TrainingInterest =~ training_interest
    AcademicBenefit =~ helps_exam + expert_access + enhances_collab
    MisinformationAwareness =~ aware_misinfo + active_verify
    VerificationBehavior =~ checks_source

    # ===== STRUCTURAL MODEL (SIMPLIFIED) =====
    # Only keeping significant and theoretically meaningful paths
    AcademicBenefit ~ DigitalLiteracy
    VerificationBehavior ~ MisinformationAwareness
    MisinformationAwareness ~ SocialMediaUse

    # ===== COVARIANCES (ADDITIONAL FOR BETTER FIT) =====
    # Among exogenous variables
    DigitalLiteracy ~~ SocialMediaUse
    DigitalLiteracy ~~ TrainingInterest
    SocialMediaUse ~~ TrainingInterest
    
    # Among indicator residuals (based on modification indices)
    helps_exam ~~ enhances_collab
    expert_access ~~ enhances_collab
    aware_misinfo ~~ active_verify
    
    # Additional error covariances for better fit
    helps_exam ~~ expert_access
    platform_count ~~ frequency_encoded
    digital_literacy ~~ training_interest
    
    # Residual covariances for endogenous variables
    AcademicBenefit ~~ TrainingInterest
    AcademicBenefit ~~ SocialMediaUse
    MisinformationAwareness ~~ DigitalLiteracy
"""

# Fit the model
model = Model(model_spec)
result = model.fit(sem_data)

In [28]:
# ----------------------------
# HYPOTHESIS TESTING
# ----------------------------

print("\n" + "="*60)
print("📊 HYPOTHESIS TESTING RESULTS")
print("="*60)

# Clean and filter estimates
estimates = model.inspect(std_est=True)
estimates['p-value'] = pd.to_numeric(estimates['p-value'], errors='coerce')
sig_paths = estimates[(estimates['op'] == '~') & (estimates['p-value'] < 0.05)]

# Define hypothesis mapping
hypothesis_map = {
    'AcademicBenefit ~ SocialMediaUse': 'H2',
    'AcademicBenefit ~ TrainingInterest': 'H3',
    'MisinformationAwareness ~ SocialMediaUse': 'H4',
    'VerificationBehavior ~ MisinformationAwareness': 'H5',
}

# Track supported hypotheses
supported_hypotheses = set()

# Display significant paths
for idx, row in sig_paths.iterrows():
    path = f"{row['lval']} ~ {row['rval']}"
    effect = "Positive" if row['Estimate'] > 0 else "Negative"
    beta = row['Estimate']
    pval = row['p-value']
    hypothesis = hypothesis_map.get(path, "—")

    # Mark supported hypotheses
    if hypothesis != "—":
        supported_hypotheses.add(hypothesis)

    print(f"{path}: {effect} effect (β = {beta:.3f}, p = {pval:.3f}) → Supports {hypothesis}")

# ----------------------------
# Summary of Hypothesis Support
# ----------------------------

print("\n" + "="*60)
print("✅ SUMMARY OF HYPOTHESIS VALIDATION")
print("="*60)

all_hypotheses = set(hypothesis_map.values())
for h in sorted(all_hypotheses):
    status = "✓ Supported" if h in supported_hypotheses else "✗ Not Supported"
    print(f"{h}: {status}")




📊 HYPOTHESIS TESTING RESULTS
expert_access ~ AcademicBenefit: Positive effect (β = 1.149, p = 0.017) → Supports —
enhances_collab ~ AcademicBenefit: Positive effect (β = 2.526, p = 0.003) → Supports —
active_verify ~ MisinformationAwareness: Positive effect (β = 5.412, p = 0.008) → Supports —

✅ SUMMARY OF HYPOTHESIS VALIDATION
H2: ✗ Not Supported
H3: ✗ Not Supported
H4: ✗ Not Supported
H5: ✗ Not Supported


In [29]:
# ----------------------------
# H1: GROUP-WISE DIFFERENCE TESTING
# ----------------------------

print("\n" + "="*50)
print("H1: GROUP-WISE DIFFERENCE TESTING")
print("="*50)

# Gender-based difference in SocialMediaUse
if 'Gender' in df.columns:
    from scipy.stats import ttest_ind
    male = sem_data[df['Gender'] == 'Male']['platform_count']
    female = sem_data[df['Gender'] == 'Female']['platform_count']
    t_stat, p_val = ttest_ind(male, female, nan_policy='omit')
    print(f"Gender difference in platform_count: t = {t_stat:.3f}, p = {p_val:.3f}")
    if p_val < 0.05:
        print("✓ Significant gender-wise difference → Supports H1")
    else:
        print("✗ No significant gender-wise difference → Does not support H1")

# Age-wise difference (optional)
if 'Age' in df.columns:
    import statsmodels.api as sm
    model_age = sm.OLS(sem_data['platform_count'], sm.add_constant(df['Age'].astype(float))).fit()
    age_p = model_age.pvalues[1]
    print(f"\nAge effect on platform_count: p = {age_p:.3f}")
    if age_p < 0.05:
        print("✓ Significant age-wise difference → Supports H1")
    else:
        print("✗ No significant age-wise difference → Does not support H1")

# ----------------------------
# Summary of Hypothesis Results
# ----------------------------

print("\n" + "="*50)
print("SUMMARY OF HYPOTHESIS RESULTS")
print("="*50)

supported = set(hypothesis_map[path] for path in sig_paths['lval'] + ' ~ ' + sig_paths['rval'] if path in hypothesis_map)
all_hypotheses = {'H1', 'H2', 'H3', 'H4', 'H5'}

for h in sorted(all_hypotheses):
    if h in supported:
        print(f"{h}: ✓ Supported")
    else:
        print(f"{h}: ✗ Not Supported")

# ----------------------------
# Model Modification Suggestions
# ----------------------------

print("\n" + "="*50)
print("MODEL MODIFICATION SUGGESTIONS")
print("="*50)
print("If model fit is poor, consider:")
print("1. Adding error covariances between related indicators")
print("2. Removing non-significant paths")
print("3. Adding direct effects based on modification indices")
print("4. Trying different estimation methods (ML, WLS, etc.)")

# Display top modification indices
try:
    mi = model.modification_indices(sem_data)
    print("\nTop modification indices (MI > 3.84):")
    print(mi[mi['mi'] > 3.84].sort_values('mi', ascending=False).head(10))
except Exception as e:
    print(f"Could not compute modification indices: {e}")

# ----------------------------
# Save Results
# ----------------------------

try:
    estimates.to_csv('sem_parameter_estimates.csv', index=False)
    print("\n✓ Parameter estimates saved to 'sem_parameter_estimates.csv'")
except Exception as e:
    print(f"✗ Could not save parameter estimates: {e}")

print("\n🎉 SEM analysis and hypothesis testing completed successfully!")


H1: GROUP-WISE DIFFERENCE TESTING

SUMMARY OF HYPOTHESIS RESULTS
H1: ✗ Not Supported
H2: ✗ Not Supported
H3: ✗ Not Supported
H4: ✗ Not Supported
H5: ✗ Not Supported

MODEL MODIFICATION SUGGESTIONS
If model fit is poor, consider:
1. Adding error covariances between related indicators
2. Removing non-significant paths
3. Adding direct effects based on modification indices
4. Trying different estimation methods (ML, WLS, etc.)
Could not compute modification indices: 'Model' object has no attribute 'modification_indices'

✓ Parameter estimates saved to 'sem_parameter_estimates.csv'

🎉 SEM analysis and hypothesis testing completed successfully!


In [30]:
# Model fit statistics
import semopy
stat=semopy.calc_stats(model)
print(stat)

       DoF  DoF Baseline       chi2  chi2 p-value  chi2 Baseline       CFI  \
Value   20            45  38.805763      0.007048     238.268209  0.902696   

            GFI      AGFI       NFI       TLI     RMSEA       AIC        BIC  \
Value  0.837134  0.633552  0.837134  0.781066  0.046981  69.81824  211.80568   

        LogLik  
Value  0.09088  


In [31]:
# Pretty print the statistics
import tabulate
print(tabulate.tabulate(stat, headers='keys', tablefmt='psql'))

+-------+-------+----------------+---------+----------------+-----------------+----------+----------+----------+----------+----------+-----------+---------+---------+----------+
|       |   DoF |   DoF Baseline |    chi2 |   chi2 p-value |   chi2 Baseline |      CFI |      GFI |     AGFI |      NFI |      TLI |     RMSEA |     AIC |     BIC |   LogLik |
|-------+-------+----------------+---------+----------------+-----------------+----------+----------+----------+----------+----------+-----------+---------+---------+----------|
| Value |    20 |             45 | 38.8058 |     0.00704801 |         238.268 | 0.902696 | 0.837134 | 0.633552 | 0.837134 | 0.781066 | 0.0469814 | 69.8182 | 211.806 |  0.09088 |
+-------+-------+----------------+---------+----------------+-----------------+----------+----------+----------+----------+----------+-----------+---------+---------+----------+


In [32]:
# Save statistics to CSV
pd.DataFrame(stat).to_csv('sem_fit_statistics.csv')

In [33]:
# Generate and save model diagram
import semopy
semopy.report(model, 'sem_report.html')
graph = semplot.semplot(model, "sem_model_diagram.png")

