In [None]:
# %% [markdown]
# # Design of Experiments: Identifying Critical Factors in Chemical Reactions
# 
# This notebook demonstrates the complete workflow for analyzing screening experiments:
# 1. Load your experimental data
# 2. Analyze results to identify critical factors
# 3. Visualize factor importance
# 4. Test for statistical significance

# %% [markdown]
# ## 1. Setup and Import Libraries

# %%
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch
from scipy import stats
from sklearn.linear_model import LinearRegression
# NOTE(review): with no category/module arguments this filter silences every
# warning for the rest of the session. That includes NumPy RuntimeWarnings
# (mean of an empty slice, divide by zero) that would otherwise flag a
# degenerate design further down - consider narrowing it.
warnings.filterwarnings('ignore')

# Set style
# NOTE(review): presumably needs a Matplotlib release that ships the
# 'seaborn-v0_8-*' style names - confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Default seaborn colour cycle used by the line/scatter plots below.
sns.set_palette("husl")

# %% [markdown]
# ## 2. Load Your Data
# 
# Your dataframe should have:
# - Columns for each input factor (e.g., Temperature, pH, Catalyst_Conc, etc.)
# - One column for the response variable (e.g., Yield, Conversion, Purity)
# - Each row represents one experimental run

# %%
# >>> REPLACE THIS CELL'S DATAFRAME WITH YOUR OWN DATA <<<
#
# Option 1 - read the runs from a CSV file:
# df = pd.read_csv('your_experimental_data.csv')
#
# Option 2 - reuse a dataframe that already exists in the session:
# df = your_dataframe_name

# Empty placeholder so the name `df` exists; it carries no rows or columns.
# DELETE THIS and assign your actual data instead.
df = pd.DataFrame()  # Replace with your actual dataframe

# Quick sanity overview: dimensions, a preview of the rows, and the headers.
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:", df.head(), sep="\n")
print("\nColumn names:", df.columns.tolist(), sep="\n")

# %% [markdown]
# ## 3. Define Your Factors and Response
# 
# Specify which columns are your input factors and which is your response.

# %%
# MODIFY THESE NAMES TO MATCH YOUR DATA

# Input factors: one dataframe column name per factor in the design.
factor_columns = [
    'Temperature',
    'pH',
    'Catalyst_Conc',
    'Pressure',
    'Reaction_Time',
    'Stirring_Speed',
    'Substrate_Conc',
]

# Measured response: the single column the factors are evaluated against.
response_column = 'Yield'

# Report, column by column, whether each expected name is present in `df`.
print("Checking if columns exist in dataframe...")
for col in [*factor_columns, response_column]:
    is_present = col in df.columns
    status_line = (
        f"  ✓ {col}"
        if is_present
        else f"  ✗ {col} - NOT FOUND! Please check column name."
    )
    print(status_line)

# %% [markdown]
# ## 4. Convert to Coded Values
# 
# Convert actual values to coded (-1, +1) values for analysis.
# This assumes your data uses two levels (low and high) for each factor.

# %%
def convert_to_coded(df, factor_columns):
    """
    Convert actual factor values to coded units on the -1 ... +1 scale.

    For every name in ``factor_columns`` a new column ``<name>_coded`` is added
    to a copy of ``df`` (the input frame is not modified).

    * Exactly two distinct values: the smaller is coded -1 and the larger +1.
    * More than two distinct values (e.g. centre points): a warning is printed,
      the minimum is coded -1, the maximum +1, and every intermediate value is
      placed linearly in between (the midpoint becomes 0). Previously such
      values were coded NaN, which poisoned the design matrix.
    * Fewer than two distinct values: a warning is printed and the min/max
      mapping is applied as-is; no contrast can be estimated for that factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Experimental runs, one row per run.
    factor_columns : list of str
        Names of the factor columns in ``df`` to code.

    Returns
    -------
    coded_df : pandas.DataFrame
        Copy of ``df`` with the additional ``*_coded`` columns.
    factor_levels : dict
        ``{factor: {'low': value, 'high': value}}`` - the actual values that
        correspond to coded -1 and +1.
    """
    coded_df = df.copy()
    factor_levels = {}

    for col in factor_columns:
        unique_vals = sorted(df[col].unique())
        n_levels = len(unique_vals)

        if n_levels == 2:
            low, high = unique_vals
        else:
            print(f"WARNING: {col} has {n_levels} unique values: {unique_vals}")
            print(f"  DOE analysis requires exactly 2 levels per factor.")
            print(f"  Using min and max values for coding.")
            low, high = df[col].min(), df[col].max()

        factor_levels[col] = {'low': low, 'high': high}

        if n_levels > 2 and high != low:
            print(f"  Intermediate values are scaled linearly between -1 and +1.")
            # 2*(x - low)/(high - low) - 1 evaluates to exactly -1 at `low` and
            # exactly +1 at `high` in floating point, so later `== 1` / `== -1`
            # comparisons on the coded column still match the extreme runs.
            coded_df[f'{col}_coded'] = 2.0 * (df[col] - low) / (high - low) - 1.0
        else:
            # Two-level (or degenerate) factor: direct lookup keeps the coded
            # values as exact integers.
            coded_df[f'{col}_coded'] = df[col].map({low: -1, high: 1})

        print(f"{col}: Low={low:.2f} → -1, High={high:.2f} → +1")

    return coded_df, factor_levels

# Apply the coding step to the experimental runs; `factor_levels` keeps the
# actual low/high value behind each coded -1 / +1.
df_coded, factor_levels = convert_to_coded(df=df, factor_columns=factor_columns)

# Preview the frame, which now carries the extra `*_coded` columns.
print("\nCoded dataframe:", df_coded.head(), sep="\n")

# %% [markdown]
# ## 5. Extract Design Matrix and Response

# %%
# Design matrix: one column of coded values per factor, in `factor_columns` order.
coded_columns = [f'{col}_coded' for col in factor_columns]
X = df_coded[coded_columns].values
# Response vector aligned row-for-row with the design matrix.
y = df_coded[response_column].values

print(f"Design matrix shape: {X.shape}")
print(f"Response vector shape: {y.shape}")

# Descriptive statistics of the response (np.std here uses its default ddof).
print("\nResponse statistics:")
response_summary = (
    ("Mean", np.mean),
    ("Std Dev", np.std),
    ("Min", np.min),
    ("Max", np.max),
)
for stat_label, stat_func in response_summary:
    print(f"  {stat_label}: {stat_func(y):.2f}")

# %% [markdown]
# ## 6. Calculate Factor Effects

# %%
def calculate_effects(X, y, factor_names):
    """
    Compute the main effect of every factor in a coded design.

    The main effect of a factor is the mean response over the runs where its
    coded level equals +1 minus the mean response over the runs where it
    equals -1. Runs at any other coded level do not enter either mean.

    Parameters
    ----------
    X : 2-D array, one column of coded levels per factor
    y : 1-D array of responses, aligned with the rows of ``X``
    factor_names : sequence of labels, one per column of ``X``

    Returns
    -------
    pandas.DataFrame with columns ``Factor``, ``Effect`` and ``Abs_Effect``,
    ordered from the largest to the smallest absolute effect.
    """
    def _main_effect(levels):
        # Contrast between the high-level and low-level run averages.
        return np.mean(y[levels == 1]) - np.mean(y[levels == -1])

    effects = [_main_effect(X[:, j]) for j in range(X.shape[1])]

    table = pd.DataFrame({
        'Factor': factor_names,
        'Effect': effects,
        'Abs_Effect': np.abs(effects),
    })
    ranked = table.sort_values('Abs_Effect', ascending=False)
    return ranked

# Main effects for every factor, largest absolute effect first.
effects_df = calculate_effects(X, y, factor_columns)

# Banner + table, emitted as one newline-separated print.
effects_rule = "=" * 60
print(
    "\n" + effects_rule,
    "FACTOR EFFECTS (sorted by importance)",
    effects_rule,
    effects_df.to_string(index=False),
    sep="\n",
)

# Reading guide for the sign of an effect.
print("\nPositive effect = factor increases response when going from low to high")
print("Negative effect = factor decreases response when going from low to high")

# %% [markdown]
# ## 7. Statistical Analysis with Linear Regression

# %%
# Fit a main-effects linear model (intercept + one term per coded factor).
model = LinearRegression()
model.fit(X, y)

# With -1/+1 coding a regression coefficient is the change per ONE coded unit,
# i.e. half of the low-to-high effect; doubling recovers the full effect.
coefficients = model.coef_
effects_regression = coefficients * 2  # Convert to full effects

# Goodness of fit and residuals (reused by the diagnostic plots further down).
r2_score = model.score(X, y)
y_pred = model.predict(X)
residuals = y - y_pred

print(f"\nModel R² Score: {r2_score:.4f}")
print(f"Model explains {r2_score*100:.2f}% of variance in the response")

# Residual degrees of freedom: runs - (one coefficient per factor + intercept).
# Deliberately NOT called `df`: that name holds the experimental dataframe, and
# rebinding it to an integer breaks every earlier cell on re-execution.
dof = len(y) - len(coefficients) - 1

if dof > 0:
    # Classical OLS inference: Var(beta) = MSE * (X'X)^-1, with X including
    # the intercept column.
    mse = np.sum(residuals**2) / dof
    X_with_intercept = np.column_stack([np.ones(len(X)), X])
    cov_matrix = mse * np.linalg.inv(X_with_intercept.T @ X_with_intercept)
    se = np.sqrt(np.diag(cov_matrix)[1:])  # Skip intercept

    # Two-sided t-test per coefficient. The survival function avoids the
    # precision loss of `1 - cdf` when the p-value is very small.
    t_stats = coefficients / se
    p_values = 2 * stats.t.sf(np.abs(t_stats), dof)
else:
    # Saturated (or over-parameterised) design: no runs are left to estimate
    # the error variance, so standard errors and p-values do not exist.
    print(f"\nWARNING: {dof} residual degrees of freedom "
          f"({len(y)} runs, {len(coefficients)} factors + intercept).")
    print("  Standard errors and p-values cannot be estimated; add replicate or")
    print("  centre-point runs, or judge effects from the normal probability plot.")
    mse = np.nan
    se = np.full(len(coefficients), np.nan)
    t_stats = np.full(len(coefficients), np.nan)
    p_values = np.full(len(coefficients), np.nan)

# Create results dataframe (NaN p-values compare False and fall through to 'ns').
results_df = pd.DataFrame({
    'Factor': factor_columns,
    'Effect': effects_regression,
    'Coefficient': coefficients,
    'Std_Error': se,
    't_statistic': t_stats,
    'p_value': p_values,
    'Significant': ['***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else 'ns'
                    for p in p_values]
})
# Largest absolute effect first.
results_df = results_df.sort_values('Effect', key=abs, ascending=False)

print("\n" + "="*80)
print("STATISTICAL SIGNIFICANCE TESTING")
print("="*80)
print(results_df.to_string(index=False))
print("\nSignificance codes: *** p<0.001, ** p<0.01, * p<0.05, ns = not significant")

# %% [markdown]
# ## 8. Visualization: Pareto Chart of Effects

# %%
fig, ax = plt.subplots(figsize=(12, 6))

# Smallest absolute effect first so the largest bar ends up at the top.
sorted_results = results_df.sort_values('Effect', key=abs, ascending=True)

# Color bars based on significance
colors = ['#d62728' if sig in ['***', '**', '*'] else '#d3d3d3'
          for sig in sorted_results['Significant']]

# Create horizontal bar chart
bars = ax.barh(range(len(sorted_results)), sorted_results['Effect'], color=colors)

# Add significance markers just beyond the end of each bar. The gap is given
# in points rather than data units, so it looks the same whether the effects
# are of order 0.01 or 100.
for i, (effect, sig) in enumerate(zip(sorted_results['Effect'], sorted_results['Significant'])):
    points_right = effect >= 0
    ax.annotate(sig, xy=(effect, i),
                xytext=(5 if points_right else -5, 0), textcoords='offset points',
                va='center', ha='left' if points_right else 'right',
                fontweight='bold', fontsize=10)

# Extra horizontal margin so the markers are not squeezed against the frame.
ax.margins(x=0.12)

ax.set_yticks(range(len(sorted_results)))
ax.set_yticklabels(sorted_results['Factor'], fontsize=11)
ax.set_xlabel(f'Effect on {response_column}', fontsize=13, fontweight='bold')
ax.set_title(f'Pareto Chart: Factor Effects on {response_column}',
             fontsize=15, fontweight='bold', pad=20)
ax.axvline(x=0, color='black', linewidth=1.5)
ax.grid(axis='x', alpha=0.3)

# Add legend (proxy patches: the bars themselves carry no labels).
legend_elements = [
    Patch(facecolor='#d62728', label='Significant (p < 0.05)'),
    Patch(facecolor='#d3d3d3', label='Not Significant')
]
ax.legend(handles=legend_elements, loc='best', fontsize=10)

plt.tight_layout()
plt.show()

# %% [markdown]
# ## 9. Normal Probability Plot of Effects
# 
# Effects that fall off the line are likely real effects, not random noise.

# %%
fig, ax = plt.subplots(figsize=(10, 6))

effects_array = results_df['Effect'].values

# probplot draws the points AND returns their coordinates: the theoretical
# quantiles and the effects in ascending order. Reusing those coordinates
# guarantees the highlights sit exactly on the plotted points (probplot's
# plotting positions are not the simple (i - 0.5) / n quantiles).
(theoretical_quantiles, ordered_effects), _ = stats.probplot(
    effects_array, dist="norm", plot=ax)

# rank -> row position in results_df. A stable argsort gives tied effects
# distinct ranks, so each tied factor gets its own point and label.
rank_to_row = np.argsort(effects_array, kind='stable')
factor_labels = results_df['Factor'].values
significance = results_df['Significant'].values

# Highlight significant factors
for rank, row_pos in enumerate(rank_to_row):
    if significance[row_pos] not in ('***', '**', '*'):
        continue
    point_x = theoretical_quantiles[rank]
    point_y = ordered_effects[rank]
    ax.plot(point_x, point_y, 'ro', markersize=12, zorder=5)
    ax.annotate(factor_labels[row_pos], (point_x, point_y),
                xytext=(15, 10), textcoords='offset points',
                fontsize=10, fontweight='bold',
                bbox=dict(boxstyle='round,pad=0.4', facecolor='yellow', alpha=0.8),
                arrowprops=dict(arrowstyle='->', color='red', lw=1.5))

ax.set_title('Normal Probability Plot of Effects', fontsize=15, fontweight='bold', pad=20)
ax.set_xlabel('Theoretical Quantiles', fontsize=12)
ax.set_ylabel('Effect', fontsize=12)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nInterpretation: Points far from the line represent real effects.")
print("Points on the line are likely due to random noise.")

# %% [markdown]
# ## 10. Interaction Plot for Top 2 Factors
# 
# Check for potential interactions between the two most important factors.

# %%
# The two factors with the largest absolute effect (results_df is already
# sorted that way): the first goes on the x-axis, the second defines the lines.
top_factors = results_df.head(2)['Factor'].values

if len(top_factors) >= 2:
    x_factor, line_factor = top_factors[0], top_factors[1]
    x_col = factor_columns.index(x_factor)
    line_col = factor_columns.index(line_factor)

    fig, ax = plt.subplots(figsize=(10, 6))

    # One line per level of the second factor; each line joins the mean
    # response at the low and high level of the first factor.
    for level, level_name in ((-1, 'Low'), (1, 'High')):
        at_level = X[:, line_col] == level
        x_levels_here = X[at_level, x_col]
        responses_here = y[at_level]

        endpoint_means = [
            np.mean(responses_here[x_levels_here == -1]),
            np.mean(responses_here[x_levels_here == 1]),
        ]

        ax.plot([-1, 1], endpoint_means, 'o-', linewidth=3,
                markersize=12, label=f"{line_factor}: {level_name}")

    axis_label_style = dict(fontsize=13, fontweight='bold')
    ax.set_xticks([-1, 1])
    ax.set_xticklabels(['Low', 'High'], fontsize=11)
    ax.set_xlabel(x_factor, **axis_label_style)
    ax.set_ylabel(f'Mean {response_column}', **axis_label_style)
    ax.set_title(f'Interaction Plot: {x_factor} × {line_factor}',
                 fontsize=15, fontweight='bold', pad=20)
    ax.legend(fontsize=11, loc='best')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\n".join([
        "\nInterpretation:",
        "- Parallel lines = no interaction (factors act independently)",
        "- Non-parallel lines = interaction present (effect of one factor depends on the other)",
    ]))

# %% [markdown]
# ## 11. Predicted vs Actual Plot

# %%
fig, ax = plt.subplots(figsize=(8, 8))

# Observed response on x, model prediction on y; the dashed diagonal spanning
# the observed range marks where prediction equals observation.
observed_span = [y.min(), y.max()]
ax.scatter(y, y_pred, alpha=0.7, s=100, edgecolors='black', linewidth=1)
ax.plot(observed_span, observed_span, 'r--', lw=2, label='Perfect Prediction')

bold_label = dict(fontsize=13, fontweight='bold')
ax.set_title('Predicted vs Actual Values', fontsize=15, fontweight='bold', pad=20)
ax.set_xlabel(f'Actual {response_column}', **bold_label)
ax.set_ylabel(f'Predicted {response_column}', **bold_label)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)

# R² badge in the upper-left corner, positioned in axes-fraction coordinates.
r2_box = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
ax.text(0.05, 0.95, f'R² = {r2_score:.4f}',
        transform=ax.transAxes, fontsize=12, fontweight='bold',
        verticalalignment='top', bbox=r2_box)

plt.tight_layout()
plt.show()

# %% [markdown]
# ## 12. Residual Analysis

# %%
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, qq_ax = axes[0], axes[1]

# Left panel: residuals against fitted values, with a zero reference line.
scatter_ax.scatter(y_pred, residuals, alpha=0.7, s=100, edgecolors='black', linewidth=1)
scatter_ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
panel_label = dict(fontsize=12, fontweight='bold')
scatter_ax.set_xlabel(f'Predicted {response_column}', **panel_label)
scatter_ax.set_ylabel('Residuals', **panel_label)
scatter_ax.set_title('Residual Plot', fontsize=13, fontweight='bold')
scatter_ax.grid(True, alpha=0.3)

# Right panel: normal Q-Q plot of the same residuals.
stats.probplot(residuals, dist="norm", plot=qq_ax)
qq_ax.set_title('Normal Q-Q Plot of Residuals', fontsize=13, fontweight='bold')
qq_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Reading guide for both panels.
print("\n".join([
    "\nResidual Analysis:",
    "- Random scatter around zero = good model fit",
    "- Pattern in residuals = model may be missing important terms",
    "- Q-Q plot on line = residuals are normally distributed (good)",
]))

# %% [markdown]
# ## 13. Summary and Conclusions

# %%
heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("SUMMARY: CRITICAL FACTORS IDENTIFIED")
print(heavy_rule)

# Split the results table by significance flag: any star counts as critical,
# 'ns' as non-critical.
is_flagged = results_df['Significant'].isin(['***', '**', '*'])
critical_factors = results_df[is_flagged]
non_critical = results_df[results_df['Significant'] == 'ns']

print(f"\n✓ Model Quality: R² = {r2_score:.4f} ({r2_score*100:.1f}% variance explained)")
print(f"\n✓ Number of Critical Factors: {len(critical_factors)} out of {len(factor_columns)}")

if len(critical_factors):
    print("\n✓ CRITICAL FACTORS (statistically significant, p < 0.05):")
    print(light_rule)
    for row in critical_factors.itertuples(index=False):
        raises_response = row.Effect > 0
        direction = "increases" if raises_response else "decreases"
        advice = 'MAXIMIZE' if raises_response else 'MINIMIZE'
        # Actual (uncoded) values behind the coded -1 / +1 levels.
        levels = factor_levels[row.Factor]
        low_val, high_val = levels['low'], levels['high']

        print(f"\n  {row.Factor} {row.Significant}")
        print(f"    • Effect: {row.Effect:.3f} (p-value: {row.p_value:.4f})")
        print(f"    • Going from {low_val:.2f} to {high_val:.2f} {direction} {response_column} by {abs(row.Effect):.2f}")
        print(f"    • Recommendation: {advice} this factor")

if len(non_critical):
    print("\n✓ NON-CRITICAL FACTORS (not statistically significant):")
    print(light_rule)
    for row in non_critical.itertuples(index=False):
        print(f"  • {row.Factor}: Effect = {row.Effect:.3f}, p-value = {row.p_value:.4f}")
        print("    → Can be set at convenient/economical level")

print("\n" + heavy_rule)
print("RECOMMENDATIONS FOR NEXT STEPS")
print(heavy_rule)
print("\n".join([
    "\n1. FOCUS on critical factors for optimization",
    "2. Use Response Surface Methodology (RSM) for the critical factors",
    "3. Consider Box-Behnken or Central Composite Design",
    "4. FIX non-critical factors at convenient/economical levels",
    "5. INVESTIGATE interactions between critical factors if lines are non-parallel",
]))

# Extra caveat when the main-effects model explains under 70% of the variance.
if r2_score < 0.7:
    print("\n".join([
        "\n⚠ WARNING: Low R² suggests:",
        "  - Important factors may be missing from the design",
        "  - Non-linear effects or interactions may be present",
        "  - Experimental error may be high",
    ]))

print("\n" + heavy_rule)

# %% [markdown]
# ## 14. Export Results

# %%
# Save the full per-factor statistics table to CSV (written to the current
# working directory; an existing file of the same name is overwritten).
results_df.to_csv('factor_analysis_results.csv', index=False)
print("\n✓ Results saved to 'factor_analysis_results.csv'")

# Save critical factors summary.
# The text contains non-ASCII characters ("R²"), so the encoding is fixed to
# UTF-8 instead of relying on the platform default, which can raise
# UnicodeEncodeError on some locales.
with open('critical_factors_summary.txt', 'w', encoding='utf-8') as f:
    f.write("CRITICAL FACTORS ANALYSIS\n")
    f.write("="*80 + "\n\n")
    f.write(f"Response Variable: {response_column}\n")
    f.write(f"Model R²: {r2_score:.4f}\n")
    f.write(f"Critical Factors: {len(critical_factors)}/{len(factor_columns)}\n\n")

    f.write("CRITICAL FACTORS:\n")
    f.write("-"*80 + "\n")
    # One short entry per significant factor: name + stars, effect, p-value.
    for idx, row in critical_factors.iterrows():
        f.write(f"\n{row['Factor']} {row['Significant']}\n")
        f.write(f"  Effect: {row['Effect']:.3f}\n")
        f.write(f"  p-value: {row['p_value']:.4f}\n")

print("✓ Summary saved to 'critical_factors_summary.txt'")

# %%