In [None]:
#1.Write a Python program to perform a Z-test for comparing a sample mean to a known population mean and interpret the results
 import numpy as np
from scipy import stats

def z_test(sample, pop_mean, pop_std, alpha=0.05):
    """
    Performs a two-tailed one-sample Z-test.

    Parameters:
        sample (array-like): Sample data
        pop_mean (float): Known population mean
        pop_std (float): Known population standard deviation
        alpha (float): Significance level (default 0.05)

    Returns:
        z_stat (float): Calculated Z statistic
        p_value (float): Two-tailed p-value
        interpretation (str): Interpretation of the test result

    Raises:
        ValueError: If the sample is empty or pop_std is not positive.
    """
    n = len(sample)
    if n == 0:
        raise ValueError("sample must contain at least one observation")
    if pop_std <= 0:
        raise ValueError("pop_std must be positive")

    sample_mean = np.mean(sample)

    # Calculate standard error
    se = pop_std / np.sqrt(n)

    # Distance of the sample mean from the hypothesised mean, in standard errors
    z_stat = (sample_mean - pop_mean) / se

    # Two-tailed p-value; sf() keeps precision in the far tail where
    # 1 - cdf() would round to exactly 0
    p_value = 2 * stats.norm.sf(abs(z_stat))

    if p_value < alpha:
        interpretation = "Reject null hypothesis: sample mean is significantly different from the population mean."
    else:
        interpretation = "Fail to reject null hypothesis: no significant difference from the population mean."

    return z_stat, p_value, interpretation



In [None]:
#2. Simulate random data to perform hypothesis testing and calculate the corresponding P-value using Python
import numpy as np
from scipy import stats

# Set random seed for reproducibility
np.random.seed(42)

# Parameters for population
pop_mean = 50
pop_std = 10

# Simulate a sample of size n from a normal distribution
n = 30
sample = np.random.normal(loc=pop_mean, scale=pop_std, size=n)

# Null-hypothesis mean the sample is tested against
test_mean = 52

# Sample mean and standard error (population std is known, so this is a Z-test)
sample_mean = np.mean(sample)
se = pop_std / np.sqrt(n)

# Z statistic: distance from the hypothesised mean in standard errors
z_stat = (sample_mean - test_mean) / se

# Two-tailed p-value. sf() is used instead of 1 - cdf() because the
# subtraction rounds to exactly 0 once |z| is large.
p_value = 2 * stats.norm.sf(abs(z_stat))

print(f"Sample Mean: {sample_mean:.2f}")
print(f"Z Statistic: {z_stat:.3f}")
print(f"P-value: {p_value:.4f}")

# Interpretation (messages follow test_mean instead of repeating the literal)
alpha = 0.05
if p_value < alpha:
    print(f"Reject null hypothesis: sample mean significantly differs from {test_mean}")
else:
    print(f"Fail to reject null hypothesis: no significant difference from {test_mean}")


In [None]:
#3. Implement a one-sample Z-test using Python to compare the sample mean with the population mean
import numpy as np
from scipy.stats import norm

def one_sample_z_test(sample, pop_mean, pop_std, alpha=0.05):
    """
    Perform a two-tailed one-sample Z-test.

    Parameters:
        sample (array-like): Sample data
        pop_mean (float): Population mean under the null hypothesis
        pop_std (float): Known population standard deviation
        alpha (float): Significance level (default 0.05)

    Returns:
        z_stat (float): Z statistic
        p_value (float): Two-tailed p-value
        result (str): Decision at the given significance level
    """
    n = len(sample)
    sample_mean = np.mean(sample)
    se = pop_std / np.sqrt(n)  # Standard error

    # Calculate Z statistic
    z_stat = (sample_mean - pop_mean) / se

    # Two-tailed p-value. norm.sf() is the upper-tail probability computed
    # directly; 1 - norm.cdf() rounds to exactly 0 for large |z|.
    p_value = 2 * norm.sf(abs(z_stat))

    # Decision
    if p_value < alpha:
        result = "Reject null hypothesis: sample mean is significantly different."
    else:
        result = "Fail to reject null hypothesis: no significant difference."

    return z_stat, p_value, result

# Demo: eight measurements tested against a population whose mean and
# standard deviation are treated as known
population_mean, population_std = 100, 5
sample_data = [101, 98, 105, 100, 102, 99, 97, 104]

z, p, conclusion = one_sample_z_test(
    sample_data,
    population_mean,
    population_std,
)

# One print call, one value per output line
print(f"Z statistic: {z:.3f}", f"P-value: {p:.4f}", conclusion, sep="\n")


In [None]:
#4. Perform a two-tailed Z-test using Python and visualize the decision region on a plot
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

def two_tailed_z_test(sample, pop_mean, pop_std, alpha=0.05):
    """
    Perform a two-tailed one-sample Z-test and plot the decision regions.

    Parameters:
        sample (array-like): Sample data
        pop_mean (float): Population mean under the null hypothesis
        pop_std (float): Known population standard deviation
        alpha (float): Significance level (default 0.05)

    Returns:
        z_stat (float): Z statistic
        p_value (float): Two-tailed p-value
    """
    n = len(sample)
    sample_mean = np.mean(sample)
    se = pop_std / np.sqrt(n)

    # Z statistic
    z_stat = (sample_mean - pop_mean) / se

    # Two-tailed p-value; sf() avoids the precision loss of 1 - cdf()
    p_value = 2 * norm.sf(abs(z_stat))

    # Critical values for rejection region
    z_critical = norm.ppf(1 - alpha/2)

    # Plot the standard normal curve; widen the axis when the statistic
    # falls outside the default [-4, 4] window so its marker sits on the curve
    x_limit = max(4.0, abs(z_stat) + 0.5) if np.isfinite(z_stat) else 4.0
    x = np.linspace(-x_limit, x_limit, 1000)
    y = norm.pdf(x)

    plt.figure(figsize=(10,6))
    plt.plot(x, y, label='Standard Normal Distribution')

    # Shade rejection regions
    plt.fill_between(x, 0, y, where=(x <= -z_critical), color='red', alpha=0.5, label='Reject Region (Left)')
    plt.fill_between(x, 0, y, where=(x >= z_critical), color='red', alpha=0.5, label='Reject Region (Right)')

    # Shade acceptance region
    plt.fill_between(x, 0, y, where=((x > -z_critical) & (x < z_critical)), color='green', alpha=0.3, label='Accept Region')

    # Plot the calculated Z statistic
    plt.axvline(z_stat, color='blue', linestyle='--', linewidth=2, label=f'Z statistic = {z_stat:.3f}')

    plt.title('Two-tailed Z-test: Decision Regions')
    plt.xlabel('Z value')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Hand the numbers back so callers are not limited to reading the plot
    return z_stat, p_value


In [None]:
#5.Create a Python function that calculates and visualizes Type 1 and Type 2 errors during hypothesis testing
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

def plot_type1_type2_errors(pop_mean_null, pop_mean_alt, pop_std, sample_size, alpha=0.05):
    """
    Plot the sampling distributions of the mean under H0 and H1 for a
    two-tailed one-sample Z-test, shade the Type I and Type II error
    regions, and print alpha, beta and the power.

    Parameters:
        pop_mean_null (float): Mean assumed by the null hypothesis H0
        pop_mean_alt (float): Mean assumed by the alternative hypothesis H1
        pop_std (float): Population standard deviation (taken as known)
        sample_size (int): Number of observations in the sample
        alpha (float): Significance level, i.e. the Type I error rate
    """
    std_err = pop_std / np.sqrt(sample_size)
    z_critical = norm.ppf(1 - alpha/2)

    # Sample-mean cut-offs outside which H0 is rejected
    lower_cut = pop_mean_null - z_critical * std_err
    upper_cut = pop_mean_null + z_critical * std_err

    # Horizontal axis: four standard errors beyond whichever mean is outermost
    grid = np.linspace(
        min(pop_mean_null, pop_mean_alt) - 4*std_err,
        max(pop_mean_null, pop_mean_alt) + 4*std_err,
        1000,
    )

    # Sampling distributions of the mean under each hypothesis
    dist_h0 = norm(loc=pop_mean_null, scale=std_err)
    dist_h1 = norm(loc=pop_mean_alt, scale=std_err)
    density_h0 = dist_h0.pdf(grid)
    density_h1 = dist_h1.pdf(grid)

    # beta: probability mass of H1 that lands between the cut-offs
    beta = dist_h1.cdf(upper_cut) - dist_h1.cdf(lower_cut)

    inside_cuts = (grid >= lower_cut) & (grid <= upper_cut)
    outside_cuts = ~inside_cuts

    plt.figure(figsize=(12,6))

    # Density curves
    plt.plot(grid, density_h0, label='Null Hypothesis (H0)', color='blue')
    plt.plot(grid, density_h1, label='Alternative Hypothesis (H1)', color='green')

    # Type I: H0 true but the sample mean falls outside the cut-offs
    plt.fill_between(grid, 0, density_h0, where=outside_cuts, color='red', alpha=0.3, label='Type I Error (α)')

    # Type II: H1 true but the sample mean stays inside the cut-offs
    plt.fill_between(grid, 0, density_h1, where=inside_cuts, color='orange', alpha=0.3, label='Type II Error (β)')

    # Cut-off markers (only the first carries a legend entry)
    plt.axvline(lower_cut, color='red', linestyle='--', label=f'Critical Values ±{z_critical:.2f}σ')
    plt.axvline(upper_cut, color='red', linestyle='--')

    # Hypothesised means
    plt.axvline(pop_mean_null, color='blue', linestyle=':', label='Mean under H0')
    plt.axvline(pop_mean_alt, color='green', linestyle=':', label='Mean under H1')

    plt.title('Type I and Type II Errors in Hypothesis Testing')
    plt.xlabel('Sample Mean')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.grid(True)
    plt.show()

    # alpha is fixed by the caller; beta and power follow from the H1 curve
    print(f"Type I error rate (α): {alpha:.3f}")
    print(f"Type II error rate (β): {beta:.3f}")
    print(f"Power of test (1 - β): {1 - beta:.3f}")

# Demo: H0 mean 100 against H1 mean 105, population std dev 15,
# 30 observations, 5% significance level
plot_type1_type2_errors(
    pop_mean_null=100, pop_mean_alt=105,
    pop_std=15, sample_size=30, alpha=0.05,
)


In [None]:
#6.Write a Python program to perform an independent T-test and interpret the results
import numpy as np
from scipy.stats import ttest_ind

def independent_t_test(sample1, sample2, alpha=0.05):
    """
    Run Welch's two-sample t-test on two independent samples and state
    the decision at the chosen significance level.

    Parameters:
        sample1 (array-like): Observations from the first group
        sample2 (array-like): Observations from the second group
        alpha (float): Significance level (default 0.05)

    Returns:
        t_stat (float): T statistic
        p_value (float): p-value
        interpretation (str): Decision about the null hypothesis
    """
    # equal_var=False selects Welch's variant, which does not assume the
    # two groups share a variance
    t_stat, p_value = ttest_ind(sample1, sample2, equal_var=False)

    interpretation = (
        "Reject null hypothesis: The two sample means are significantly different."
        if p_value < alpha
        else "Fail to reject null hypothesis: No significant difference between the sample means."
    )
    return t_stat, p_value, interpretation

# Demo: two small independent groups
group1, group2 = [23, 20, 22, 25, 30, 27], [31, 35, 29, 32, 30, 28]

t_stat, p_val, result = independent_t_test(group1, group2)

# One print call, one value per output line
print(
    f"T statistic: {t_stat:.3f}",
    f"P-value: {p_val:.4f}",
    f"Result: {result}",
    sep="\n",
)


In [None]:
#7. Perform a paired sample T-test using Python and visualize the comparison results
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel

def paired_sample_t_test(before, after, alpha=0.05):
    """
    Performs a paired sample t-test and visualizes results.

    Parameters:
        before (array-like): Sample data before treatment
        after (array-like): Sample data after treatment (same subjects, same order)
        alpha (float): Significance level

    Returns:
        t_stat (float): t statistic
        p_value (float): p-value
        interpretation (str): Conclusion of the test
    """
    t_stat, p_value = ttest_rel(before, after)

    if p_value < alpha:
        interpretation = "Reject null hypothesis: Significant difference between paired samples."
    else:
        interpretation = "Fail to reject null hypothesis: No significant difference between paired samples."

    # Visualization
    plt.figure(figsize=(8,6))

    # Boxplot of before and after. Tick labels are set through xticks rather
    # than the boxplot 'labels' keyword, which recent matplotlib deprecates.
    plt.boxplot([before, after])
    plt.xticks([1, 2], ['Before', 'After'])

    # Connect paired samples: one line per subject between the two boxes
    # (boxplot places the boxes at x = 1 and x = 2)
    for before_value, after_value in zip(before, after):
        plt.plot([1, 2], [before_value, after_value], color='gray', marker='o', alpha=0.5)

    plt.title('Paired Sample T-test: Before vs After')
    plt.ylabel('Value')
    plt.grid(True)
    plt.show()

    return t_stat, p_value, interpretation


In [None]:
#8. Simulate data and perform both Z-test and T-test, then compare the results using Python
import numpy as np
from scipy.stats import norm, ttest_1samp

# Seed for reproducibility
np.random.seed(123)

# Population parameters (known)
pop_mean = 100
pop_std = 15

# Sample size and simulated data
n = 30
sample = np.random.normal(loc=pop_mean + 2, scale=pop_std, size=n)  # mean shifted by +2

# Hypothesized mean to test against
test_mean = pop_mean

# --- One-sample Z-test (uses the known population std) ---
sample_mean = np.mean(sample)
se_z = pop_std / np.sqrt(n)
z_stat = (sample_mean - test_mean) / se_z
# sf() gives the upper-tail probability without the rounding loss of 1 - cdf()
p_value_z = 2 * norm.sf(abs(z_stat))

# --- One-sample T-test (estimates the std from the sample) ---
t_stat, p_value_t = ttest_1samp(sample, test_mean)

# Results
print("Sample Mean:", sample_mean)
print("\n--- One-sample Z-test ---")
print(f"Z statistic: {z_stat:.4f}")
print(f"P-value: {p_value_z:.4f}")

print("\n--- One-sample T-test ---")
print(f"T statistic: {t_stat:.4f}")
print(f"P-value: {p_value_t:.4f}")

# Interpretation (alpha = 0.05)
alpha = 0.05
print("\nInterpretation:")
print("Z-test:", "Reject H0" if p_value_z < alpha else "Fail to reject H0")
print("T-test:", "Reject H0" if p_value_t < alpha else "Fail to reject H0")


In [None]:
#9.Write a Python function to calculate the confidence interval for a sample mean and explain its significance.
import numpy as np
from scipy.stats import norm, t

def confidence_interval(sample, confidence=0.95, pop_std=None):
    """
    Compute a two-sided confidence interval around the sample mean.

    Parameters:
        sample (array-like): Sample data
        confidence (float): Confidence level (default 0.95)
        pop_std (float or None): Known population std dev; when None the
            sample std dev (ddof=1) and the t-distribution are used instead

    Returns:
        (lower_bound, upper_bound): Bounds of the interval
    """
    n = len(sample)
    center = np.mean(sample)

    # Quantile that leaves (1 - confidence) / 2 in each tail
    upper_quantile = (1 + confidence) / 2

    if pop_std is None:
        # Spread estimated from the data -> t critical value with n - 1 df
        spread = np.std(sample, ddof=1)
        critical = t.ppf(upper_quantile, df=n-1)
    else:
        # Spread known -> normal critical value
        spread = pop_std
        critical = norm.ppf(upper_quantile)

    half_width = critical * (spread / np.sqrt(n))
    return center - half_width, center + half_width

# Demo: 95% interval for ten observations, population std dev unknown
data = [12, 15, 14, 16, 13, 14, 15, 16, 14, 13]

ci_lower, ci_upper = confidence_interval(sample=data, confidence=0.95)
print(f"95% Confidence Interval: ({ci_lower:.2f}, {ci_upper:.2f})")


In [None]:
#10.Write a Python program to calculate the margin of error for a given confidence level using sample data
import numpy as np
from scipy.stats import norm, t

def margin_of_error(sample, confidence=0.95, pop_std=None):
    """
    Calculate the margin of error for a sample mean.

    Parameters:
        sample (array-like): Sample data
        confidence (float): Confidence level (default 0.95)
        pop_std (float or None): Population std dev if known; else use sample std dev

    Returns:
        float: Margin of error (half-width of the two-sided confidence interval)

    Raises:
        ValueError: If the sample is empty.
    """
    n = len(sample)
    if n == 0:
        raise ValueError("sample must contain at least one observation")

    # Quantile that leaves (1 - confidence) / 2 in each tail
    upper_quantile = (1 + confidence) / 2

    if pop_std is not None:
        # Use Z-distribution when the population std dev is known
        se = pop_std / np.sqrt(n)
        critical = norm.ppf(upper_quantile)
    else:
        # Use t-distribution with the sample std dev (ddof=1) otherwise
        sample_std = np.std(sample, ddof=1)
        se = sample_std / np.sqrt(n)
        critical = t.ppf(upper_quantile, df=n-1)

    return critical * se


In [None]:
#11. Implement a Bayesian inference method using Bayes' Theorem in Python and explain the process
def bayes_theorem(prior, likelihood, evidence):
    """
    Apply Bayes' Theorem: P(H|E) = P(E|H) * P(H) / P(E).

    Parameters:
        prior (float): P(H), probability of the hypothesis before the evidence
        likelihood (float): P(E|H), probability of the evidence if H holds
        evidence (float): P(E), overall probability of the evidence

    Returns:
        float: P(H|E), the posterior probability
    """
    joint = likelihood * prior  # P(E and H)
    return joint / evidence

# Worked example: screening test for a rare disease
#   H: the patient has the disease
#   E: the test comes back positive
#
#   prior               = P(H)       -> 1% prevalence
#   likelihood          = P(E|H)     -> 90% sensitivity
#   false_positive_rate = P(E|not H) -> 5% of healthy patients test positive
prior, likelihood, false_positive_rate = 0.01, 0.9, 0.05

# P(E): positives come from true positives plus false positives
evidence = likelihood * prior + false_positive_rate * (1 - prior)

posterior = bayes_theorem(prior, likelihood, evidence)

print(f"Posterior probability (patient has disease given positive test): {posterior:.4f}")


In [None]:
#12. Perform a Chi-square test for independence between two categorical variables in Python
import pandas as pd
from scipy.stats import chi2_contingency

# Survey counts of beverage preference by age group.
# Each dict key is a column (age group); each list holds the Coffee and Tea
# counts in that order, matching the row index given below.
data = {'18-29': [30, 20], '30-49': [40, 35], '50+': [20, 25]}
contingency_table = pd.DataFrame(data, index=['Coffee', 'Tea'])

print("Contingency Table:", contingency_table, sep="\n")

# Chi-square test of independence on the observed table
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Test statistic, p-value and degrees of freedom
print(
    f"\nChi-square Statistic: {chi2:.4f}",
    f"P-value: {p:.4f}",
    f"Degrees of Freedom: {dof}",
    sep="\n",
)

# Expected counts under independence, labelled like the observed table
print(
    "\nExpected Frequencies:",
    pd.DataFrame(expected, index=contingency_table.index, columns=contingency_table.columns),
    sep="\n",
)

# Decision at the 5% level
alpha = 0.05
print(
    "\nConclusion: Reject null hypothesis — there is a significant association between age group and beverage preference."
    if p < alpha
    else "\nConclusion: Fail to reject null hypothesis — no significant association between age group and beverage preference."
)


In [None]:
#13. Write a Python program to calculate the expected frequencies for a Chi-square test based on observed data
import pandas as pd
from scipy.stats import chi2_contingency

# Observed responses by gender.
# Each dict key is a column (response); each list holds the Male and Female
# counts in that order, matching the row index given below.
data = {'Like': [60, 40], 'Dislike': [30, 70]}
observed = pd.DataFrame(data, index=['Male', 'Female'])

print("Observed Frequencies:", observed, sep="\n")

# chi2_contingency returns the expected counts as its fourth element
chi2, p, dof, expected = chi2_contingency(observed)

# Wrap the raw array so it carries the same row/column labels as the input
expected_df = pd.DataFrame(expected, index=observed.index, columns=observed.columns)

print("\nExpected Frequencies:", expected_df.round(2), sep="\n")


In [None]:
#14. Perform a goodness-of-fit test using Python to compare the observed data to an expected distribution
import numpy as np
from scipy.stats import chisquare

# Observed frequencies from a 6-sided die rolled 60 times
observed = np.array([8, 9, 10, 11, 12, 10])  # example data

# Expected frequencies for a fair die: the total spread evenly over the faces.
# Built as a float array on purpose: np.full_like(observed, ...) would inherit
# the integer dtype of `observed` and truncate the value whenever the total is
# not divisible by the number of faces.
expected = np.full(observed.shape, observed.sum() / observed.size, dtype=float)

# Perform Chi-square goodness-of-fit test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

# Output
print("Observed Frequencies:", observed)
print("Expected Frequencies:", expected)
print(f"\nChi-square Statistic: {chi2_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpretation
alpha = 0.05
if p_value < alpha:
    print("Conclusion: Reject null hypothesis — the observed distribution differs significantly from expected.")
else:
    print("Conclusion: Fail to reject null hypothesis — no significant difference from expected distribution.")


In [None]:
#15. Create a Python script to simulate and visualize the Chi-square distribution and discuss its characteristics
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2

def simulate_chi_square(df=5, num_samples=10000):
    """
    Simulate and visualize a Chi-square distribution.

    Draws random Chi-square samples, overlays the theoretical PDF on their
    histogram, and prints the sample mean and variance next to the
    theoretical values (mean = df, variance = 2 * df).

    Parameters:
        df (int): Degrees of freedom
        num_samples (int): Number of samples to simulate
    """
    # Simulate Chi-square distributed data
    chi_square_data = np.random.chisquare(df, size=num_samples)

    # Create x values for PDF, spanning the range the samples cover
    x = np.linspace(0, np.max(chi_square_data), 500)
    pdf = chi2.pdf(x, df)

    # Plot histogram and PDF
    plt.figure(figsize=(10, 6))
    plt.hist(chi_square_data, bins=50, density=True, alpha=0.6, color='skyblue', label='Simulated data')
    plt.plot(x, pdf, 'r-', label=f'Chi-square PDF (df={df})', linewidth=2)
    plt.title(f'Chi-square Distribution (df={df})')
    plt.xlabel('Value')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Characteristics: non-negative, right-skewed, mean = df, variance = 2 * df
    print(f"Sample mean: {np.mean(chi_square_data):.3f} (theoretical: {df})")
    print(f"Sample variance: {np.var(chi_square_data):.3f} (theoretical: {2 * df})")


In [None]:
#16. Implement an F-test using Python to compare the variances of two random samples
import numpy as np
from scipy.stats import f

def f_test(sample1, sample2, alpha=0.05):
    """
    Perform an F-test to compare the variances of two samples.

    The larger sample variance is placed in the numerator, so the returned
    statistic is always >= 1 and the degrees of freedom follow that ordering.

    Parameters:
        sample1, sample2: array-like, the two samples to compare
        alpha: significance level (default 0.05)

    Returns:
        f_stat: F statistic
        p_value: two-tailed p-value
        conclusion: interpretation of the result
    """
    var1 = np.var(sample1, ddof=1)
    var2 = np.var(sample2, ddof=1)

    # Ensure F >= 1 for consistency; the degrees of freedom travel with
    # whichever sample ends up in the numerator
    if var1 > var2:
        f_stat = var1 / var2
        dfn, dfd = len(sample1) - 1, len(sample2) - 1
    else:
        f_stat = var2 / var1
        dfn, dfd = len(sample2) - 1, len(sample1) - 1

    # Two-tailed p-value: twice the smaller tail. The upper tail comes from
    # sf() because 1 - cdf() rounds to exactly 0 for very large ratios.
    p_value = 2 * min(
        f.cdf(f_stat, dfn, dfd),
        f.sf(f_stat, dfn, dfd)
    )

    # Interpretation
    conclusion = "Reject null hypothesis: variances are significantly different." if p_value < alpha else "Fail to reject null hypothesis: no significant difference in variances."

    return f_stat, p_value, conclusion

# Example usage: Simulate two samples with the same mean but different spread
np.random.seed(42)
sample_a = np.random.normal(loc=10, scale=3, size=30)
sample_b = np.random.normal(loc=10, scale=5, size=30)

f_stat, p_val, result = f_test(sample_a, sample_b)

# Report the statistic, the p-value and the decision
print(f"F statistic: {f_stat:.4f}")
print(f"P-value: {p_val:.4f}")
print(result)


In [None]:
#17. Write a Python program to perform an ANOVA test to compare means between multiple groups and interpret the results
import numpy as np
from scipy.stats import f_oneway

# Simulate data for 3 groups (e.g., test scores for 3 different teaching methods)
np.random.seed(0)
group1 = np.random.normal(loc=70, scale=5, size=30)
group2 = np.random.normal(loc=75, scale=5, size=30)
group3 = np.random.normal(loc=80, scale=5, size=30)

# Perform one-way ANOVA
f_stat, p_value = f_oneway(group1, group2, group3)

# Print results
print("ANOVA Test Results:")
print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpretation: H0 says all group means are equal. A small p-value only
# shows that at least one differs, not which one.
alpha = 0.05
if p_value < alpha:
    print("Conclusion: Reject null hypothesis — at least one group mean is significantly different.")
else:
    print("Conclusion: Fail to reject null hypothesis — no significant difference between group means.")


In [None]:
#18.Perform a one-way ANOVA test using Python to compare the means of different groups and plot the results
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

# Step 1: Simulate data for 3 groups
np.random.seed(1)
group_A = np.random.normal(loc=70, scale=5, size=30)
group_B = np.random.normal(loc=75, scale=5, size=30)
group_C = np.random.normal(loc=80, scale=5, size=30)

# Step 2: Perform one-way ANOVA
f_stat, p_value = f_oneway(group_A, group_B, group_C)

print("One-Way ANOVA Results:")
print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Step 3: Interpret the result at alpha = 0.05
alpha = 0.05
if p_value < alpha:
    print("Conclusion: Reject null hypothesis — at least one group mean is significantly different.")
else:
    print("Conclusion: Fail to reject null hypothesis — no significant difference between group means.")

# Step 4: Plot the group distributions side by side
# (boxplot places the boxes at x = 1, 2, 3, which xticks then labels)
plt.figure(figsize=(8, 6))
plt.boxplot([group_A, group_B, group_C])
plt.xticks([1, 2, 3], ['Group A', 'Group B', 'Group C'])
plt.title('One-Way ANOVA: Comparison of Group Distributions')
plt.ylabel('Value')
plt.grid(True)
plt.show()


In [None]:
#19.Write a Python function to check the assumptions (normality, independence, and equal variance) for ANOVA
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

def check_anova_assumptions(*groups):
    """
    Check ANOVA assumptions: normality, independence (informally), and homogeneity of variance.

    Prints a Shapiro-Wilk result per group and a Levene result across
    groups, then shows a boxplot of all groups for a visual check.
    Independence cannot be tested from the data and is only noted.

    Parameters:
        *groups: Variable-length list of groups (arrays or lists)
    """
    print("Assumption Checks for ANOVA:\n")

    # Combine all groups into one array and labels (used by the boxplot)
    data = np.concatenate(groups)
    labels = np.concatenate([[f"Group {i+1}"] * len(group) for i, group in enumerate(groups)])

    # 1. Normality (Shapiro-Wilk Test): p > 0.05 means normality is not rejected
    print("1. Normality (Shapiro-Wilk Test for each group):")
    for i, group in enumerate(groups):
        stat, p = stats.shapiro(group)
        print(f"  Group {i+1}: W={stat:.4f}, p-value={p:.4f} {'(normal)' if p > 0.05 else '(not normal)'}")

    # 2. Homogeneity of Variances (Levene's Test): p > 0.05 means equal
    #    variances are not rejected
    print("\n2. Homogeneity of Variance (Levene’s Test):")
    levene_stat, levene_p = stats.levene(*groups)
    print(f"  Levene’s test: W={levene_stat:.4f}, p-value={levene_p:.4f} {'(equal variances)' if levene_p > 0.05 else '(unequal variances)'}")

    # 3. Independence (assumed from experimental design)
    print("\n3. Independence: Must be assumed from study design. Not testable with just data.\n")

    # Visualization (optional): spread and outliers per group
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=labels, y=data)
    plt.title("Boxplot of Groups (Check for Outliers and Spread)")
    plt.xlabel("Group")
    plt.ylabel("Value")
    plt.grid(True)
    plt.show()


In [None]:
#20. Perform a two-way ANOVA test using Python to study the interaction between two factors and visualize the results
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Simulated design: 3 teaching methods x 2 study-hour levels, 15 scores per cell.
# Rows are ordered method by method; within each method the first 15 rows
# are 'Low' and the next 15 are 'High'.
np.random.seed(42)
data = pd.DataFrame({
    'Score': np.random.normal(75, 10, 90),
    'Teaching_Method': [method for method in 'ABC' for _ in range(30)],
    'Study_Hours': [level for _ in range(3) for level in ('Low', 'High') for _ in range(15)],
})

# Inject an effect: a bonus for high study hours, for methods B and C only
data.loc[data['Teaching_Method'].eq('B') & data['Study_Hours'].eq('High'), 'Score'] += 5
data.loc[data['Teaching_Method'].eq('C') & data['Study_Hours'].eq('High'), 'Score'] += 10

# Fit a model with both main effects and their interaction, then build the
# Type II ANOVA table
model = ols('Score ~ C(Teaching_Method) * C(Study_Hours)', data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print("Two-Way ANOVA Results:\n", anova_table, sep="\n")

# Interaction plot: one line per study-hours level across teaching methods
plt.figure(figsize=(8, 6))
sns.pointplot(
    data=data,
    x='Teaching_Method',
    y='Score',
    hue='Study_Hours',
    dodge=True,
    markers=['o', 's'],
    capsize=.1,
    errwidth=1,
    palette='Set2',
)
plt.title("Interaction between Teaching Method and Study Hours")
plt.ylabel("Average Score")
plt.grid(True)
plt.show()


In [None]:
#21.Write a Python program to visualize the F-distribution and discuss its use in hypothesis testing
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import f

def plot_f_distribution(df1, df2, alpha=0.05):
    """
    Visualizes the F-distribution and marks the critical value for a given alpha.

    Parameters:
        df1 (int): Degrees of freedom numerator
        df2 (int): Degrees of freedom denominator
        alpha (float): Significance level
    """
    # Critical value (right-tail test)
    critical_value = f.ppf(1 - alpha, df1, df2)

    # Generate x values; stretch the axis beyond the default 0..5 when the
    # critical value would otherwise fall off the plot
    x_max = max(5.0, 1.2 * critical_value) if np.isfinite(critical_value) else 5.0
    x = np.linspace(0, x_max, 1000)
    y = f.pdf(x, df1, df2)

    # Plot the F-distribution
    plt.figure(figsize=(10, 6))
    plt.plot(x, y, 'b-', label=f'F-distribution (df1={df1}, df2={df2})')

    # Shade the right-tail rejection region and mark where it starts
    plt.fill_between(x, 0, y, where=(x >= critical_value), color='red', alpha=0.5,
                     label=f'Rejection region (alpha={alpha})')
    plt.axvline(critical_value, color='red', linestyle='--',
                label=f'Critical value = {critical_value:.2f}')

    plt.title('F-distribution with Critical Region')
    plt.xlabel('F value')
    plt.ylabel('Probability Density')
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
#22. Perform a one-way ANOVA test in Python and visualize the results with boxplots to compare group means
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import f_oneway
import seaborn as sns

# Three simulated groups of 30 scores each: means 70, 75 and 80, std dev 5
np.random.seed(42)
group1 = np.random.normal(70, 5, 30)
group2 = np.random.normal(75, 5, 30)
group3 = np.random.normal(80, 5, 30)

# One-way ANOVA across the three groups
f_stat, p_value = f_oneway(group1, group2, group3)

print(
    "One-Way ANOVA Test Results",
    f"F-statistic: {f_stat:.4f}",
    f"P-value: {p_value:.4f}",
    sep="\n",
)

# Decision at the 5% level
print(
    "Conclusion: At least one group mean is significantly different."
    if p_value < 0.05
    else "Conclusion: No significant difference between group means."
)

# Long-format table (one row per score) for the seaborn boxplot
df = pd.DataFrame({
    'Score': np.concatenate((group1, group2, group3)),
    'Group': ['Group 1'] * group1.size + ['Group 2'] * group2.size + ['Group 3'] * group3.size,
})

# Side-by-side boxplots of the group distributions
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='Group', y='Score')
plt.title('Boxplot of Scores by Group')
plt.ylabel('Scores')
plt.grid(True)
plt.show()


In [None]:
#23.Simulate random data from a normal distribution, then perform hypothesis testing to evaluate the means
import numpy as np
from scipy.stats import ttest_ind

# Step 1: Draw two independent normal samples of 40 values each
# (means 50 and 55, common std dev 10)
np.random.seed(123)
group1 = np.random.normal(50, 10, 40)
group2 = np.random.normal(55, 10, 40)

# Step 2: Pooled-variance (Student) two-sample t-test
t_stat, p_value = ttest_ind(group1, group2, equal_var=True)

# Step 3: Report the statistic and p-value
print(
    "Two-Sample Independent t-test Results",
    f"t-statistic: {t_stat:.4f}",
    f"p-value: {p_value:.4f}",
    sep="\n",
)

# Step 4: Decision at alpha = 0.05
alpha = 0.05
print(
    "Conclusion: Reject the null hypothesis — means are significantly different."
    if p_value < alpha
    else "Conclusion: Fail to reject the null hypothesis — no significant difference in means."
)


In [None]:
#24. Simulate random data from a normal distribution, then perform hypothesis testing to evaluate the means
import numpy as np
from scipy.stats import ttest_1samp

# Simulation settings
np.random.seed(0)
population_mean = 50    # mean under the null hypothesis
sample_size = 40
sample_mean_shift = 52  # mean the simulated data is actually drawn with
sample_std = 10

# Step 1: Draw the sample from a normal distribution centred on the shifted mean
sample_data = np.random.normal(sample_mean_shift, sample_std, sample_size)

# Step 2: One-sample t-test against the hypothesised population mean
t_stat, p_value = ttest_1samp(sample_data, popmean=population_mean)

# Step 3: Report
print(
    "One-Sample t-Test Results",
    f"Sample mean: {np.mean(sample_data):.2f}",
    f"t-statistic: {t_stat:.4f}",
    f"p-value: {p_value:.4f}",
    sep="\n",
)

# Step 4: Decision at alpha = 0.05
alpha = 0.05
print(
    "Conclusion: Reject the null hypothesis — sample mean is significantly different from population mean."
    if p_value < alpha
    else "Conclusion: Fail to reject the null hypothesis — no significant difference from population mean."
)


In [None]:
#25. Write a Python script to perform a Z-test for comparing proportions between two datasets or groups
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

# Per-group success counts and group sizes (index 0 = group 1, index 1 = group 2)
successes = np.array([45, 30])
samples = np.array([100, 80])

# Two-proportion z-test; with default options it returns the statistic and p-value
stat, p_value = proportions_ztest(successes, samples)

print(
    "Two-Proportion Z-Test Results",
    f"Z-statistic: {stat:.4f}",
    f"P-value: {p_value:.4f}",
    sep="\n",
)

# Decision at alpha = 0.05
alpha = 0.05
print(
    "Conclusion: Reject null hypothesis — proportions are significantly different."
    if p_value < alpha
    else "Conclusion: Fail to reject null hypothesis — no significant difference between proportions."
)


In [None]:
#26.Implement an F-test for comparing the variances of two datasets, then interpret and visualize the results
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import f

# Simulate two datasets
np.random.seed(42)
data1 = np.random.normal(loc=0, scale=5, size=30)   # std dev = 5
data2 = np.random.normal(loc=0, scale=3, size=30)   # std dev = 3

# Calculate sample variances
var1 = np.var(data1, ddof=1)
var2 = np.var(data2, ddof=1)

# F statistic with the larger variance in the numerator. The degrees of
# freedom must follow the same ordering, otherwise the reference
# distribution is wrong whenever the two samples differ in size.
if var1 > var2:
    F = var1 / var2
    df1, df2 = len(data1) - 1, len(data2) - 1
else:
    F = var2 / var1
    df1, df2 = len(data2) - 1, len(data1) - 1

# Two-tailed p-value: twice the smaller tail. sf() is used for the upper
# tail because 1 - cdf() rounds to exactly 0 for very large ratios.
p_value = 2 * min(f.cdf(F, df1, df2), f.sf(F, df1, df2))

print(f"Variance of data1: {var1:.4f}")
print(f"Variance of data2: {var2:.4f}")
print(f"F-statistic: {F:.4f}")
print(f"P-value: {p_value:.4f}")

alpha = 0.05
if p_value < alpha:
    print("Conclusion: Reject the null hypothesis — variances are significantly different.")
else:
    print("Conclusion: Fail to reject the null hypothesis — no significant difference in variances.")

# Critical values for two-tailed test
F_critical_low = f.ppf(alpha/2, df1, df2)
F_critical_high = f.ppf(1 - alpha/2, df1, df2)

# Visualization: plot F-distribution and mark critical regions. The axis is
# stretched beyond the default 0..5 if the statistic or the upper critical
# value would otherwise fall off the plot.
x_max = max(5.0, 1.2 * F, 1.2 * F_critical_high)
x = np.linspace(0, x_max, 1000)
y = f.pdf(x, df1, df2)

plt.figure(figsize=(10, 6))
plt.plot(x, y, label='F-distribution')
plt.fill_between(x, y, where=(x <= F_critical_low), color='red', alpha=0.5, label='Rejection region (low)')
plt.fill_between(x, y, where=(x >= F_critical_high), color='red', alpha=0.5, label='Rejection region (high)')
plt.axvline(F, color='blue', linestyle='--', label=f'F statistic = {F:.2f}')
plt.axvline(F_critical_low, color='red', linestyle='--', label=f'Critical low = {F_critical_low:.2f}')
plt.axvline(F_critical_high, color='red', linestyle='--', label=f'Critical high = {F_critical_high:.2f}')
plt.title('F-distribution with Critical Regions for Variance Test')
plt.xlabel('F value')
plt.ylabel('Probability Density')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
#27.Perform a Chi-square test for goodness of fit with simulated data and analyze the results.
import numpy as np
from scipy.stats import chisquare

# Step 1: Observed counts for each of the 4 categories
observed = np.array([50, 30, 10, 10])

# Step 2: Expected counts under the null hypothesis.
# The null here is a uniform distribution: each category has probability 0.25,
# so the expected count is that probability times the total number of observations.
expected_prob = np.full(4, 0.25)
total_count = np.sum(observed)
expected = total_count * expected_prob

# Step 3: Chi-square goodness-of-fit test of observed vs. expected counts
test_result = chisquare(observed, f_exp=expected)
chi2_stat = test_result.statistic
p_value = test_result.pvalue

# Step 4: Report the statistic and p-value
print("Chi-square Goodness-of-Fit Test")
print(f"Chi-square statistic: {chi2_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Decision at the 5% significance level
alpha = 0.05
if p_value >= alpha:
    conclusion = "Conclusion: Fail to reject null hypothesis — observed distribution fits expected well."
else:
    conclusion = "Conclusion: Reject null hypothesis — observed distribution differs from expected."
print(conclusion)


1.What is hypothesis testing in statistics?
ans.Hypothesis testing in statistics is a formal method used to make decisions or inferences about a population based on sample data. It helps determine whether there is enough evidence to support a specific claim (hypothesis) about a population parameter.


2.What is the null hypothesis, and how does it differ from the alternative hypothesis?
ans.The null hypothesis (H₀) is the default assumption that there is no effect, no difference, or no relationship in the population. The alternative hypothesis (H₁) is what you want to test or prove.
It states that there is an effect, a difference, or a relationship.
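3.What is the significance level in hypothesis testing, and why is it important?
The significance level (α) is the threshold, chosen before the test is run, against which the P-value is compared. It represents the maximum probability of making a Type I error, which is rejecting the null hypothesis when it is actually true. It is important because it fixes in advance how much risk of a false positive you are willing to accept.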

Commonly set values are 0.05 (5%), 0.01 (1%), or 0.10 (10%).
4.What does a P-value represent in hypothesis testing?
What is a P-value?
A P-value is the probability of obtaining test results at least as extreme as the observed results, assuming the null hypothesis (H₀) is true.
5.How do you interpret the P-value in hypothesis testing?
Compare the P-value to your significance level (α) — usually 0.05:

If P-value ≤ α:
The observed data is unlikely under the null hypothesis.
Action: Reject the null hypothesis (H₀).
Meaning: There is statistically significant evidence to support the alternative hypothesis (H₁).

If P-value > α:
The observed data is consistent with the null hypothesis.
Action: Fail to reject the null hypothesis.
Meaning: There is insufficient evidence to support the alternative hypothesis.

6.What are Type 1 and Type 2 errors in hypothesis testing
Type 1 Error (False Positive)
Occurs when you reject the null hypothesis (H₀) even though it is actually true.

In other words, you claim there is an effect/difference when there isn’t one.

The probability of making a Type 1 error is the significance level (α), commonly 0.05.

Example: Concluding a new drug works when it actually doesn’t.

Type 2 Error (False Negative)
Happens when you fail to reject the null hypothesis (H₀) even though the alternative hypothesis (H₁) is true.

You miss detecting a real effect or difference.

The probability of making a Type 2 error is denoted by β.

The power of a test (1 - β) is the probability of correctly rejecting a false null hypothesis.

Example: Concluding a drug doesn’t work when it actually does.

7.What is the difference between a one-tailed and a two-tailed test in hypothesis testing
One-tailed Test
Tests for an effect in only one direction (either greater than or less than).

Hypotheses example:

H₀: The mean is ≤ 10

H₁: The mean is > 10
(testing if mean is significantly greater than 10)

The critical region (rejection area) is only on one side of the sampling distribution.

Used when you have a specific direction in mind.

Two-tailed Test
Tests for an effect in both directions (whether greater or less than).

Hypotheses example:

H₀: The mean = 10

H₁: The mean ≠ 10
(testing if mean is significantly different from 10, either higher or lower)

The critical region is split between both tails of the sampling distribution.

Used when you want to detect any difference, regardless of direction.
8.What is the Z-test, and when is it used in hypothesis testing
A Z-test is a statistical test used to determine whether there is a significant difference between the sample mean and a known population mean (or between two means) when the population variance is known or the sample size is large (typically n > 30).
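9. How do you calculate the Z-score, and what does it represent in hypothesis testing?
A Z-score (or standard score) measures how many standard deviations a data point (or sample mean) is from the population mean. For a single data point, $Z = \frac{x - \mu}{\sigma}$; for a sample mean of size $n$, $Z = \frac{\bar{x} - \mu}{\sigma / \sqrt{n}}$.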
10.What is the T-distribution, and when should it be used instead of the normal distribution?
What is the T-distribution?
The T-distribution (or Student’s t-distribution) is a probability distribution that is similar to the normal distribution but has heavier tails. This means it accounts for more variability, especially in the extremes.

When should you use the T-distribution instead of the Normal distribution?
When the sample size is small (usually $n < 30$).

When the population standard deviation (σ) is unknown and must be estimated from the sample.
The T-distribution adjusts for the extra uncertainty due to estimating the standard deviation from limited data.


11.What is the difference between a Z-test and a T-test?
Use Z-test when you know the population standard deviation or have a large sample.

Use T-test when the population standard deviation is unknown and/or the sample size is small, so you estimate variability from your sample
12.What is a T-test?
A T-test is a statistical test used to determine whether there is a significant difference between the means of two groups or between a sample mean and a population mean when the population standard deviation is unknown and the sample size is small.

It uses the Student’s T-distribution, which accounts for extra uncertainty due to estimating the standard deviation from the sample.

13.What is the relationship between Z-test and T-test in hypothesis testing?
Relationship Between Z-test and T-test:
Both tests are used to compare sample means to a population mean or to compare means between groups.

The main difference lies in whether the population standard deviation (σ) is known:

If σ is known (or sample size is large enough to approximate it), you use a Z-test.

If σ is unknown and estimated from the sample, especially with a small sample size (n < 30), you use a T-test.
14.What is a confidence interval, and how is it used to interpret statistical results
A confidence interval is a range of values, calculated from sample data, that is likely to contain the true population parameter (e.g., mean) with a certain level of confidence.

For example, a 95% confidence interval means that if you repeated the sampling many times, approximately 95% of those intervals would contain the true population mean
15.What is the margin of error, and how does it affect the confidence interval?
The Margin of Error is the amount added and subtracted from the sample estimate to create the confidence interval. It represents the maximum expected difference between the true population parameter and the sample estimate due to sampling variability.
16.How is Bayes' Theorem used in statistics, and what is its significance
Bayes’ Theorem describes how to update the probability of a hypothesis based on new evidence or data. It’s a foundational concept in Bayesian statistics
17.What is the Chi-square distribution, and when is it used
The Chi-square distribution is a continuous probability distribution that arises from the sum of the squares of independent standard normal random variables.
18.What is the Chi-square goodness of fit test, and how is it applied?
The Chi-square goodness-of-fit test is a statistical test used to determine whether observed categorical data matches an expected distribution.
19.What is the F-distribution, and when is it used in hypothesis testing
The F-distribution is a continuous probability distribution that arises as the ratio of two scaled chi-square distributions. It is characterized by two sets of degrees of freedom
20.What is an ANOVA test, and what are its assumptions?
ANOVA stands for Analysis of Variance. It is a statistical method used to compare the means of three or more independent groups to see if at least one group mean differs significantly from the others.


