<a href="https://colab.research.google.com/github/Priyankapawar1224/Simple-and-Multiple-Regression/blob/main/hypothesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from scipy import stats
import numpy as np

# Load the dataset
df = pd.read_csv('heart_disease.csv')

# Display initial information (optional, but good for inspection)
print("--- Initial Data Inspection ---")
print(df.head())
print("\n")
df.info()

--- Initial Data Inspection ---
    Age  Gender  Blood Pressure  Cholesterol Level Exercise Habits Smoking  \
0  56.0    Male           153.0              155.0            High     Yes   
1  69.0  Female           146.0              286.0            High      No   
2  46.0    Male           126.0              216.0             Low      No   
3  32.0  Female           122.0              293.0            High     Yes   
4  60.0    Male           166.0              242.0             Low     Yes   

  Family Heart Disease Diabetes        BMI High Blood Pressure  ...  \
0                  Yes       No  24.991591                 Yes  ...   
1                  Yes      Yes  25.221799                  No  ...   
2                   No       No  29.855447                  No  ...   
3                  Yes       No  24.130477                 Yes  ...   
4                  Yes      Yes  20.486289                 Yes  ...   

  High LDL Cholesterol Alcohol Consumption Stress Level Sleep Hours  \
0

In [2]:
# 1. Handle missing values in the relevant columns
df_clean = df.dropna(subset=['Age', 'Heart Disease Status']).copy()

# 2. Separate the 'Age' data into two groups based on 'Heart Disease Status'
age_yes = df_clean[df_clean['Heart Disease Status'] == 'Yes']['Age']
age_no = df_clean[df_clean['Heart Disease Status'] == 'No']['Age']

# 3. Calculate and display sample statistics
n_yes = len(age_yes)
n_no = len(age_no)
mean_yes = np.mean(age_yes)
mean_no = np.mean(age_no)

print("\n--- Summary Statistics ---")
print(f"Heart Disease (Yes): N = {n_yes}, Mean Age = {mean_yes:.2f}")
print(f"Heart Disease (No): N = {n_no}, Mean Age = {mean_no:.2f}")


--- Summary Statistics ---
Heart Disease (Yes): N = 1996, Mean Age = 48.96
Heart Disease (No): N = 7975, Mean Age = 49.38


In [3]:
# Define the significance level
alpha = 0.05

# A. Levene's Test for Equality of Variances
levene_result = stats.levene(age_yes, age_no)

print("\n--- Levene's Test for Equality of Variances ---")
print(f"F-statistic: {levene_result.statistic:.4f}, p-value: {levene_result.pvalue:.4f}")

# Determine the 'equal_var' parameter for the t-test based on Levene's result
# If p-value >= alpha, we assume equal variances (Standard t-test).
equal_var = levene_result.pvalue >= alpha
test_type = "Standard (equal_var=True)" if equal_var else "Welch's (equal_var=False)"

# B. Two-Sample Independent t-test
t_test_result = stats.ttest_ind(age_yes, age_no, equal_var=equal_var)

print(f"\n--- Two-Sample Independent t-test ({test_type}) ---")
print(f"t-statistic: {t_test_result.statistic:.4f}, p-value: {t_test_result.pvalue:.4f}")

# C. Conclusion
print("\n--- Conclusion ---")
if t_test_result.pvalue < alpha:
    print(f"Reject H0: The difference in mean age is statistically significant (p-value < {alpha}).")
else:
    print(f"Fail to Reject H0: The difference in mean age is NOT statistically significant (p-value >= {alpha}).")


--- Levene's Test for Equality of Variances ---
F-statistic: 0.0037, p-value: 0.9516

--- Two-Sample Independent t-test (Standard (equal_var=True)) ---
t-statistic: -0.9235, p-value: 0.3558

--- Conclusion ---
Fail to Reject H0: The difference in mean age is NOT statistically significant (p-value >= 0.05).


In [22]:
import pandas as pd
from scipy.stats import ttest_ind

df = pd.read_csv('/content/heart_disease.csv')

group1 = df[df['Family Heart Disease']=='Yes']['Age']
group2 = df[df['Family Heart Disease']=='No']['Age']

t_stat, p_val = ttest_ind(group1, group2, nan_policy='omit')

print("Two Sample t-test")
print("t-statistic:", t_stat)
print("p-value:", p_val)

alpha = 0.05
if p_val < alpha:
    print("Conclusion: Reject the null hypothesis. Mean ages are significantly different.")
else:
    print("Conclusion: Fail to reject the null hypothesis. No significant difference in means.")


Two Sample t-test
t-statistic: -1.9537391883167072
p-value: 0.05076005523560995
Conclusion: Fail to reject the null hypothesis. No significant difference in means.


In [23]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel

df = pd.read_csv('/content/heart_disease.csv')

df['Chol_after'] = df['Cholesterol Level'] + np.random.normal(1, 1, len(df))

t_stat, p_val = ttest_rel(df['Cholesterol Level'], df['Chol_after'])

print("Paired Sample t-test")
print("t-statistic:", t_stat)
print("p-value:", p_val)

alpha = 0.05
if p_val < alpha:
    print("Conclusion: Reject H0. There is a significant difference before vs after.")
else:
    print("Conclusion: Fail to reject H0. No significant difference before vs after.")


Paired Sample t-test
t-statistic: nan
p-value: nan
Conclusion: Fail to reject H0. No significant difference before vs after.


In [24]:
import pandas as pd
from statsmodels.stats.weightstats import ztest

df = pd.read_csv('/content/heart_disease.csv')

sample = df['Age'].dropna()

z_stat, p_val = ztest(sample, value=50)

print("One Sample Z-test")
print("z-statistic:", z_stat)
print("p-value:", p_val)

alpha = 0.05
if p_val < alpha:
    print("Conclusion: Reject H0. Mean age is significantly different from 50.")
else:
    print("Conclusion: Fail to reject H0. Mean age is not different from 50.")


One Sample Z-test
z-statistic: -3.8623768252851227
p-value: 0.00011228918474820948
Conclusion: Reject H0. Mean age is significantly different from 50.


In [25]:
import pandas as pd
from statsmodels.stats.weightstats import ztest

df = pd.read_csv('/content/heart_disease.csv')

group1 = df[df['Family Heart Disease']=='Yes']['Age']
group2 = df[df['Family Heart Disease']=='No']['Age']

z_stat, p_val = ztest(group1, group2)

print("Two Sample Z-test")
print("z-statistic:", z_stat)
print("p-value:", p_val)

alpha = 0.05
if p_val < alpha:
    print("Conclusion: Reject H0. There is a significant difference in means.")
else:
    print("Conclusion: Fail to reject H0. No significant difference in means.")



Two Sample Z-test
z-statistic: nan
p-value: nan
Conclusion: Fail to reject H0. No significant difference in means.


In [26]:
import pandas as pd
from scipy.stats import chi2_contingency

df = pd.read_csv('/content/heart_disease.csv')

table = pd.crosstab(df['Gender'], df['Family Heart Disease'])

chi2, p_val, dof, expected = chi2_contingency(table)

print("Chi-Square Test")
print("Chi2 statistic:", chi2)
print("p-value:", p_val)

alpha = 0.05
if p_val < alpha:
    print("Conclusion: Reject H0. Variables are associated.")
else:
    print("Conclusion: Fail to reject H0. No association found.")


Chi-Square Test
Chi2 statistic: 0.0063293774772381515
p-value: 0.9365892897300181
Conclusion: Fail to reject H0. No association found.


ANOVA (F-Test)

In [29]:
import pandas as pd
from scipy.stats import f_oneway

df = pd.read_csv('/content/heart_disease.csv')

group_m = df[df['Gender']=='Male']['Age']
group_f = df[df['Gender']=='Female']['Age']

F_stat, p_val = f_oneway(group_m, group_f)

print("ANOVA F-test")
print("F-statistic:", F_stat)
print("p-value:", p_val)

alpha = 0.05
if p_val < alpha:
    print("Conclusion: Reject H0. At least one group mean is different.")
else:
    print("Conclusion: Fail to reject H0. Group means are similar.")


ANOVA F-test
F-statistic: nan
p-value: nan
Conclusion: Fail to reject H0. Group means are similar.
