# 1. One-Sample T-Test
This test determines whether the mean math score of the student population, estimated from a random sample of 100 students, differs significantly from a hypothesized value. Here we hypothesize that the mean math score is 65.

Null Hypothesis (H0): The average math score of the students is 65.

Alternative Hypothesis (H1): The average math score of the students is not 65.

In [None]:
import pandas as pd
from scipy import stats

# Read the student performance records from the CSV file
df = pd.read_csv('StudentsPerformance.csv')

# Draw 100 students at random; the fixed seed keeps the sample reproducible
sample_df = df.sample(n=100, random_state=42)
sampled_math_scores = sample_df['math score']

# Mean math score assumed under the null hypothesis
population_mean_math = 65

# One-sample t-test: sampled math scores against the hypothesized mean
t_test_result = stats.ttest_1samp(sampled_math_scores, population_mean_math)
t_statistic = t_test_result.statistic
p_value = t_test_result.pvalue

print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Pick the conclusion at the 5% significance level
verdict = (
    "Reject the null hypothesis: The average math score is significantly different from 65."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is not enough evidence to say the average math score is different from 65."
)
print(verdict)

# 2. Two-Sample (Independent) T-Test
Here, we will compare the mean math score between male and female students.

Null Hypothesis (H0): There is no difference in the mean math score between male and female students.

Alternative Hypothesis (H1): There is a significant difference in the mean math score between male and female students.

In [None]:
import pandas as pd
from scipy import stats

# Read the student performance records from the CSV file
df = pd.read_csv('StudentsPerformance.csv')

# Build one boolean mask per gender, then pull the matching math scores
is_male = df['gender'] == 'male'
is_female = df['gender'] == 'female'
male_scores = df.loc[is_male, 'math score']
female_scores = df.loc[is_female, 'math score']

# Independent two-sample t-test; equal_var=False selects Welch's variant,
# which does not assume the two groups share a variance
t_statistic, p_value = stats.ttest_ind(
    male_scores,
    female_scores,
    equal_var=False,
)

print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Pick the conclusion at the 5% significance level
verdict = (
    "Reject the null hypothesis: There is a significant difference in math scores between male and female students."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is no significant difference in math scores between male and female students."
)
print(verdict)

# 3. Paired Sample T-Test

This test determines whether there is a significant difference between students' math scores and their reading scores. Because both scores come from the same student, the observations are paired.

Null Hypothesis (H0): There is no significant difference between the mean math score and the mean reading score.

Alternative Hypothesis (H1): There is a significant difference between the mean math score and the mean reading score.

In [None]:
import pandas as pd
from scipy import stats

# Read the student performance records from the CSV file
df = pd.read_csv('StudentsPerformance.csv')

# Each row holds both scores for one student, so the two columns are paired
math_scores = df['math score']
reading_scores = df['reading score']

# Paired (related-samples) t-test on the two score columns
paired_result = stats.ttest_rel(math_scores, reading_scores)
t_statistic = paired_result.statistic
p_value = paired_result.pvalue

print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Pick the conclusion at the 5% significance level
verdict = (
    "Reject the null hypothesis: There is a statistically significant difference between math and reading scores."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is no statistically significant difference between math and reading scores."
)
print(verdict)

# 4. One-Sample Z-Test
We'll use a Z-test to see if the average reading score of all students in the dataset is significantly different from 70.

Null Hypothesis (H0): The average reading score is 70.

Alternative Hypothesis (H1): The average reading score is not 70.

In [None]:
import pandas as pd
from statsmodels.stats import weightstats as stests

# Read the student performance records from the CSV file
df = pd.read_csv('StudentsPerformance.csv')

# One-sample z-test: reading scores against the hypothesized mean of 70
reading_scores = df['reading score']
z_statistic, p_value = stests.ztest(reading_scores, value=70)

print(f"Z-statistic: {z_statistic}")
print(f"P-value: {p_value}")

# Pick the conclusion at the 5% significance level
verdict = (
    "Reject the null hypothesis: The average reading score is significantly different from 70."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is not enough evidence to say the average reading score is different from 70."
)
print(verdict)

# 5. Two-Sample Z-Test
This test will compare the mean writing score of students who completed the test preparation course versus those who did not.

Null Hypothesis (H0): There is no difference in the mean writing score between the two groups.

Alternative Hypothesis (H1): There is a significant difference in the mean writing score.

In [None]:
import pandas as pd
from statsmodels.stats import weightstats as stests

# Read the student performance records from the CSV file
df = pd.read_csv('StudentsPerformance.csv')

# Split the writing scores by test-preparation status
prep_status = df['test preparation course']
completed_scores = df.loc[prep_status == 'completed', 'writing score']
none_scores = df.loc[prep_status == 'none', 'writing score']

# Two-sample z-test; value=0 is the difference in means under the null
z_statistic, p_value = stests.ztest(
    completed_scores,
    none_scores,
    value=0,
)

print(f"Z-statistic: {z_statistic}")
print(f"P-value: {p_value}")

# Pick the conclusion at the 5% significance level
verdict = (
    "Reject the null hypothesis: There is a significant difference in writing scores based on test preparation."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is no significant difference in writing scores."
)
print(verdict)

# 6. Chi-Square Test
We'll test for an association between the race/ethnicity of the students and their parental level of education.

Null Hypothesis (H0): There is no association between race/ethnicity and parental level of education.

Alternative Hypothesis (H1): There is an association between race/ethnicity and parental level of education.

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Read the student performance records from the CSV file
df = pd.read_csv('StudentsPerformance.csv')

# Cross-tabulate the two categorical columns:
# rows are race/ethnicity groups, columns are parental education levels
contingency_table = pd.crosstab(
    index=df['race/ethnicity'],
    columns=df['parental level of education'],
)
print("Contingency Table:")
print(contingency_table)

# Chi-square test of independence on the observed counts
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-square Statistic: {chi2_stat}")
print(f"P-value: {p_value}")
print(f"Degrees of Freedom: {dof}")

# Pick the conclusion at the 5% significance level
verdict = (
    "\nReject the null hypothesis: There is a significant association between race/ethnicity and parental level of education."
    if p_value < 0.05
    else "\nFail to reject the null hypothesis: There is no significant association."
)
print(verdict)

# 7. ANOVA (F-Test)
## One-Way ANOVA
This will test if the mean math score differs across the different race/ethnicity groups.

Null Hypothesis (H0): The mean math score is the same for all race/ethnicity groups.

Alternative Hypothesis (H1): The mean math score is different for at least one group.

In [None]:
import pandas as pd
from scipy import stats

# Read the student performance records from the CSV file
df = pd.read_csv('StudentsPerformance.csv')

# Collect one Series of math scores per race/ethnicity group,
# in the order the groups first appear in the data
math_scores = df['math score']
ethnicity = df['race/ethnicity']
grouped_data = [
    math_scores[ethnicity == group]
    for group in ethnicity.unique()
]

# One-way ANOVA across all groups
f_statistic, p_value = stats.f_oneway(*grouped_data)

print(f"F-statistic: {f_statistic}")
print(f"P-value: {p_value}")

# Pick the conclusion at the 5% significance level
verdict = (
    "Reject the null hypothesis: There is a significant difference in math scores across race/ethnicity groups."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: There is no significant difference in math scores."
)
print(verdict)

## Two-Way ANOVA
We will examine the influence of gender and test preparation course on the math score.

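Null Hypothesis (H0): Gender has no effect on the mean math score, the test preparation course has no effect on the mean math score, and there is no interaction between the two factors.

Alternative Hypothesis (H1): At least one of these effects (gender, test preparation course, or their interaction) is present.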

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Load the dataset
df = pd.read_csv('StudentsPerformance.csv')

# Rename columns so they can be used as bare terms in the model formula
# (the original names contain spaces and '/').
df = df.rename(columns={
    'race/ethnicity': 'race_ethnicity',
    'parental level of education': 'parental_education',
    'test preparation course': 'test_prep_course',
    'math score': 'math_score',
    'reading score': 'reading_score',
    'writing score': 'writing_score',
})

# Perform two-way ANOVA: main effects of gender and test preparation course
# plus their interaction (the '*' operator expands to a + b + a:b).
model = ols('math_score ~ C(gender) * C(test_prep_course)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

print(anova_table)

# Interpret the results, one conclusion per model term at the 5% level.
# The Residual row has no p-value (NaN), so dropna() leaves only the effects.
print()
for effect, effect_p_value in anova_table['PR(>F)'].dropna().items():
    if effect_p_value < 0.05:
        print(f"Reject the null hypothesis for {effect}: The effect on math score is significant (p = {effect_p_value}).")
    else:
        print(f"Fail to reject the null hypothesis for {effect}: The effect on math score is not significant (p = {effect_p_value}).")