In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
# Q.1 - Simulate a fair dice roll 1000 times using numpy.random.randint().

dice_rolls = np.random.randint(1, 7, size=1000)
print(dice_rolls[:10])
print("\n-----------------------------------------------------\n")

# - Calculate the probability of each outcome using frequency count.

outcome_counts = pd.Series(dice_rolls).value_counts(normalize=True)
print(outcome_counts)
print("\n-----------------------------------------------------\n")

# - Simulate a coin toss 500 times.

coin_tosses = np.random.choice(['Heads', 'Tails'], size=500)
print(coin_tosses[:10])
print("\n-----------------------------------------------------\n")

# - Estimate the probability of getting heads and tails.

coin_outcome_counts = pd.Series(coin_tosses).value_counts(normalize=True)
print(coin_outcome_counts)
print("\n-----------------------------------------------------\n")

# - Use collections. Counter or value_counts() to calculate experimental probability.

from collections import Counter

coin_tosses = ['Heads', 'Tails', 'Heads', 'Heads', 'Tails', 'Tails', 'Heads', 'Tails', 'Heads', 'Tails']
counts = Counter(coin_tosses)
total = sum(counts.values())
probabilities = {outcome: count / total for outcome, count in counts.items()}
print(probabilities)

[4 1 4 4 1 4 2 3 6 3]

-----------------------------------------------------

3    0.178
4    0.173
2    0.171
1    0.165
6    0.158
5    0.155
Name: proportion, dtype: float64

-----------------------------------------------------

['Tails' 'Tails' 'Heads' 'Tails' 'Heads' 'Tails' 'Heads' 'Heads' 'Tails'
 'Heads']

-----------------------------------------------------

Heads    0.518
Tails    0.482
Name: proportion, dtype: float64

-----------------------------------------------------

{'Heads': 0.5, 'Tails': 0.5}


In [None]:
# Q.2 - Simulate two sets of data with slightly different means.

np.random.seed(42)
data1 = np.random.normal(loc=50, scale=5, size=1000)
data2 = np.random.normal(loc=52, scale=5, size=1000)
data1.mean(), data2.mean()
print(f"Mean of data1: {data1.mean()}, Mean of data2: {data2.mean()}")

# - Perform a t-test using scipy.stats.ttest_ind()) to get the p-value.

t_stat, p_value = stats.ttest_ind(data1, data2)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

Mean of data1: 50.096660279111624, Mean of data2: 52.35418118624578
T-statistic: -10.21466312215938, P-value: 6.522675122877795e-24


# - Interpret p-value in markdown. Mention what it implies for the null hypothesis.
The p-value is the probability of obtaining results at least as extreme as the observed results, assuming that the null hypothesis is true.

| **p-value** | **Interpretation**                                                                      |
| ----------- | --------------------------------------------------------------------------------------- |
| `p > 0.05`  | Weak evidence **against** the null hypothesis → **Fail to reject** the null hypothesis. |
| `p ≤ 0.05`  | Strong evidence **against** the null hypothesis → **Reject** the null hypothesis.       |
| `p ≤ 0.01`  | Very strong evidence **against** the null hypothesis → **Reject** with high confidence. |
| `p ≤ 0.001` | Extremely strong evidence **against** the null → Highly significant result.             |


In [24]:
# Q.3 - Create a sample dataset with scores before and after a training program.

before_scores = np.random.normal(loc=70, scale=10, size=30)
after_scores = np.random.normal(loc=75, scale=10, size=30)

# - Use a paired t-test or an independent t-test to check for improvement.

t_stat, p_value = stats.ttest_rel(before_scores, after_scores)
print(f"Paired T-statistic: {t_stat}, P-value: {p_value}")
print("\n-----------------------------------------------------\n")

# - Calculate a confidence interval for the mean using scipy.stats.sem() and stats.t.interval().

n = len(before_scores)
n = len(after_scores)

mean = np.mean(before_scores)
mean = np.mean(after_scores)

sem = stats.sem(before_scores)
sem = stats.sem(after_scores)  # sem = standard deviation / sqrt(n)

confidence = 0.95

confidence_interval = stats.t.interval(confidence, df=n-1, loc=mean, scale=sem)

print("Mean:", mean)
print("95% Confidence Interval:", confidence_interval)

# - Set a significance level (a = 0.05) and determine if the null hypothesis should be rejected.

alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference in scores before and after the training program.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in scores before and after the training program.")

Paired T-statistic: -2.3453508116893143, P-value: 0.026060803515081003

-----------------------------------------------------

Mean: 76.03834766892923
95% Confidence Interval: (np.float64(72.33972084030087), np.float64(79.7369744975576))
Reject the null hypothesis: There is a significant difference in scores before and after the training program.


# Q.4 - In markdown, describe:
# - Type I Error (False Positive)
# - Type II Error (False Negative)

| Decision ↓ / Reality → | H₀ True        | H₀ False         |
| ---------------------- | -------------- | ---------------- |
| **Reject H₀**          | Type I Error ❌ | ✅ Correct        |
| **Fail to Reject H₀**  | ✅ Correct      | Type II Error ⚠️ |