# Introduction

In [None]:
# Ask the notebook front end to autosave this notebook every 5 seconds
%autosave 5

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import data
# NOTE: 'student.csv' is a relative path, so it is resolved against the kernel's working directory
data = pd.read_csv('student.csv')

# Show first five rows
data.head()

In [None]:
# Create function to calculate weighted average of alcohol consumption
def weighted_alc(Dalc, Walc):
    """Return the weighted average of two alcohol-consumption scores.

    `Dalc` is weighted by 5 and `Walc` by 2; the weighted sum is divided by
    the total weight (7). The arithmetic is element-wise, so pandas Series
    and NumPy arrays work as well as plain numbers.

    NOTE(review): the 5/2 weights presumably stand for five workdays and two
    weekend days — confirm against the data set documentation.
    """
    total_weight = 5 + 2
    weighted_sum = 5 * Dalc + 2 * Walc
    return weighted_sum / total_weight

# Create variable `alc`: `Low` when the weighted consumption score is at most 2, `High` otherwise
data['alc'] = (
    weighted_alc(data['Dalc'], data['Walc'])
    .le(2)
    .map({True: 'Low', False: 'High'})
)

# Preview the first five labels
data['alc'].head()

# Confidence intervals

In [None]:
# Import scipy.stats
import scipy.stats as stats

## Confidence interval for a mean

In [None]:
# Find sample mean of final grade (column `G3`)
sample_mean_grade = data['G3'].mean()
sample_mean_grade

In [None]:
# Find sample size (number of rows in the data set)
sample_size = data.shape[0]
sample_size

In [None]:
# Find standard error of the mean of final grade: sample standard deviation / sqrt(n)
# NOTE(review): `sample_size` counts every row, whereas `.std()` skips missing values,
# so this assumes `G3` has no missing values — confirm.
std_error_grade = data['G3'].std() / np.sqrt(sample_size)
std_error_grade

In [None]:
# Calculate the 95% CI for mean final grade, using a normal distribution
# centred on the sample mean with the standard error as its scale
stats.norm.interval(0.95, loc=sample_mean_grade, scale=std_error_grade)

## Confidence interval for a proportion

In [None]:
# Find sample proportion of students with high alcohol consumption
# (`normalize=True` turns the category counts into proportions)
sample_prop = data['alc'].value_counts(normalize=True)['High']
sample_prop

In [None]:
# TASK --- Calculate the 99% CI for proportion of students with high alcohol consumption


# Probability calculations

In [None]:
# Binomial distribution pmf for k=5:
# probability of exactly 5 successes in n=10 trials with success probability p=0.25
stats.binom.pmf(k=5, n=10, p=0.25)

In [None]:
# Binomial distribution pmf for all k (every possible number of successes, 0 through 10)
all_pmf = stats.binom.pmf(k=np.arange(11), n=10, p=0.25)
all_pmf

In [None]:
# Cumulative distribution function over the same support as the pmf (k = 0..10)
cdf = stats.binom.cdf(k=np.arange(11), n=10, p=0.25)

# Draw the pmf (bars, left) and the cdf (line, right) side by side
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
pmf_ax, cdf_ax = ax

sns.barplot(x=np.arange(11), y=all_pmf, ax=pmf_ax)
pmf_ax.set_title('Probability Mass Function')

sns.lineplot(x=np.arange(11), y=cdf, ax=cdf_ax)
cdf_ax.set_title('Cumulative Distribution Function')

plt.show()

In [None]:
# TASK --- Calculate probability of 4 or fewer students with high alcohol consumption in a class of 20


# Hypothesis testing

### Variance in grades in students with low vs. high alcohol consumption

In [None]:
# Split final grades (`G3`) into two groups based on alcohol consumption level,
# selecting rows and column in a single `.loc` lookup
grades_low_alc = data.loc[data['alc'] == 'Low', 'G3']
grades_high_alc = data.loc[data['alc'] == 'High', 'G3']

In [None]:
# Perform Bartlett's Test for equal variances of final grades in the two groups
stats.bartlett(grades_low_alc, grades_high_alc)

### Effect of alcohol consumption on academic performance

In [None]:
# Plot academic performance by group: box plot of final grade (`G3`) for each `alc` level
sns.boxplot(x='alc', y='G3', data=data)
plt.show()

In [None]:
# Perform independent two-sample t-test on the final grades of the two groups
# `equal_var=True` assumes both groups have the same variance
# NOTE(review): presumably justified by the Bartlett's Test result — confirm
stats.ttest_ind(grades_low_alc, grades_high_alc, equal_var=True)

### Alcohol consumption by males vs. females

In [None]:
# Plot `sex` and `alc`
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Name the plotted column with `x=` and pass the frame as `data=`: seaborn's
# categorical plots no longer accept the variable positionally (the first
# positional argument is interpreted as `data`), so a bare Series warns or fails
sns.countplot(x='sex', data=data, ax=ax[0]).set_title('Sex')
sns.countplot(x='alc', data=data, ax=ax[1]).set_title('Alcohol Consumption')
plt.show()

In [None]:
# Plot `alc`, conditioning on `sex`: one panel per `sex` value, bars coloured by `alc`
cond_plot = sns.FacetGrid(data, col='sex', hue='alc', height=4, aspect=1)
# An explicit `order` keeps the bar positions the same in every panel
cond_plot.map(sns.countplot, 'alc', order=['Low', 'High'])
plt.show()

In [None]:
# Create contingency table: rows are `alc` levels, columns are `sex` values
sex_alc_table = pd.crosstab(data['alc'], data['sex'])
sex_alc_table

In [None]:
# Calculate proportions of high alcohol consumers for each group:
# count in the `High` row divided by that group's column total
high_counts = sex_alc_table.loc['High']
group_totals = sex_alc_table.sum()

m_high_prop = high_counts['M'] / group_totals['M']
f_high_prop = high_counts['F'] / group_totals['F']

print(f'Proportion of males with high alc: {m_high_prop}')
print(f'Proportion of females with high alc: {f_high_prop}')

In [None]:
# Perform chi-squared test on the `alc` x `sex` contingency table
# NOTE: per the SciPy documentation, Yates' continuity correction is applied by default
# when the table has one degree of freedom (e.g. a 2x2 table)
chi_stat, p_value, dof, expected_freq = stats.chi2_contingency(sex_alc_table)

# Report the test statistic, p-value, degrees of freedom and expected frequencies
print(f'Test statistic: {chi_stat}')
print(f'p-value: {p_value}')
print(f'Degs of freedom: {dof}')
print(f'Expected frequencies: {expected_freq}')

In [None]:
# Store expected frequencies in pandas DataFrame
# Reuse the row/column labels of the observed contingency table instead of
# hard-coding them: `expected_freq` has the same layout as `sex_alc_table`,
# so the labels stay correct even if the category order or names change
expected_freq_table = pd.DataFrame(expected_freq,
                                   index=sex_alc_table.index,
                                   columns=sex_alc_table.columns)
expected_freq_table

# Activity: Putting it all together
## Alcohol consumption by school

In [None]:
# TASK --- Plot `school` and `alc`
# Figure with two side-by-side axes, one for each plot
fig, ax = plt.subplots(1, 2, figsize=(10, 5))


In [None]:
# TASK --- Plot `alc`, conditioning on `school`


In [None]:
# TASK --- Create contingency table


In [None]:
# TASK --- Calculate proportions of high alcohol consumers for each group


In [None]:
# TASK --- Perform chi-squared test


In [None]:
# TASK --- Store expected frequencies in pandas DataFrame
