In [1]:
from scipy import stats
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

In [2]:
# Load the red and white wine-quality tables from the UCI repository.
# Both files are semicolon-delimited, hence sep=';'.
RED_WINE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
WHITE_WINE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

red_wine = pd.read_csv(RED_WINE_URL, sep=';')
white_wine = pd.read_csv(WHITE_WINE_URL, sep=';')

In [3]:
# adding columns to dataframe
def add_wine_labels(df, wine_type):
    """Return a copy of ``df`` tagged with its wine type and a quality bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Wine table with a ``quality`` column that can be compared to numbers.
    wine_type : str
        Value stored in the new ``wine_type`` column (e.g. ``'red'``).

    Returns
    -------
    pandas.DataFrame
        New frame with two extra columns, ``wine_type`` and ``quality_label``.
        ``quality_label`` is a categorical with categories
        ``['low', 'medium', 'high']``: ``quality <= 5`` maps to ``'low'``,
        ``quality <= 7`` maps to ``'medium'`` and everything else to ``'high'``.
    """
    # np.select takes the first matching condition, so the second condition
    # only ever labels rows whose quality is above 5.
    buckets = np.select(
        [df['quality'] <= 5, df['quality'] <= 7],
        ['low', 'medium'],
        default='high',
    )
    return df.assign(
        wine_type=wine_type,
        quality_label=pd.Categorical(buckets, categories=['low', 'medium', 'high']),
    )


# The same labelling is applied to both tables; the names are rebound so the
# following cells keep working with `red_wine` / `white_wine`.
red_wine = add_wine_labels(red_wine, 'red')
white_wine = add_wine_labels(white_wine, 'white')

# Stack the red and white tables into one frame, then shuffle the rows with a
# fixed seed (reproducible order) and rebuild a clean 0..n-1 index.
wines = (
    pd.concat([red_wine, white_wine])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [6]:
# One-way ANOVA: compare mean alcohol content across the low / medium / high
# quality groups (one sample per quality label, in that order).
alcohol_by_quality = [
    wines.loc[wines['quality_label'] == label, 'alcohol']
    for label in ('low', 'medium', 'high')
]
F, p = stats.f_oneway(*alcohol_by_quality)

print('ANOVA test for mean alcohol levels across wine samples with different quality ratings')
print('F Statistic:', F, '\tp-value:', p)

# Decision at the 5% significance level. Strictly speaking, p >= 0.05 means
# the null hypothesis is not rejected rather than proven.
verdict = "Accepting Alternative Hypothesis" if p < 0.05 else "Accepting Null Hypothesis"
print(verdict)

ANOVA test for mean alcohol levels across wine samples with different quality ratings
F Statistic: 673.0745347231032 	p-value: 2.2715337450621843e-266
Accepting Alternative Hypothesis


In [7]:
# One-way ANOVA: compare mean pH across the low / medium / high quality groups
# (one sample per quality label, in that order).
ph_by_quality = [
    wines.loc[wines['quality_label'] == label, 'pH']
    for label in ('low', 'medium', 'high')
]
F, p = stats.f_oneway(*ph_by_quality)

print('ANOVA test for mean pH levels across wine samples with different quality ratings')
print('F Statistic:', F, '\tp-value:', p)

# Decision at the 5% significance level. Strictly speaking, p >= 0.05 means
# the null hypothesis is not rejected rather than proven.
verdict = "Accepting Alternative Hypothesis" if p < 0.05 else "Accepting Null Hypothesis"
print(verdict)

ANOVA test for mean pH levels across wine samples with different quality ratings
F Statistic: 1.2363860803545201 	p-value: 0.2905002779768688
Accepting Null Hypothesis


In [8]:
# One-way ANOVA: compare mean residual sugar between red and white wines.
# The grouping variable here is `wine_type`, not `quality_label`, so the
# printed headline names the wine types being compared.
red_sugar = wines.loc[wines['wine_type'] == 'red', 'residual sugar']
white_sugar = wines.loc[wines['wine_type'] == 'white', 'residual sugar']
F, p = stats.f_oneway(red_sugar, white_sugar)

print('ANOVA test for mean residual sugar levels across red and white wine samples')
print('F Statistic:', F, '\tp-value:', p)

# Decision at the 5% significance level; wording matches the other ANOVA cells.
if p < 0.05:
    print("Accepting Alternative Hypothesis")
else:
    print("Accepting Null Hypothesis")

ANOVA test for mean residual sugar levels across wine samples with different quality ratings
F Statistic: 899.7662891248706 	p-value: 2.959354152712964e-185
Accepting Alternative Hypothesis
