# In [None]:  (notebook cell marker left over from export)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from prepare import remove_outliers, tts
import sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans


def target_dist(df):
    '''
    target_dist takes in a pandas dataframe with a 'quality' column and
    plots a histogram of that column (the target variable), marking the
    mean rating with a dashed red line and a text label.

    Nothing is returned; the figure is displayed with plt.show().
    '''
    # histogram of the ratings; seaborn hands back the axes it drew on
    ax = sns.histplot(data=df, x='quality', binwidth=0.5, color='cyan')
    # average rating, used for both the reference line and its label
    mean_quality = df['quality'].mean()
    ax.axvline(mean_quality, color='red', linestyle='--', label='Mean Quality')
    # the label sits 0.8 to the right of the line at a fixed height of y=1000
    ax.text(mean_quality + .8, 1000, f'Mean Quality = {mean_quality:.2f}',
            fontsize=10, color='red')
    # axis label and title
    ax.set_xlabel('Quality Rating')
    ax.set_title('Visualizing the Target Variable')
    # legend plus light dashed gridlines
    ax.legend()
    ax.grid(True, alpha=0.5, linestyle='--')
    # render the figure
    plt.show()

def abv_plots(train):
    '''
    abv_plots takes in train data and overlays two histograms of the
    'alcohol' column: one for high quality wines (quality >= 7) and one for
    lower quality wines (quality <= 6). The mean ABV of each group is drawn
    as a vertical line with a text label.

    Nothing is returned; the figure is displayed with plt.show().
    '''
    # split the wines into the two rating groups
    ratings = train['quality']
    high_rated = train[ratings >= 7]
    low_rated = train[ratings <= 6]
    # 10x5 figure; grab its axes so everything below draws on the same one
    plt.figure(figsize=(10, 5))
    ax = plt.gca()
    ax.set_title('High and Low Quality Wine ABV Compared')
    # mean ABV of each group
    mean_low_qual = low_rated['alcohol'].mean()
    mean_high_qual = high_rated['alcohol'].mean()
    # overlaid histograms (lower quality first, then high quality on top)
    sns.histplot(data=low_rated, x='alcohol', color='orange', alpha=.5,
                 label='Lower Quality', ax=ax)
    sns.histplot(data=high_rated, x='alcohol', color='magenta', alpha=.3,
                 label='High Quality', ax=ax)
    ax.set_xlabel(' % Alcohol by Volume')
    # lower quality mean line and its label (fixed offsets place the text)
    ax.axvline(x=mean_low_qual, color='orange', label='Lower Quality Mean ABV')
    ax.text(mean_low_qual + 1.45, 150,
            f'Lower Quality Mean ABV = {mean_low_qual:.1f}%',
            color='red', fontsize=10)
    # high quality mean line and its label
    ax.axvline(x=mean_high_qual, color='magenta', label='High Quality Mean ABV')
    ax.text(mean_high_qual + .2, 200,
            f'High Quality Mean ABV= {mean_high_qual:.1f}%',
            color='red', fontsize=10)
    # legend and dashed gridlines for easier reading
    ax.legend()
    ax.grid(True, alpha=0.75, linestyle='--')
    # render the figure
    plt.show()
    
def abv_tstat(train):
    '''
    abv_tstat takes in train data, then performs a two-sample t-test
    (equal variances assumed) comparing the 'alcohol' values of high quality
    wines (quality >= 7) against lower quality wines (quality <= 6).

    The t-statistic (2 decimals) and p-value (4 decimals) are printed;
    nothing is returned.
    '''
    # pull the alcohol values for each rating group
    ratings = train['quality']
    alcohol_high = train.loc[ratings >= 7, 'alcohol']
    alcohol_low = train.loc[ratings <= 6, 'alcohol']

    # independent two-sample t-test, high group first
    t_statistic, p_value = stats.ttest_ind(
        alcohol_high, alcohol_low, equal_var=True
    )

    # report both numbers, one per line
    print(f"t-statistic: {t_statistic:.2f}", f"p-value: {p_value:.4f}", sep="\n")
    
    
    
    #################################################
    


def q2_plots(df):
    '''
    q2_plots takes in a dataframe and plots the 'chlorides' distribution of
    high quality wines (quality > 6) against that of all wines.

    A first 10x5 figure holds two side-by-side histograms (high quality
    wines, then all wines). A second 25x10 figure overlays both histograms
    and marks each group's mean chlorides with a vertical line.

    Nothing is returned; the figures are displayed with plt.show().
    '''
    # wines rated above 6
    high_quality = df[df['quality'] > 6]
    dashed_grid = dict(alpha=0.3, linestyle='--')

    # first figure: the two distributions in the top row of a 2x2 grid
    plt.figure(figsize=(10, 5))
    ax_high = plt.subplot(2, 2, 1)
    sns.histplot(data=high_quality, x='chlorides', ax=ax_high)
    ax_high.set_title('High Quality Wine (7-9)')
    ax_high.set_xlabel('Chlorides')
    ax_high.grid(True, **dashed_grid)

    ax_all = plt.subplot(2, 2, 2)
    sns.histplot(data=df, x='chlorides', ax=ax_all)
    ax_all.set_title('All Wine')
    ax_all.set_xlabel('Chlorides')
    ax_all.grid(True, **dashed_grid)

    # second figure: both distributions overlaid in grid position 3
    plt.figure(figsize=(25, 10))
    ax_both = plt.subplot(2, 2, 3)
    ax_both.set_title('High/All Quality Wines')
    sns.histplot(data=df, x='chlorides', color='green', alpha=.5,
                 label='All Wines', ax=ax_both)
    sns.histplot(data=high_quality, x='chlorides', alpha=.75,
                 label='High Quality Wines', ax=ax_both)
    ax_both.set_xlabel('Chlorides')
    # mean chlorides of each group as vertical reference lines
    all_mean = df['chlorides'].mean()
    ax_both.axvline(x=all_mean, color='red', label='All Wine Mean')
    high_mean = high_quality['chlorides'].mean()
    ax_both.axvline(x=high_mean, color='yellow', label='High Quality Mean')
    ax_both.legend()
    ax_both.grid(True, **dashed_grid)

    # spacing applies to the current (second) figure
    spacing = dict(left=0.1, bottom=-0.1, right=0.9, top=0.9,
                   wspace=0.4, hspace=0.4)
    plt.subplots_adjust(**spacing)
    plt.show()

def q2_stat(df):
    '''
    q2_stat takes in a dataframe and runs a one-sample t-test comparing the
    'chlorides' values of high quality wines (quality > 6) against the mean
    'chlorides' of the whole dataframe.

    Prints whether the null hypothesis can be rejected at alpha = 0.05;
    nothing is returned.
    '''
    alpha = 0.05
    # chlorides of the wines rated above 6
    high_quality_chlorides = df.loc[df['quality'] > 6, 'chlorides']
    # mean over every wine serves as the reference value
    overall_mean = df['chlorides'].mean()
    _, p = stats.ttest_1samp(high_quality_chlorides, overall_mean)
    # anything that is not strictly below alpha fails to reject
    if not p < alpha:
        print('The p-value is greater than the alpha, so we can not reject the null hypothesis.')
        return
    print(f'The p-value of {p} is less than the alpha ({alpha}) so we can reject the null hypothesis!')


def q3_plots(df):
    '''
    q3_plots takes in a dataframe and plots the 'citric acid' distribution
    of low quality wines (quality < 6) against high quality wines
    (quality > 6). Wines rated exactly 6 appear in neither group.

    A first 10x5 figure holds two side-by-side histograms (low, then high).
    A second 25x10 figure overlays both histograms and marks each group's
    mean citric acid with a vertical line.

    Nothing is returned; the figures are displayed with plt.show().
    '''
    column = 'citric acid'
    low_quality = df[df['quality'] < 6]
    high_quality = df[df['quality'] > 6]
    dashed_grid = dict(alpha=0.3, linestyle='--')

    # first figure: one histogram per group in the top row of a 2x2 grid
    plt.figure(figsize=(10, 5))
    ax_low = plt.subplot(2, 2, 1)
    sns.histplot(data=low_quality, x=column, ax=ax_low)
    ax_low.set_title('Low Quality Wine (3-5)')
    ax_low.set_xlabel('Citric Acid')
    ax_low.grid(True, **dashed_grid)

    ax_high = plt.subplot(2, 2, 2)
    sns.histplot(data=high_quality, x=column, ax=ax_high)
    ax_high.set_title('High Quality Wine (7-9)')
    ax_high.set_xlabel('Citric Acid')
    ax_high.grid(True, **dashed_grid)

    # second figure: both groups overlaid in grid position 3
    plt.figure(figsize=(25, 10))
    ax_both = plt.subplot(2, 2, 3)
    ax_both.set_title('High and Low Quality Wines')
    sns.histplot(data=low_quality, x=column, color='green', alpha=.25,
                 label='Low Quality', ax=ax_both)
    sns.histplot(data=high_quality, x=column, alpha=.50,
                 label='High Quality', ax=ax_both)
    ax_both.set_xlabel('Citric Acid')
    # group means as vertical reference lines
    low_mean = low_quality[column].mean()
    ax_both.axvline(x=low_mean, color='red', label='Low Quality Mean')
    high_mean = high_quality[column].mean()
    ax_both.axvline(x=high_mean, color='yellow', label='High Quality Mean')
    ax_both.legend()
    ax_both.grid(True, **dashed_grid)

    # spacing applies to the current (second) figure
    spacing = dict(left=0.1, bottom=-0.1, right=0.9, top=0.9,
                   wspace=0.4, hspace=0.4)
    plt.subplots_adjust(**spacing)
    plt.show()


def q3_stat(df):
    '''
    q3_stat takes in a dataframe and runs a one-sided two-sample t-test
    (unequal variances, alternative='less') on 'citric acid', with low
    quality wines (quality < 6) as the first sample and high quality wines
    (quality > 6) as the second.

    Prints whether the null hypothesis can be rejected at alpha = 0.05;
    nothing is returned.
    '''
    alpha = 0.05
    ratings = df['quality']
    # citric acid values for each rating group (wines rated 6 are left out)
    low_citric = df.loc[ratings < 6, 'citric acid']
    high_citric = df.loc[ratings > 6, 'citric acid']
    _, p = stats.ttest_ind(
        low_citric, high_citric, equal_var=False, alternative='less'
    )
    # anything that is not strictly below alpha fails to reject
    if not p < alpha:
        print('The p-value is greater than the alpha, so we can not reject the null hypothesis.')
        return
    print(f'The p-value of {p} is less than the alpha ({alpha}) so we can reject the null hypothesis!')


def q4_plots(df):
    '''
    q4_plots takes in a dataframe and plots the 'pH' distribution of low
    quality wines (quality < 6) against high quality wines (quality > 6).
    Wines rated exactly 6 appear in neither group.

    A first 10x5 figure holds two side-by-side histograms (low, then high).
    A second 25x10 figure overlays both histograms and marks each group's
    mean pH with a vertical line.

    Nothing is returned; the figures are displayed with plt.show().
    '''
    low_quality = df[df['quality'] < 6]
    high_quality = df[df['quality'] > 6]
    dashed_grid = dict(alpha=0.3, linestyle='--')

    # first figure: one histogram per group in the top row of a 2x2 grid
    plt.figure(figsize=(10, 5))
    ax_low = plt.subplot(2, 2, 1)
    sns.histplot(data=low_quality, x='pH', ax=ax_low)
    ax_low.set_title('Low Quality Wine (3-5)')
    ax_low.set_xlabel('pH')
    ax_low.grid(True, **dashed_grid)

    ax_high = plt.subplot(2, 2, 2)
    sns.histplot(data=high_quality, x='pH', ax=ax_high)
    ax_high.set_title('High Quality Wine (7-9)')
    ax_high.set_xlabel('pH')
    ax_high.grid(True, **dashed_grid)

    # second figure: both groups overlaid in grid position 3
    plt.figure(figsize=(25, 10))
    ax_both = plt.subplot(2, 2, 3)
    ax_both.set_title('High and Low Quality Wines')
    sns.histplot(data=low_quality, x='pH', color='green', alpha=.25,
                 label='Low Quality Wine', ax=ax_both)
    sns.histplot(data=high_quality, x='pH', alpha=.50,
                 label='High Quality Wine', ax=ax_both)
    ax_both.set_xlabel('pH')
    # group means as vertical reference lines
    low_mean = low_quality['pH'].mean()
    ax_both.axvline(x=low_mean, color='red', label='Low Quality Mean')
    high_mean = high_quality['pH'].mean()
    ax_both.axvline(x=high_mean, color='yellow', label='High Quality Mean')
    ax_both.legend()
    ax_both.grid(True, **dashed_grid)

    # spacing applies to the current (second) figure
    spacing = dict(left=0.1, bottom=-0.1, right=0.9, top=0.9,
                   wspace=0.4, hspace=0.4)
    plt.subplots_adjust(**spacing)
    plt.show()


def q4_stat(df):
    '''
    q4_stat takes in a dataframe and runs a one-sided two-sample t-test
    (unequal variances, alternative='less') on 'pH', with low quality wines
    (quality < 6) as the first sample and high quality wines (quality > 6)
    as the second.

    Prints whether the null hypothesis can be rejected at alpha = 0.05;
    nothing is returned.
    '''
    alpha = 0.05
    ratings = df['quality']
    # pH values for each rating group (wines rated 6 are left out)
    low_ph = df.loc[ratings < 6, 'pH']
    high_ph = df.loc[ratings > 6, 'pH']
    _, p = stats.ttest_ind(
        low_ph, high_ph, equal_var=False, alternative='less'
    )
    # anything that is not strictly below alpha fails to reject
    if not p < alpha:
        print('The p-value is greater than the alpha, so we can not reject the null hypothesis.')
        return
    print(f'The p-value of {p} is less than the alpha ({alpha}) so we can reject the null hypothesis!')