# Explore: 
1. Univariate exploration: Explore each feature variable individually.
    - For numeric variables, create a histogram and use .describe(). For categorical variables, use a frequency table and a bar plot of that frequency table.
2. Bivariate exploration: Plot each variable against your target.
    - For a categorical target variable, your target can be on the x-axis, and numeric variables on the y. For testing, you can compare your categorical target to numeric variables using comparison of means tests, such as t-test, anova or mann-whitney. You can compare your categorical target to categorical variables using a chi-square test.
    - For a numeric target variable, your target can be on the y-axis, and independent variables on the x-axis. For independent variables that are numeric, scatterplots are useful. For independent variables that are categorical or discrete, bar plots, swarm plots or violin plots are useful.
3. Multivariate exploration: Visualize multiple (3+) variables at once.
    - With a categorical target, plot each categorical variable (x-axis) against each numeric variable (y-axis) and set color to your categorical target variable. You could use a bar plot, swarm plot, violin plot, or box plot.
    - When plotting a numeric target against categorical independent variables, your y-axis is the target and your x-axis is a categorical variable (bar, swarm, violin, or box plots). You can set color to another categorical variable, for example the one you are most interested in.
    - When plotting a numeric target against numeric independent variables, your y-axis should be your target and your x-axis should be a numeric independent variable. Color can be added from one of the categorical variables to add a dimension.
- Document any findings, insights, thoughts, takeaways from the charts and statistics you produced. In that, be sure and include any additional features you would like to develop if there is time, which features need to be removed, and which are free to move forward.

In [2]:
# imports:
import pandas as pd
import numpy as np
import wrangle as wr

# vizualizations
import matplotlib.pyplot as plt
import seaborn as sns

# statistics
from scipy import stats

# warnings:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Lets acquire the data: 
df = wr.acquire_water()

In [None]:
# prepare the data: 
df = wr.prep_water(df)

In [None]:
df.head()

In [None]:
# Partition the columns of df into categorical and numerical features.
cat_cols, num_cols = [], []
for col in df.columns:
    # Object-typed columns, and columns with fewer than 10 distinct values,
    # are treated as categorical; everything else is treated as numerical.
    is_categorical = df[col].dtype == 'O' or df[col].nunique() < 10
    (cat_cols if is_categorical else num_cols).append(col)

In [None]:
cat_cols

In [None]:
num_cols

All of our feature columns are numerical. The only categorical column is potability, but this is our target variable.

# Univariate Exploration:
- We don't need to split the data, since we are only looking at univariate distributions.

In [None]:
# Univariate look at every numerical column: summary statistics first,
# then a histogram (with a KDE overlay) and a box plot, each in its own figure.
for col in num_cols:
    print (f'Distribution of {col}')
    print(df[col].describe())
    for draw, extra in ((sns.histplot, {'kde': True}), (sns.boxplot, {})):
        draw(data=df, x=col, **extra)
        plt.show()
    print('=======================')

### Takeaways: 
1. pH = min: 0.23 / max: 14.00
- Data looks normal, outliers don't look like an issue
2. hardness = min: 73.49 / max: 317.34
- Minor skew to the right, outliers don't look like an issue
3. solids = min: 320.94 / max: 56488.67
- Positive skew, outliers don't look like an issue
4. chloramines = min: 1.39 / max: 13.13
- Data looks normal, outliers don't look like an issue
5. sulfate = min: 129.00 / max: 481.03
- Data looks negatively skewed, outliers don't look normal; could remove one
6. conductivity = min: 201.62 / max: 753.34
- Data looks normal, outliers don't look bad
7. organic_carbon = min: 2.20 / max: 27.01
- Data looks normal, outliers don't look like an issue
8. trihalomethanes = min: 8.58 / max: 124.00
- Looks skewed right, outliers don't look like an issue
9. turbidity = min: 1.45 / max: 6.49
- Looks normal, outliers don't seem to be an issue

In [None]:
# function to evaluate if distribution is normal
def eval_dist(r, p, α=0.05):
    """
    Print whether a normality test result indicates normally distributed data.

    Parameters:
    - r: the test statistic (accepted for symmetry with the test output; not used in the decision)
    - p: the p-value of the test
    - α: significance level, defaults to 5%

    The data is reported as normal only when p is strictly greater than α.
    Nothing is returned; the verdict is printed.
    """
    if p > α:
        message = "The data is normally distributed"
    else:
        message = "The data is NOT normally distributed"
    print(message)

In [None]:
# lets check if they are normal: 
def evaluate_normality(data, column_name):
    """
    Run a Shapiro-Wilk test on one column and print the results.

    Parameters:
    - data: object indexable by column_name (e.g. a DataFrame); the selected
      values are passed to stats.shapiro
    - column_name: name of the column to test

    Prints the column name, the test statistic, the p-value and the verdict
    produced by eval_dist. Returns None.
    """
    r, p = stats.shapiro(data[column_name])
    print(f"{column_name} distribution:")
    print("Shapiro-Wilk Test Results:")
    print(f"Statistic (r): {r}")
    print(f"P-value (p): {p}")
    # eval_dist prints its own verdict and returns None, so it is called
    # directly; wrapping it in print() would emit a stray "None" line.
    eval_dist(r, p)

In [None]:
evaluate_normality(df, 'ph')

In [None]:
for column in num_cols:
    evaluate_normality(df, column)

In [None]:
# look at all the categorical features
for col in cat_cols:
    print(f'Frequncy of {col}')
    print(df[col].describe())
    print(df[col].value_counts())
    
    #since there are multiple columns i want them to display side by side
    fig, ax = plt.subplots(1,2, figsize=(10,8))
    fig.suptitle(f'Graphs of {col}')
    
    #plot one: 
    sns.countplot(data = df, x = col, ax=ax[0], palette = 'Set2')
    
    #plot two:
    sns.boxplot(ax=ax[1], data = df, x = df[col].value_counts(), color = 'skyblue')

   
    plt.show()
    print('----------###---------')

### Takeaways: 
1. There are more 0s than 1s, meaning there are more non-potable water samples than potable ones in the data frame.

# Bivariate Statistics: 
- Data needs to be split: 

In [None]:
# import: 
from sklearn.model_selection import train_test_split

In [None]:
def split_water(df, target='potability', random_state=123):
    '''
    Split df into train, validate and test sets, stratified on the target.

    Parameters:
    - df: DataFrame to split; must contain the target column
    - target: name of the column to stratify on (defaults to 'potability')
    - random_state: seed passed to both train_test_split calls (defaults to 123)

    Returns:
    - (train, validate, test): 20% of the rows are held out as test, and the
      remaining 80% is split 70/30 into train and validate
      (roughly 56% / 24% / 20% of the original rows).
    '''
    # first cut: hold out the test set
    train_validate, test = train_test_split(
        df,
        test_size=.2,
        random_state=random_state,
        stratify=df[target],
    )
    # second cut: carve validate out of what is left
    train, validate = train_test_split(
        train_validate,
        test_size=.3,
        random_state=random_state,
        stratify=train_validate[target],
    )

    return train, validate, test

In [None]:
train, validate, test = split_water(df)

In [None]:
# create my target varible: 
target = 'potability'

In [None]:
# Swarm plot of every numerical feature, split by the target class, with a
# dashed line marking the feature's overall mean in the training split.
for col in num_cols:
    average = train[col].mean()
    sns.swarmplot(data=train, x=target, y=col, color='lightseagreen')
    plt.axhline(average, ls='--', color='black')
    plt.title(f'Graph for {col}')
    plt.show()

In [None]:
# Boxen plot of every numerical feature, split by the target class, with a
# dashed line marking the feature's overall mean in the training split.
for col in num_cols:
    average = train[col].mean()
    sns.boxenplot(data=train, x=target, y=col, palette='Set2')
    plt.axhline(average, ls='--', color='black')
    plt.title(f'Graph for {col}')
    plt.show()

In [None]:
# Bar plot of every numerical feature, split by the target class, with a
# labelled line marking the feature's overall mean in the training split.
for col in num_cols:
    col_mean = train[col].mean()
    sns.barplot(data=train, x=target, y=col, palette='mako')
    plt.axhline(col_mean, color='black', label=f'Mean of {col}')
    plt.title(f'Graph of {col}')
    plt.legend()

    plt.show()

# Need to complete a Mann-Whitney Test: 

In [None]:
def mann_whit_test(df, num_cols, target='potability', alpha=0.05):
    """
    Run a two-sided Mann-Whitney U test on each numerical column, comparing
    the rows where the target equals 0 against the rows where it equals 1.

    Parameters:
    - df: DataFrame holding the target column and every column in num_cols
    - num_cols: iterable of column names to test
    - target: name of the 0/1 grouping column (defaults to 'potability')
    - alpha: significance level (defaults to 0.05)

    Prints a header and a verdict for every column. Returns None.
    """
    # Build both groups from the DataFrame that was passed in (not from a
    # global), so the function tests exactly the data it is given.
    group_zero = df[df[target] == 0]
    group_one = df[df[target] == 1]

    for col in num_cols:
        _, p_val = stats.mannwhitneyu(
            group_zero[col],
            group_one[col],
            alternative='two-sided',
        )
        print(f'Mann-Whitneyu test for {col}')
        if p_val < alpha:
            # Mann-Whitney U is a rank test: it compares distributions, not means.
            print('We can reject the null hypothesis; the distributions are different.')
        else:
            print('We cannot reject the null hypothesis')

In [None]:
# Bivariate exploration is done on the training split only, so pass train
# rather than the full, unsplit DataFrame.
mann_whit_test(train, num_cols)

# Let's look for clusters: 

In [None]:
sns.pairplot(data = train, hue = 'potability', corner = True)

Is there a relationship between pH and hardness?

In [None]:
# continuous vs. continuous: the data is not normally distributed, so Spearman's rank correlation is used:
def eval_Spearmanresult(r, p, α=0.05):
    """
    Print the verdict of a Spearman rank-correlation test.

    Parameters:
    - r: the correlation coefficient
    - p: the p-value
    - α: significance level, defaults to 5%

    H₀ is rejected only when p is strictly less than α. The coefficient is
    formatted with the ``2f`` spec (minimum width 2, default six decimals).
    Nothing is returned; the verdict is printed.
    """
    coefficient = f"{r:2f}"
    if p < α:
        lines = (
            "We reject H₀, there appears to be a monotonic relationship.",
            f"Spearman's rs: {coefficient}.",
            f"P-value: {p}",
        )
    else:
        lines = (
            "We fail to reject H₀: that there does not appear to be a monotonic relationship.",
            f"Spearman’s r: {coefficient}",
            f"P-value: {p}",
        )
    print("\n".join(lines))

### Relationship between sulfates and ph

In [None]:
sns.scatterplot(data = train, y = 'sulfate', x = 'ph', hue = 'potability')
plt.title('Sulfates vs pH');

In [None]:
# ph and sulfates
r, p = stats.spearmanr(train.ph, train.sulfate)
eval_Spearmanresult(r,p)

### Relationship between hardness and pH

In [None]:
sns.scatterplot(data = train, y = 'hardness', x = 'ph', hue = 'potability');

In [None]:
# ph and hardness
r, p = stats.spearmanr(train.ph, train.hardness)
eval_Spearmanresult(r,p)

### Relationship between turbidity and chloramines?

In [None]:
sns.scatterplot(data = train, y = 'turbidity', x = 'chloramines', hue = 'potability');

In [None]:
# turbidity and chloramines
r, p = stats.spearmanr(train.turbidity, train.chloramines)
eval_Spearmanresult(r,p)

### Trihalomethanes and conductivity

In [None]:
sns.scatterplot(data = train, y = 'trihalomethanes', x = 'conductivity', hue = 'potability');

In [None]:
# trihalomethanes and conductivity
r, p = stats.spearmanr(train.trihalomethanes, train.conductivity)
eval_Spearmanresult(r,p)

### Is there something with a relationship with turbidity?

In [None]:
# Spearman rank correlation of turbidity against each of the other features.
cols = [
    'ph', 'hardness', 'solids', 'chloramines',
    'sulfate', 'conductivity', 'organic_carbon', 'trihalomethanes',
]
for col in cols:
    print(f'{col} vs turbidity')
    r, p = stats.spearmanr(train[col], train['turbidity'])
    eval_Spearmanresult(r, p)
    print('-------')

### Hardness: 

In [None]:
# Spearman rank correlation of hardness against each of the other features.
cols = [
    'ph', 'turbidity', 'solids', 'chloramines',
    'sulfate', 'conductivity', 'organic_carbon', 'trihalomethanes',
]
for col in cols:
    print(f'{col} vs hardness')
    r, p = stats.spearmanr(train[col], train['hardness'])
    eval_Spearmanresult(r, p)
    print('-------')

### Solids

In [None]:
# Spearman rank correlation of solids against each of the other features.
cols = [
    'ph', 'turbidity', 'hardness', 'chloramines',
    'sulfate', 'conductivity', 'organic_carbon', 'trihalomethanes',
]
for col in cols:
    print(f'{col} vs solids')
    r, p = stats.spearmanr(train[col], train['solids'])
    eval_Spearmanresult(r, p)
    print('-------')

### Chloramines

In [None]:
# Spearman rank correlation of chloramines against each of the other features.
cols = [
    'ph', 'turbidity', 'hardness', 'solids',
    'sulfate', 'conductivity', 'organic_carbon', 'trihalomethanes',
]
for col in cols:
    print(f'{col} vs chloramines')
    r, p = stats.spearmanr(train[col], train['chloramines'])
    eval_Spearmanresult(r, p)
    print('-------')

### Sulfate

In [None]:
# Spearman rank correlation of sulfate against each of the other features.
cols = [
    'ph', 'turbidity', 'hardness', 'solids',
    'chloramines', 'conductivity', 'organic_carbon', 'trihalomethanes',
]
for col in cols:
    print(f'{col} vs sulfate')
    r, p = stats.spearmanr(train[col], train['sulfate'])
    eval_Spearmanresult(r, p)
    print('-------')

### Conductivity:

In [None]:
# Spearman rank correlation of conductivity against each of the other features.
cols = [
    'ph', 'turbidity', 'hardness', 'solids',
    'sulfate', 'chloramines', 'organic_carbon', 'trihalomethanes',
]
for col in cols:
    print(f'{col} vs conductivity')
    r, p = stats.spearmanr(train[col], train['conductivity'])
    eval_Spearmanresult(r, p)
    print('-------')

### Organic_carbon

In [None]:
# Spearman rank correlation of organic_carbon against each of the other features.
cols = [
    'ph', 'turbidity', 'hardness', 'solids',
    'sulfate', 'chloramines', 'conductivity', 'trihalomethanes',
]
for col in cols:
    print(f'{col} vs organic carbon')
    r, p = stats.spearmanr(train[col], train['organic_carbon'])
    eval_Spearmanresult(r, p)
    print('-------')

### trihalomethanes

In [None]:
# Spearman rank correlation of trihalomethanes against each of the other features.
cols = [
    'ph', 'turbidity', 'hardness', 'solids',
    'sulfate', 'chloramines', 'conductivity', 'organic_carbon',
]
for col in cols:
    print(f'{col} vs trihalomethanes')
    r, p = stats.spearmanr(train[col], train['trihalomethanes'])
    eval_Spearmanresult(r, p)
    print('-------')

In [None]:
# scale the columns using StandardScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# create the object: 
train_scaled = train.copy()
scaler = StandardScaler()

In [None]:
# fit and transform the object
train_scaled[num_cols] = scaler.fit_transform(train[num_cols])

In [None]:
#correlation: 
train_corr = train_scaled[num_cols].corr(method='spearman')
train_corr

In [None]:
# Hide the upper triangle (diagonal included) so every pair appears once.
# The mask is built from an all-True matrix rather than from the correlation
# values: np.triu(train_corr) is only truthy where the correlation is non-zero,
# so a pair whose correlation is exactly 0 would be left unmasked.
upper_mask = np.triu(np.ones_like(train_corr, dtype=bool))
sns.heatmap(train_corr, cmap='YlGnBu', annot=True, linewidths=0.5,
            mask=upper_mask)

In [None]:
0