In [1]:
import pandas as pd

### Define a function named get_lower_and_upper_bounds that has two arguments. The first argument is a pandas Series. The second argument is the multiplier, which should have a default argument of 1.5.

In [170]:
def upper_outliers(s, k=1.5):
    '''
    Measure how far each value of a series lies above the IQR upper bound.

    The upper bound is Q3 + k * IQR, where IQR = Q3 - Q1.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to evaluate.
    k : float, default 1.5
        IQR multiplier that sets the cutoff.

    Returns
    -------
    pandas.Series
        Float series with the same index as `s`. Each entry is the distance
        above the upper bound for an upper outlier and 0.0 otherwise.
        Missing values stay NaN.
    '''
    # 1st and 3rd quartiles of the given series
    q1, q3 = s.quantile([.25, .75])

    # interquartile range
    iqr = q3 - q1

    # cutoff above which a value counts as an upper outlier
    upper_bound = q3 + k * iqr

    # Vectorised instead of a row-wise apply: the result is always float,
    # whereas max([x - bound, 0]) produced an int column whenever the series
    # had no outliers and a float column otherwise.
    return (s - upper_bound).clip(lower=0)

def add_upper_outlier_columns(df, k=1.5):
    '''
    Add a '<column>_upper_outliers' column for every numeric column of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to augment. It is modified in place; pass a copy to keep the
        original untouched.
    k : float, default 1.5
        IQR multiplier forwarded to upper_outliers.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new columns added.
    '''
    suffix = '_upper_outliers'

    # Take the list of source columns up front and leave out columns this
    # function generated earlier. They are numeric too, so re-running on the
    # same frame would otherwise add '..._upper_outliers_upper_outliers'.
    source_cols = [
        col for col in df.select_dtypes('number').columns
        if not str(col).endswith(suffix)
    ]

    for col in source_cols:
        # str() keeps this working for non-string column labels
        df[str(col) + suffix] = upper_outliers(df[col], k)

    return df

In [171]:
def lower_outliers(s, k=1.5):
    '''
    Measure how far each value of a series lies below the IQR lower bound.

    The lower bound is Q1 - k * IQR, where IQR = Q3 - Q1.

    Parameters
    ----------
    s : pandas.Series
        Numeric series to evaluate.
    k : float, default 1.5
        IQR multiplier that sets the cutoff.

    Returns
    -------
    pandas.Series
        Float series with the same index as `s`. Each entry is the distance
        below the lower bound for a lower outlier and 0.0 otherwise.
        Missing values stay NaN.
    '''
    # 1st and 3rd quartiles of the given series
    q1, q3 = s.quantile([.25, .75])

    # interquartile range
    iqr = q3 - q1

    # cutoff below which a value counts as a lower outlier
    lower_bound = q1 - k * iqr

    # Vectorised instead of a row-wise apply: the result is always float,
    # whereas max([bound - x, 0]) produced an int column whenever the series
    # had no outliers and a float column otherwise.
    return (lower_bound - s).clip(lower=0)

def add_lower_outlier_columns(df, k=1.5):
    '''
    Add a '<column>_lower_outliers' column for every numeric column of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to augment. It is modified in place; pass a copy to keep the
        original untouched.
    k : float, default 1.5
        IQR multiplier forwarded to lower_outliers.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new columns added.
    '''
    suffix = '_lower_outliers'

    # Take the list of source columns up front and leave out columns this
    # function generated earlier. They are numeric too, so re-running on the
    # same frame would otherwise add '..._lower_outliers_lower_outliers'.
    source_cols = [
        col for col in df.select_dtypes('number').columns
        if not str(col).endswith(suffix)
    ]

    for col in source_cols:
        # str() keeps this working for non-string column labels
        df[str(col) + suffix] = lower_outliers(df[col], k)

    return df

### Using lemonade.csv dataset and focusing on continuous variables:

In [172]:
# Load the lemonade dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('lemonade.csv')

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales
0,1/1/17,Sunday,27.0,2.0,15,0.5,10
1,1/2/17,Monday,28.9,1.33,15,0.5,13
2,1/3/17,Tuesday,34.5,1.33,27,0.5,15
3,1/4/17,Wednesday,44.1,1.05,28,0.5,17
4,1/5/17,Thursday,42.4,1.0,33,0.5,18


### Use the IQR Range Rule and the upper and lower bounds to identify the lower outliers of each column of lemonade.csv, using the multiplier of 1.5. 

In [188]:
# Work on a copy so the raw frame `df` stays untouched, then attach one
# '<column>_lower_outliers' column per numeric column (multiplier 1.5).
lowdf = add_lower_outlier_columns(df.copy(), 1.5)

lowdf.head()

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_lower_outliers,Rainfall_lower_outliers,Flyers_lower_outliers,Price_lower_outliers,Sales_lower_outliers
0,1/1/17,Sunday,27.0,2.0,15,0.5,10,0.0,0,0.0,0.0,0
1,1/2/17,Monday,28.9,1.33,15,0.5,13,0.0,0,0.0,0.0,0
2,1/3/17,Tuesday,34.5,1.33,27,0.5,15,0.0,0,0.0,0.0,0
3,1/4/17,Wednesday,44.1,1.05,28,0.5,17,0.0,0,0.0,0.0,0
4,1/5/17,Thursday,42.4,1.0,33,0.5,18,0.0,0,0.0,0.0,0


In [182]:
# Rows whose Temperature falls below the lower bound
lowdf.loc[lowdf['Temperature_lower_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_lower_outliers,Rainfall_lower_outliers,Flyers_lower_outliers,Price_lower_outliers,Sales_lower_outliers
364,12/31/17,Sunday,15.1,2.5,9,0.5,7,1.6,0,0.0,0.0,0


In [183]:
# Rows whose Rainfall falls below the lower bound
lowdf.loc[lowdf['Rainfall_lower_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_lower_outliers,Rainfall_lower_outliers,Flyers_lower_outliers,Price_lower_outliers,Sales_lower_outliers


In [184]:
# Rows whose Flyers count falls below the lower bound
lowdf.loc[lowdf['Flyers_lower_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_lower_outliers,Rainfall_lower_outliers,Flyers_lower_outliers,Price_lower_outliers,Sales_lower_outliers
324,11/21/17,Tuesday,47.0,0.95,-38,0.5,20,0.0,0,42.0,0.0,0


In [185]:
# Rows whose Price falls below the lower bound
lowdf.loc[lowdf['Price_lower_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_lower_outliers,Rainfall_lower_outliers,Flyers_lower_outliers,Price_lower_outliers,Sales_lower_outliers


In [186]:
# Rows whose Sales fall below the lower bound
lowdf.loc[lowdf['Sales_lower_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_lower_outliers,Rainfall_lower_outliers,Flyers_lower_outliers,Price_lower_outliers,Sales_lower_outliers


### Question: Do these outliers make sense?

### Answer: 
- Temperature - Yes, it's winter time, and although the value is very low, it's not so low that it's implausible.
- Rainfall - N/A
- Flyers - No, can't have negative flyers
- Price - NA
- Sales - NA

### Question: Which outliers should be kept?

### Answer:
The 1 temperature outlier should be kept but not the flyers outlier.

### Use the IQR Range Rule and the upper and lower bounds to identify the upper outliers of each column of lemonade.csv, using the multiplier of 1.5. 

In [189]:
# Work on a copy so the raw frame `df` stays untouched, then attach one
# '<column>_upper_outliers' column per numeric column (multiplier 1.5).
updf = add_upper_outlier_columns(df.copy(), 1.5)

updf.head()

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_upper_outliers,Rainfall_upper_outliers,Flyers_upper_outliers,Price_upper_outliers,Sales_upper_outliers
0,1/1/17,Sunday,27.0,2.0,15,0.5,10,0.0,0.7,0.0,0.0,0.0
1,1/2/17,Monday,28.9,1.33,15,0.5,13,0.0,0.03,0.0,0.0,0.0
2,1/3/17,Tuesday,34.5,1.33,27,0.5,15,0.0,0.03,0.0,0.0,0.0
3,1/4/17,Wednesday,44.1,1.05,28,0.5,17,0.0,0.0,0.0,0.0,0.0
4,1/5/17,Thursday,42.4,1.0,33,0.5,18,0.0,0.0,0.0,0.0,0.0


In [190]:
# Rows whose Temperature exceeds the upper bound
updf.loc[updf['Temperature_upper_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_upper_outliers,Rainfall_upper_outliers,Flyers_upper_outliers,Price_upper_outliers,Sales_upper_outliers
41,2/11/17,Saturday,212.0,0.91,35,0.5,21,107.3,0.0,0.0,0.0,0.0


In [191]:
# Rows whose Rainfall exceeds the upper bound
updf.loc[updf['Rainfall_upper_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_upper_outliers,Rainfall_upper_outliers,Flyers_upper_outliers,Price_upper_outliers,Sales_upper_outliers
0,1/1/17,Sunday,27.0,2.0,15,0.5,10,0.0,0.7,0.0,0.0,0.0
1,1/2/17,Monday,28.9,1.33,15,0.5,13,0.0,0.03,0.0,0.0,0.0
2,1/3/17,Tuesday,34.5,1.33,27,0.5,15,0.0,0.03,0.0,0.0,0.0
5,1/6/17,Friday,25.3,1.54,23,0.5,11,0.0,0.24,0.0,0.0,0.0
6,1/7/17,Saturday,32.9,1.54,19,0.5,13,0.0,0.24,0.0,0.0,0.0
10,1/11/17,Wednesday,32.6,1.54,23,0.5,12,0.0,0.24,0.0,0.0,0.0
11,1/12/17,Thursday,38.2,1.33,16,0.5,14,0.0,0.03,0.0,0.0,0.0
12,1/13/17,Friday,37.5,1.33,19,0.5,15,0.0,0.03,0.0,0.0,0.0
15,1/16/17,Monday,30.6,1.67,24,0.5,12,0.0,0.37,0.0,0.0,0.0
16,1/17/17,Tuesday,32.2,1.43,26,0.5,14,0.0,0.13,0.0,0.0,0.0


In [193]:
# Rows whose Flyers count exceeds the upper bound
updf.loc[updf['Flyers_upper_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_upper_outliers,Rainfall_upper_outliers,Flyers_upper_outliers,Price_upper_outliers,Sales_upper_outliers
166,6/16/17,Friday,99.3,0.47,77,0.5,41,0.0,0.0,1.0,0.0,0.0
194,7/14/17,Friday,92.0,0.5,80,0.5,40,0.0,0.0,4.0,0.0,0.0


In [194]:
# Rows whose Price exceeds the upper bound
updf.loc[updf['Price_upper_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_upper_outliers,Rainfall_upper_outliers,Flyers_upper_outliers,Price_upper_outliers,Sales_upper_outliers


In [195]:
# Rows whose Sales exceed the upper bound
updf.loc[updf['Sales_upper_outliers'] > 0]

Unnamed: 0,Date,Day,Temperature,Rainfall,Flyers,Price,Sales,Temperature_upper_outliers,Rainfall_upper_outliers,Flyers_upper_outliers,Price_upper_outliers,Sales_upper_outliers
181,7/1/17,Saturday,102.9,0.47,59,0.5,143,0.0,0.0,0.0,0.0,98.0
182,7/2/17,Sunday,93.4,0.51,68,0.5,158,0.0,0.0,0.0,0.0,113.0
183,7/3/17,Monday,81.5,0.54,68,0.5,235,0.0,0.0,0.0,0.0,190.0
184,7/4/17,Tuesday,84.2,0.59,49,0.5,534,0.0,0.0,0.0,0.0,489.0


### Question: Do these outliers make sense?

### Answer: 
- Temperature - No, the temperature jumps from 50 to 212 and then back down to 55. This doesn't make sense.
- Rainfall - Yes
- Flyers - This was a substantial increase in flyers, but it's reasonable to believe that the lemonade seller may have had help passing out flyers that day.
- Price - NA
- Sales - The outliers happened on the days leading up to and including July the 4th. The holiday could have helped increase sales.

### Question: Which outliers should be kept?

### Answer:
All except the temperature outlier. 

### Using the multiplier of 3, IQR Range Rule, and the lower and upper bounds, identify the outliers below the lower bound in each column of lemonade.csv.

### Do these lower outliers make sense? Which outliers should be kept?

### Using the multiplier of 3, IQR Range Rule, and the lower and upper bounds, identify the outliers above the upper_bound in each column of lemonade.csv.

### Do these upper outliers make sense? Which outliers should be kept?

### Identify if any columns in lemonade.csv are normally distributed. For normally distributed columns:

Use a 2 sigma decision rule to isolate the outliers.
Do these make sense?
Should certain outliers be kept or removed?
Now use a 3 sigma decision rule to isolate the outliers in the normally distributed columns from lemonade.csv