In [1]:
import numpy as np  #load up the libraries and object defs. we need
import pandas as pd
from pandas import DataFrame, Series

# load up my visualization system, and call the object plt
import matplotlib.pyplot as plt

# tell ipython notebook to print visualizations inline
%pylab
%matplotlib inline

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


For the Encyclopedia Sales dataset, 

In [None]:
df = pd.read_csv('sales.csv')
df = df.set_index('day')
df.head(10)

# Manually computing confidence intervals

We can compute exact confidence intervals using the mean and standard deviation

`ci = mean ± (t*std)`

Where `t` depends on what confidence interval we want.

In [None]:
t=1.96 #For 95% CIs

In [None]:
df.mean()

In [None]:
df.std()

In [None]:
sqn=sqrt(len(df))

In [None]:
cis = DataFrame({'mean':df.mean(), 'ci':t*df.std()/sqn})
cis

In [None]:
cis.plot(kind="bar", yerr="ci")

# Let's check if our data is normally distributed

First we pivot the dataframe using the `melt` command to put all of the values into one 'Sales' column. 

In [None]:
df_pivot = pd.melt(df,value_vars=['Seller 1','Seller 2','Seller 3','Seller 4','Seller 5','Seller 6'], 
                   var_name="Sellers", value_name="Sales")
df_pivot.tail(5)

Then we can use a [Q-Q plot](https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot) (a quantile-quantile plot) to see if we have a normal distribution.

To do that we plot the sales values against a randomly generated normally-distributed set of new values.

In [None]:
# http://scientificpythonsnippets.com/index.php/distributions/6-q-q-plot-in-python-to-test-if-data-is-normally-distributed

# get all of the sales values and plot them against a normally-distributed set of quantiles
sales = df_pivot["Sales"].copy()
sales.sort_values(inplace=True)
norm=random.normal(0,2,len(sales))
norm.sort()
plt.plot(norm,sales,"o")

# add a trend line and make it pretty
z = np.polyfit(norm,sales, 1)
p = np.poly1d(z)
plt.plot(norm,p(norm),"k--", linewidth=2)
plt.title("Normal Q-Q plot", size=28)
plt.xlabel("Theoretical quantiles", size=24)
plt.ylabel("Expreimental quantiles", size=24)
plt.tick_params(labelsize=16)


For comparison, we can also plot two normally distributed sets against one another.

In [None]:
norm2=random.normal(0,2,len(sales))
norm2.sort()
plt.plot(norm,norm2,"o")

If the data isn't normally distributed. Bootstrap CIs might be a better choice.

(A histogram could also have shown us this.)

In [None]:
sales.hist()

This data probably has a log-normal distribution. Which we can test by taking the log.

In [None]:
np.log(sales).hist()

...we can always reverse that by taking the exponent.

In [None]:
np.exp(np.log(sales)).hist()

If we wanted to, we could log-transform the data, compute CIs, then convert back before plotting.
This would give more accurate confidence intervals (you can try later if you'd like).
Instead, for now, we'll compute bootstrap confidence intervals.

# Bootstrap confidence intervals in Seaborn

The [Seaborn charting package](https://seaborn.github.io) supports bootstrap confidence intervals by default on most of it's chart types. 

If you don't already have seaborn installed, open a terminal or command prompt and run:
`conda install seaborn`

In [2]:
# Import seaborn
import seaborn as sns

**Draw a bar plot to show each seller.**

Use the `ci` parameter to set the confidence interval and the `n_boot` parameter to set the number of samples.

In [None]:
sns.set_style('whitegrid')
g = sns.barplot(x="Sellers", y="Sales", hue="Sellers", data=df_pivot,
                palette="muted", ci=95, n_boot=100)

**Try experimenting with different sized confidence intervals and different numbers of bootstrap samples.**

*How does this change the size of the confidence intervals?*
*What happens if you re-run multiple times?*

# Confidence intervals for pairwise comparisons
If we want to see if one seller reliably sells more than another seller on a daily basis, we can test that directly.

In [None]:
df_comp = DataFrame({"Seller 6 - 3":(df['Seller 6'] - df['Seller 3']),
                     "Seller 6 - 5":(df['Seller 6'] - df['Seller 5'])})
df_comp.head(5)

In [None]:
df_comp_pivot = pd.melt(df_comp,value_vars=['Seller 6 - 3','Seller 6 - 5'], 
                   var_name="Seller Pair", value_name="Difference in Sales")
df_comp_pivot.tail(5)

In [None]:
sns.factorplot(y="Seller Pair", x="Difference in Sales", data=df_comp_pivot,
               join=False,ci=95, n_boot=1000)
plt.axvline(0,lw=2,color="k")

# Now...
Have a look at the `sales-full.csv` dataset.
Are there any clear differences in it?

In [3]:
df = pd.read_csv('sales-full.csv')
df

Unnamed: 0,day,Seller 1,Seller 2,Seller 3,Seller 4,Seller 5,Seller 6
0,1,320,89,21,57,57,107
1,2,74,386,181,71,29,95
2,3,340,186,151,108,342,78
3,4,322,606,257,96,167,423
4,5,146,78,269,527,321,70
5,6,24,31,152,73,14,497
6,7,42,69,178,50,435,36
7,8,76,109,58,22,211,222
8,9,99,80,430,39,95,163
9,10,915,68,19,1092,257,27


**Question #1**
Are any of the encyclopedia sellers clearly different from the others in terms of average daily sales? Who would you give a raise?

**Question #2**
Are the weeks with the best average daily sales per seller clearly different from the weeks with the worst sales? 
*(Be sure to think about your unit of analysis.)*

In [4]:
df["Week"] = (df.day/7).apply(np.floor)
df.head(15)

Unnamed: 0,day,Seller 1,Seller 2,Seller 3,Seller 4,Seller 5,Seller 6,Week
0,1,320,89,21,57,57,107,0.0
1,2,74,386,181,71,29,95,0.0
2,3,340,186,151,108,342,78,0.0
3,4,322,606,257,96,167,423,0.0
4,5,146,78,269,527,321,70,0.0
5,6,24,31,152,73,14,497,0.0
6,7,42,69,178,50,435,36,1.0
7,8,76,109,58,22,211,222,1.0
8,9,99,80,430,39,95,163,1.0
9,10,915,68,19,1092,257,27,1.0


In [5]:
df_melted = pd.melt(df,id_vars=['Week'],value_vars=['Seller 1','Seller 2','Seller 3','Seller 4','Seller 5','Seller 6'], 
                   var_name="Seller", value_name="Sales")
df_melted.head(15)

Unnamed: 0,Week,Seller,Sales
0,0.0,Seller 1,320
1,0.0,Seller 1,74
2,0.0,Seller 1,340
3,0.0,Seller 1,322
4,0.0,Seller 1,146
5,0.0,Seller 1,24
6,1.0,Seller 1,42
7,1.0,Seller 1,76
8,1.0,Seller 1,99
9,1.0,Seller 1,915


In [6]:
df_melted = df_melted.groupby(["Week","Seller"],level=-1).mean()
df_melted = df_melted.reset_index()
df_melted.head(10)

Unnamed: 0,index,Week,Sales
0,0,0.0,320
1,1,0.0,74
2,2,0.0,340
3,3,0.0,322
4,4,0.0,146
5,5,0.0,24
6,6,1.0,42
7,7,1.0,76
8,8,1.0,99
9,9,1.0,915


In [None]:
sns.set_style('whitegrid')
g = sns.barplot(x="Week", y="Sales", data=df_melted,
                palette="muted", ci=95, n_boot=100)

**Question #3** {Question of your choice.}