In [20]:
import numpy as np
import pandas as pd
from scipy.stats import norm as normal, binom
import bplot as bp
from scipy.optimize import minimize
from scipy.special import loggamma
import patsy

# October 23, 2019

In [6]:
def ll_normal(mu, X):
    """Sum of squared deviations of the observations `X` from `mu`.

    This is the objective that `optim` minimizes over `mu`.
    """
    return np.sum(np.square(X - mu))

def optim(data, initval = None):
    """Find the value of `mu` that minimizes `ll_normal` for `data`.

    Parameters
    ----------
    data : array-like
        Observations, forwarded to `ll_normal` as its `X` argument.
    initval : float, optional
        Starting point for the optimizer. When left as None, a single draw
        from `np.random.normal()` is used instead.

    Returns
    -------
    numpy.ndarray
        The `x` entry of the result returned by `scipy.optimize.minimize`.
    """
    # Test against None explicitly so that a legitimate start of 0 is honored
    # instead of being silently replaced by a random draw.
    start = np.random.normal() if initval is None else initval
    # `args` has to be a tuple; `(data)` without a trailing comma is just `data`.
    result = minimize(ll_normal, start, args=(data,), method="BFGS")
    return result["x"]

def bootstrap(data, R, fun, confidence=87):
    """Percentile bootstrap confidence interval for the statistic `fun`.

    Parameters
    ----------
    data : one-dimensional pandas Series, numpy array, or sequence
        The observed sample. It is resampled with replacement, keeping its size.
    R : int
        Number of bootstrap resamples.
    fun : callable
        Statistic to evaluate on each resample. It must return a scalar or a
        single-element array (for example what `optim` returns).
    confidence : float, default 87
        Central percentage of the bootstrap distribution to report.

    Returns
    -------
    numpy.ndarray
        Two values: the lower and upper percentile bounds of the R statistics.
    """
    # Pandas objects must be resampled by position. Plain `data[idx]` on a
    # Series looks up index *labels*, which fails or picks the wrong rows
    # whenever the index is not 0..N-1 (e.g. after filtering).
    is_pandas = hasattr(data, "iloc")
    if not is_pandas:
        data = np.asarray(data)
    N = data.size
    thetas = np.full(R, np.nan)
    for r in range(R):
        idx = np.random.choice(N, N, replace=True)
        resample = data.iloc[idx] if is_pandas else data[idx]
        # Unwrap a single-element array explicitly rather than relying on the
        # implicit array-to-scalar conversion during item assignment.
        thetas[r] = np.asarray(fun(resample)).item()
    # Split the excluded percentage evenly between the two tails.
    cmin = (100 - confidence) / 2
    cmax = cmin + confidence
    return np.percentile(thetas, [cmin, cmax])

In [3]:
# Textbook price data (isbn, uclaNew, amazNew), read directly from GitHub; needs network access.
df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/master/books.csv")

In [7]:
# 1001 bootstrap resamples of the UCLA prices. `confidence` is left at
# bootstrap's default of 87, and no seed is set in this cell, so the
# interval varies slightly between runs.
R = 1001
bootstrap(df["uclaNew"], R, optim)

array([62.00123306, 82.55985302])

"We are \_\_% confident that the average new book price at UCLA is between \_\_ and \_\_ dollars"

## Paired Data

In [8]:
# Preview the first rows: each row is one book (isbn) with its uclaNew and amazNew price.
df.head()

Unnamed: 0,isbn,uclaNew,amazNew
0,978-0803272620,27.67,27.95
1,978-0030119194,40.59,31.14
2,978-0300080643,31.68,32.0
3,978-0226206813,16.0,11.52
4,978-0892365999,18.95,14.21


**Paired Data** is when each observation in your data set is inherently tied to exactly one other observation in the data set. For our current data set, we have two sets of prices tied together by the ISBN.  
Despite coming from two different stores, both prices refer to the same book.

There is a 1-1 relation between the observations in the dataset: paired by book, it's the price of the same book at two different stores.

**EX**  
  1. Two measurements on opposite sides of a symmetric thing, e.g. anything with two arms, eyes, or legs. But not "right arm" and "left leg".
  2. Two measurements of the same thing across time (e.g. before and after one time point)

In [11]:
# Per-book price gap from one store to the other, Amazon minus UCLA:
# a negative value means the book is cheaper at Amazon.
df["diff"] = df["amazNew"].sub(df["uclaNew"])
df.head()

Unnamed: 0,isbn,uclaNew,amazNew,diff
0,978-0803272620,27.67,27.95,0.28
1,978-0030119194,40.59,31.14,-9.45
2,978-0300080643,31.68,32.0,0.32
3,978-0226206813,16.0,11.52,-4.48
4,978-0892365999,18.95,14.21,-4.74


In [14]:
# Seed NumPy's global generator so the interval below is reproducible.
np.random.seed(1234)
R = 1001
bootstrap(df["diff"], R, optim, confidence=87)

array([-15.54671116, -10.43493128])

"We are 87% confident that the average book for UCLA classes is between 10.43 and 15.55 dollars cheaper at Amazon"

## Two-Sample Means
We would like to compare means amongst variables that do not necessarily have the same number of observations.  
They don't have to have a 1-1 relation, and they don't have to have the same number of observations.

New Data Set: https://github.com/roualdes/data/blob/master/possum.txt

In [19]:
# Load the possum measurements from GitHub (needs network access).
# NOTE: this rebinds `df`, so the books data loaded earlier is no longer
# available under that name.
df = pd.read_csv("https://raw.githubusercontent.com/roualdes/data/master/possum.csv")
df.head()

Unnamed: 0,site,pop,sex,age,headL,skullW,totalL,tailL
0,1,Vic,m,8.0,94.1,60.4,89.0,36.0
1,1,Vic,f,6.0,92.5,57.6,91.5,36.5
2,1,Vic,f,6.0,94.0,60.0,95.5,39.0
3,1,Vic,f,6.0,93.2,57.1,92.0,38.0
4,1,Vic,f,2.0,91.5,56.3,85.5,36.0


On average, are the possums from Victoria longer or shorter than the possums from New South Wales or Queensland?

To achieve this, we find the overall mean for one group and then the difference relative to the other group.

In [28]:
X = patsy.dmatrix("~ C(pop)", data=df)
# We want a design matrix with an intercept column (a 1 in each of the 104 rows) and a second
# column that will pick up the difference between the two groups.
# "~ C(pop)": build the columns by treating `pop` as a categorical variable.


# NOTE(review): `:-5` selects every row EXCEPT the last five, so this prints almost the whole
# matrix; use X[:5, :] or X[-5:, :] if only a short preview was intended.
X[:-5, :]

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.