In [None]:
from scipy import stats
import numpy as np
import pandas as pd

# Z-Test

In [None]:
drone = pd.read_csv("Drone_dist.csv")
drone.head()

Unnamed: 0,Dist
0,269
1,276
2,258
3,260
4,274


In [None]:
def ztest(pop, sample_size = 50, percent = 5, tail = 2, tail_region = "u", verbose = True):
    samp = pop.sample(n = sample_size)
    sampmean, popmean = samp.mean(), pop.mean()
    pop_std = pop.std()
    
    z = (sampmean - popmean)/(pop_std/np.sqrt(sample_size))

    qval = percent/100
    #Percent-Point Function, returns a discrete value that is less than or equal to the given probability.
    if (tail == 2):
      ppf = np.abs(stats.norm.ppf(q=(qval/2)))
      z = np.abs(z)
    else:
      if tail_region == "l":
        ppf = stats.norm.ppf(q=qval)
      else:
        ppf = stats.norm.ppf(q=1-qval)

    if verbose:
      print("Z value: {} \nTable value: {} ".format(z, ppf))
      if(tail_region == "u" or tail == 2):
        if(z < ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")
      else:
        if(z > ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")

    return z, ppf

In [None]:
ztest(drone.Dist, percent = 5)

Z value: 1.0190142151152326 
Table value: 1.9599639845400545 
Accept NUll Hypothesis.


(1.0190142151152326, 1.9599639845400545)

# T-Test

**Formula:-**

T = m1 - m2/sqrt(var1/n1 +var2/n2)

T = sampmean - popmean/(std/sqrt(n))

In [None]:
class T_test:
  def dependent(self, col1, col2, percent = 10, tail = 2, tail_region = "u", verbose = True):
    self.diff = col1 - col2
    self.n = self.diff.count()
    self.d = self.n - 1

    self.t = (self.diff.mean()/(self.diff.std()/np.sqrt(self.n)))
    
    self.qval = percent/100
    if (tail == 2):
      self.ppf = np.abs(stats.t.ppf(q=(self.qval/2), df=self.d))
      self.t = np.abs(self.t)
    else:
      if tail_region == "l":
        self.ppf = stats.t.ppf(q=self.qval, df=self.d)
      else:
        self.ppf = stats.t.ppf(q=1-self.qval, df=self.d)

    if verbose:
      print("T value: {} \nDf: {} \nTable value: {} ".format(self.t, self.d, self.ppf))
      if(tail_region == "u" or tail == 2):
        if(self.t < self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")
      else:
        if(self.t > self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")

    return self.t, self.d, self.ppf

  def independent(self, col1, col2, percent = 10, tail = 2, tail_region = "u", verbose = True):
    self.m1, self.m2 = col1.mean(), col2.mean()
    self.var1, self.var2 = col1.var(), col2.var()
    self.n1, self.n2 = col1.count(), col2.count()

    self.d = (self.n1 + self.n2) - 2
    self.sp2 = ((self.n1*self.var1) + (self.n2*self.var2))/self.d
    self.t = ((self.m1 - self.m2)/(np.sqrt((self.sp2/self.n1) + (self.sp2/self.n2))))
    
    self.qval = percent/100
    if (tail == 2):
      self.ppf = np.abs(stats.t.ppf(q=(self.qval/2), df=self.d))
      self.t = np.abs(self.t)
    else:
      if tail_region == "l":
        self.ppf = stats.t.ppf(q=self.qval, df=self.d)
      else:
        self.ppf = stats.t.ppf(q=1-self.qval, df=self.d)

    if verbose:
      print("T value: {} \nDf: {} \nTable value: {} ".format(self.t, self.d, self.ppf))
      if(tail_region == "u" or tail == 2):
        if(self.t < self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")
      else:
        if(self.t > self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")

    return self.t, self.d, self.ppf


  def one_samp(self, pop, sample_size = 30, percent = 10, tail = 2, tail_region = "u", verbose = True):
    self.samp = pop.sample(n = sample_size)
    self.sampmean, self.popmean = self.samp.mean(), pop.mean()
    self.samp_std = self.samp.std()
    self.n = self.samp.count()
    
    self.t = (self.sampmean - self.popmean)/(self.samp_std / np.sqrt(self.n))
    self.d = self.n-1

    self.qval = percent/100
    if (tail == 2):
      self.ppf = np.abs(stats.t.ppf(q=(self.qval/2), df=self.d))
      self.t = np.abs(self.t)
    else:
      if tail_region == "l":
        self.ppf = stats.t.ppf(q=self.qval, df=self.d)
      else:
        self.ppf = stats.t.ppf(q=1-self.qval, df=self.d)

    if verbose:
      print("T value: {} \nDf: {} \nTable value: {} ".format(self.t, self.d, self.ppf))
      if(tail_region == "u" or tail == 2):
        if(self.t < self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")
      else:
        if(self.t > self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")

    return self.t, self.d, self.ppf
    


In [None]:
df1 = pd.read_csv("Drone_dist.csv")
dep = df1.sample(n = 20)
dep.head()

Unnamed: 0,Dist
27,268
23,268
29,262
55,286
57,280


In [None]:
t = T_test()
t.one_samp(df1.Dist)


T value: 1.4602816554305207 
Df: 29 
Table value: 1.6991270265334977 
Accept NUll Hypothesis.


(1.4602816554305207, 29, 1.6991270265334977)

In [None]:
golf = pd.read_csv("Golf.csv")
golf.head()

Unnamed: 0,Current,New
0,264,277
1,261,269
2,267,263
3,272,266
4,258,262


**Current** - *Driving distances of golf balls without coating*

**New** - *Driving distances of golf balls with coating*

**H0** : No improvement on driving distances of golf balls on coating

**H1** : Improvement on driving distances of golf balls on coating

In [None]:
t.independent(golf.Current, golf.New)

T value: 1.3116519774972133 
Df: 78 
Table value: 1.6646246444385244 
Accept NUll Hypothesis.


(1.3116519774972133, 78, 1.6646246444385244)

In [None]:
t.dependent(golf.Current, golf.New)

T value: 1.2769699827911767 
Df: 39 
Table value: 1.6848751194974 
Accept NUll Hypothesis.


(1.2769699827911767, 39, 1.6848751194974)

# F-Test

In [None]:
df = pd.read_csv("Golf.csv")
df = df.sample(12)

In [None]:
class F_test:

  def ftest(self, col1, col2, percent = 10, tail = 2, tail_region = "u", verbose = True):
    self.var1, self.var2 = col1.var(), col2.var()
    self.n1, self.n2 = col1.count(), col2.count()
    

    if self.var1>=self.var2:
      self.dfn = self.n1 - 1
      self.dfd = self.n2 - 1
      self.F = self.var1/self.var2
    elif self.var1<self.var2:
      self.dfn = self.n2 - 1
      self.dfd = self.n1 - 1
      self.F = self.var2/self.var1

    self.qval = percent/100
    if (tail == 2):
      self.ppf = np.abs(stats.f.ppf(q=(self.qval/2), dfn=self.dfn, dfd=self.dfd))
      self.F = np.abs(self.F)
    else:
      if tail_region == "l":
        self.ppf = stats.f.ppf(q=self.qval, dfn=self.dfn, dfd=self.dfd)
      else:
        self.ppf = stats.f.ppf(q=1-self.qval, dfn=self.dfn, dfd=self.dfd)

    if verbose:
      print("F value: {} \nDfn: {} \nDfd: {} \nTable value: {} ".format(self.F, self.dfn, self.dfd, self.ppf))
      if(tail_region == "u" or tail == 2):
        if(self.F < self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")
      else:
        if(self.F > self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")

    return self.F, self.dfn, self.dfd, self.ppf

In [None]:
f = F_test()
f.ftest(df.Current, df.New)

F value: 1.2341862682771776 
Dfn: 11 
Dfd: 11 
Table value: 0.3548703598838791 
Reject NUll Hypothesis.


(1.2341862682771776, 11, 11, 0.3548703598838791)

# ANOVA

In [None]:
class ANOVA:

  def one_anova(self, *cols, percent = 10, tail = 1, tail_region = "u", verbose = True):
    self.anova = pd.DataFrame(index=("Between", "Within", "Total"))
    self.T = []
    self.N = 0
    self.x2 = 0
    for col in cols:
        self.T.append(col.sum())
        self.N += col.count()
        self.x2 += np.square(col).sum()

    self.t2byn = np.square(self.T).mean()
    self.ssbetween = (self.t2byn-(np.square(np.sum(self.T))/self.N))
    self.sswithin = (self.x2 - self.t2byn)
    self.sstotal = self.ssbetween + self.sswithin

    self.dftotal = self.N - 1
    self.dfbetween = len(cols) - 1
    self.dfwithin = self.N - len(cols)

    self.msbetween = self.ssbetween/self.dfbetween
    self.mswithin = self.sswithin/self.dfwithin
    
    if self.msbetween>=self.mswithin:
      self.dfn = self.dfbetween
      self.dfd = self.dfwithin
      self.F = self.msbetween/self.mswithin
    elif self.msbetween<self.mswithin:
      self.dfn = self.dfwithin
      self.dfd = self.dfbetween
      self.F = self.mswithin/self.msbetween

    self.qval = percent/100
    if (tail == 2):
      self.ppf = np.abs(stats.f.ppf(q=(self.qval/2), dfn=self.dfn, dfd=self.dfd))
      self.F = np.abs(self.F)
    else:
      if tail_region == "l":
        self.ppf = stats.f.ppf(q=self.qval, dfn=self.dfn, dfd=self.dfd)
      else:
        self.ppf = stats.f.ppf(q=1-self.qval, dfn=self.dfn, dfd=self.dfd)

    if verbose:
      self.anova["SS"] = self.ssbetween, self.sswithin, self.sstotal
      self.anova["df"] = self.dfbetween, self.dfwithin, self.dftotal
      self.anova["MS"] = self.msbetween, self.mswithin, "-"
      self.anova["F"] = "-", self.F, "-"
      self.anova["Table Value"] = "-", self.ppf, "-"
      display(self.anova)
      if(tail_region == "u" or tail == 2):
        if(self.F < self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")
      else:
        if(self.F > self.ppf):
          print("Accept NUll Hypothesis.")
        else:  
          print("Reject NUll Hypothesis.")
      
    return self.F, self.dfn, self.dfd, self.ppf

In [None]:
anova = pd.read_csv("anova1.csv")
anova.head()

Unnamed: 0,g1,g2,g3
0,0,3,6
1,4,6,8
2,2,6,10


In [None]:
a = ANOVA()
a.one_anova(anova.g1, anova.g2, anova.g3,percent=5)

Unnamed: 0,SS,df,MS,F,Table Value
Between,54.0,2,27.0,-,-
Within,22.0,6,3.666667,7.363636,5.143253
Total,76.0,8,-,-,-


Reject NUll Hypothesis.


(7.363636363636364, 2, 6, 5.143252849784718)

# Table Value

In [None]:
df = 6
# table value for left tail at 1%
print("Table value for Left tail at 1%: ",stats.t.ppf(q=0.1, df=df))
# table value for right tail at 1%
print("Table value for Right tail at 1%: ",-stats.t.ppf(q=0.1, df=df))
# table value for right tail at 1%
print("Table value for Two tail at 1%: ",np.abs(stats.t.ppf(q=0.1/2, df=df)))

Table value for Left tail at 1%:  -1.4397557472577693
Table value for Right tail at 1%:  1.4397557472577693
Table value for Two tail at 1%:  1.943180280392782


In [None]:
stats.f.ppf(q=1-0.05,dfn = 2, dfd = 6)

5.143252849784718

In [None]:
def two_anova(*cols, percent = 10, tail = 1, tail_region = "u", verbose = True):
    anova = pd.DataFrame(index=("Between", "Within", "Subject", "Error", "Total"))
    T = []
    N = 0
    x2 = 0
    s = cols[0]*0
    for col in cols:
        T.append(col.sum())
        N += col.count()
        x2 += np.square(col).sum()
        s += col


    g2byn = np.square(np.sum(T))/N
    t2byn = np.square(T).mean()
    ssbetween = (t2byn-g2byn)
    sssubject = (np.square(s).mean()-g2byn)
    sswithin = (x2 - t2byn)
    sserror = sswithin - sssubject
    sstotal =  x2 - g2byn

    dftotal = N - 1
    dfbetween = len(cols) - 1
    dfwithin = N - len(cols)
    dfsubject = len(s) - 1
    dferror = np.abs(dfwithin - dfsubject)

    msbetween = ssbetween/dfbetween
    mserror = sserror/dferror

    if msbetween>=mserror:
        dfn = dfbetween
        dfd = dferror
        F = msbetween/mserror
    elif msbetween<mserror:
        dfn = dferror
        dfd = dfbetween
        F = mserror/msbetween

    qval = percent/100
    if (tail == 2):
        ppf = np.abs(stats.f.ppf(q=(qval/2), dfn=dfn, dfd=dfd))
        F = np.abs(F)
    else:
        if tail_region == "l":
            ppf = stats.f.ppf(q=qval, dfn=dfn, dfd=dfd)
        else:
            ppf = stats.f.ppf(q=1-qval, dfn=dfn, dfd=dfd)

    if verbose:
        anova["SS"] = ssbetween, sswithin, sssubject, sserror, sstotal
        anova["df"] = dfbetween, dfwithin, dfsubject, dferror, dftotal
        anova["MS"] = msbetween, "-", "-", mserror, "-"
        anova["F"] = F, "-", "-", "-", "-"
        anova["Table Value"] = ppf, "-", "-", "-", "-"
        display(anova)
        if(tail_region == "u" or tail == 2):
            if(F < ppf):
                print("Accept NUll Hypothesis.")
            else:  
                print("Reject NUll Hypothesis.")
        else:
            if(F > ppf):
                print("Accept NUll Hypothesis.")
            else:  
                print("Reject NUll Hypothesis.")
        
    return F, dfn, dfd, ppf

In [None]:
two_anova(anova.g1, anova.g2, anova.g3)

Unnamed: 0,SS,df,MS,F,Table Value
Between,54.0,2,27.0,27.0,4.324555
Within,22.0,6,-,-,-
Subject,18.0,2,-,-,-
Error,4.0,4,1.0,-,-
Total,76.0,8,-,-,-


Reject NUll Hypothesis.


(27.0, 2, 4, 4.32455532033676)

In [None]:
a = pd.read_csv("anova3.csv")
a.head()


Unnamed: 0,row,column,inter,value
0,r1,g1,D,8
1,r1,g1,D,8
2,r2,g1,ND,9
3,r2,g1,ND,11
4,r1,g2,D,8


In [None]:

def three_anova(data, row, column, interaction, value, percent = 10, tail = 1, tail_region = "u", verbose = True):
    anova = pd.DataFrame(index=("Column", "Row", "Interaction", "Within", "Total"))
   
    tcellbyn = (np.square(data.groupby(by=[interaction, column]).sum().reset_index()[value])/data.groupby(by=[interaction, column]).count().reset_index()[value]).sum()
    tcolumn2byn = (np.square(data.groupby(by=[column]).sum().reset_index()[value])/data.groupby(by=[column]).count().reset_index()[value]).sum()
    trow2byn = (np.square(data.groupby(by=[row]).sum().reset_index()[value])/data.groupby(by=[row]).count().reset_index()[value]).sum()
    
    G = data.value.sum()
    N = data.value.count()
    g2byn = (G**2)/N
    x2 = np.square(data.value).sum()
    
    sstotal = x2 - g2byn
    ssbetween = tcellbyn - g2byn
    sswithin = x2 - tcellbyn
    sscolumn = tcolumn2byn - g2byn
    ssrow = trow2byn - g2byn
    ssinteraction = ssbetween - (sscolumn + ssrow)

    dftotal = N - 1
    dfcolumn = len(pd.unique(data[column])) - 1
    dfrow = len(pd.unique(data[row])) - 1
    dfinteraction = dfcolumn * dfrow
    dfwithin = N - ((dfcolumn+1) * (dfrow+1))

    mscolumn = sscolumn/dfcolumn
    msrow = ssrow/dfrow
    msinteraction = ssinteraction/dfinteraction
    mswithin = sswithin/dfwithin

    fcolumn, columnppf = ftest(mscolumn, mswithin, dfcolumn, dfrow, "F-Column", percent, tail, tail_region, verbose)
    frow, rowppf = ftest(msrow, mswithin, dfrow, dfrow, "F-Row", percent, tail, tail_region, verbose)
    finteraction, interactionppf = ftest(msinteraction, mswithin, dfinteraction, dfwithin, "F-Interaction", percent, tail, tail_region, verbose)
    
    if verbose:
        anova["SS"] = sscolumn, ssrow, ssinteraction, sswithin, sstotal
        anova["df"] = dfcolumn, dfrow, dfinteraction, dfwithin, dftotal
        anova["MS"] = mscolumn, msrow, msinteraction, mswithin, "-"
        anova["F"] = fcolumn, frow, finteraction,"-", "-"
        anova["Table Value"] = columnppf, rowppf, interactionppf, "-", "-"
        display(anova)

    return fcolumn, frow, finteraction, columnppf, rowppf, interactionppf, dfcolumn, dfrow, dfinteraction
        

def ftest(msx, msy, dfx, dfy, title, percent, tail, tail_region, verbose):
    if msx>=msy:
        dfn = dfx
        dfd = dfy
        F = msx/msy
    elif msx<msy:
        dfn = dfy
        dfd = dfx
        F = msy/msx

    qval = percent/100
    if (tail == 2):
        ppf = np.abs(stats.f.ppf(q=(qval/2), dfn=dfn, dfd=dfd))
        F = np.abs(F)
    else:
        if tail_region == "l":
            ppf = stats.f.ppf(q=qval, dfn=dfn, dfd=dfd)
        else:
            ppf = stats.f.ppf(q=1-qval, dfn=dfn, dfd=dfd)

    if verbose:
        print(title + ":-")
        if(tail_region == "u" or tail == 2):
            if(F < ppf):
                print("Accept NUll Hypothesis.")
            else:  
                print("Reject NUll Hypothesis.")
        else:
            if(F > ppf):
                print("Accept NUll Hypothesis.")
            else:  
                print("Reject NUll Hypothesis.")
            

    return F, ppf


three_anova(a, "row", "column", "inter", "value")

F-Column:-
Accept NUll Hypothesis.
F-Row:-
Accept NUll Hypothesis.
F-Interaction:-
Reject NUll Hypothesis.


Unnamed: 0,SS,df,MS,F,Table Value
Column,72.0,2,36.0,6.75,49.5
Row,192.0,1,192.0,36.0,39.863458
Interaction,56.0,2,28.0,5.25,3.463304
Within,32.0,6,5.333333,-,-
Total,352.0,11,-,-,-


(6.75,
 36.0,
 5.25,
 49.50000000000005,
 39.86345818906144,
 3.4633040700956514,
 2,
 1,
 2)