# Basic Statistic Testing

## Hypothesis Testing, Statistical significance and using scipy in python
### Hypothesis Testing is the core of data analysis experimentation

In [1]:
# The goal of hypothesis testing is to determine whether the two conditions we have in an experiment have resulted in
# different impacts

import re

import numpy as np
import pandas as pd

# to bring libraries from the scipy module
from scipy import stats

In [4]:
# SciPy has many interesting and important libraries needed in the field of data science, and almost all of them
# are widely used

# In hypothesis testing there are two statements of interest: the first is our actual explanation (known as the
# alternative hypothesis) and the other one is the null hypothesis, which states that the explanation we have is
# not sufficient. In hypothesis testing we check whether the data give us enough evidence to reject the null hypothesis.


# Read the grades data set from disk and preview its first rows.
grades_csv = "data/grades.csv"
df = pd.read_csv(grades_csv)
df.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000


In [14]:
# If the data show a statistically significant difference, we reject the null hypothesis and take the
# alternative explanation as the answer; otherwise we fail to reject the null hypothesis.

# DataFrame.shape holds the size of the dataframe as (rows, columns).
shape_message = f'There are {df.shape[0]} rows and {df.shape[1]} columns in the dataframe'
print(shape_message)

# DataFrame.ndim is the number of dimensions of the dataframe; shown as the cell output.
df.ndim

There are 2315 rows and 13 columns in the dataframe


2

In [21]:
# Parse the assignment 1 submission timestamps, then keep the rows submitted before '2016'.
assignment1_submitted = pd.to_datetime(df['assignment1_submission'])
early_finishers = df[assignment1_submitted < '2016']
early_finishers.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
0,B73F2C11-70F0-E37D-8B10-1D20AFED50B1,92.733946,2015-11-02 06:55:34.282000000,83.030552,2015-11-09 02:22:58.938000000,67.164441,2015-11-12 08:58:33.998000000,53.011553,2015-11-16 01:21:24.663000000,47.710398,2015-11-20 13:24:59.692000000,38.168318,2015-11-22 18:31:15.934000000
1,98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1,86.790821,2015-11-29 14:57:44.429000000,86.290821,2015-12-06 17:41:18.449000000,69.772657,2015-12-10 08:54:55.904000000,55.098125,2015-12-13 17:32:30.941000000,49.588313,2015-12-19 23:26:39.285000000,44.629482,2015-12-21 17:07:24.275000000
4,5ECBEEB6-F1CE-80AE-3164-E45E99473FB4,64.8138,2015-12-13 17:06:10.750000000,51.49104,2015-12-14 12:25:12.056000000,41.932832,2015-12-29 14:25:22.594000000,36.929549,2015-12-28 01:29:55.901000000,33.236594,2015-12-29 14:46:06.628000000,33.236594,2016-01-05 01:06:59.546000000
5,D09000A0-827B-C0FF-3433-BF8FF286E15B,71.647278,2015-12-28 04:35:32.836000000,64.05255,2016-01-03 21:05:38.392000000,64.75255,2016-01-07 08:55:43.692000000,57.467295,2016-01-11 00:45:28.706000000,57.467295,2016-01-11 00:54:13.579000000,57.467295,2016-01-20 19:54:46.166000000
8,C9D51293-BD58-F113-4167-A7C0BAFCB6E5,66.595568,2015-12-25 02:29:28.415000000,52.916454,2015-12-31 01:42:30.046000000,48.344809,2016-01-05 23:34:02.180000000,47.444809,2016-01-02 07:48:42.517000000,37.955847,2016-01-03 21:27:04.266000000,37.955847,2016-01-19 15:24:31.060000000


In [34]:
# Late finishers are all rows that are not early finishers: build a boolean mask marking the
# early rows by index, then invert it with '~' so only the False (non-early) rows are kept.
is_early = df.index.isin(early_finishers.index)
late_finishers = df[~is_early]
late_finishers.head()

Unnamed: 0,student_id,assignment1_grade,assignment1_submission,assignment2_grade,assignment2_submission,assignment3_grade,assignment3_submission,assignment4_grade,assignment4_submission,assignment5_grade,assignment5_submission,assignment6_grade,assignment6_submission
2,D0F62040-CEB0-904C-F563-2F8620916C4E,85.512541,2016-01-09 05:36:02.389000000,85.512541,2016-01-09 06:39:44.416000000,68.410033,2016-01-15 20:22:45.882000000,54.728026,2016-01-11 12:41:50.749000000,49.255224,2016-01-11 17:31:12.489000000,44.329701,2016-01-17 16:24:42.765000000
3,FFDF2B2C-F514-EF7F-6538-A6A53518E9DC,86.030665,2016-04-30 06:50:39.801000000,68.824532,2016-04-30 17:20:38.727000000,61.942079,2016-05-12 07:47:16.326000000,49.553663,2016-05-07 16:09:20.485000000,49.553663,2016-05-24 12:51:18.016000000,44.598297,2016-05-26 08:09:12.058000000
6,3217BE3F-E4B0-C3B6-9F64-462456819CE4,87.498744,2016-03-05 11:05:25.408000000,69.998995,2016-03-09 07:29:52.405000000,55.999196,2016-03-16 22:31:24.316000000,50.399276,2016-03-18 07:19:26.032000000,45.359349,2016-03-19 10:35:41.869000000,45.359349,2016-03-23 14:02:00.987000000
7,F1CB5AA1-B3DE-5460-FAFF-BE951FD38B5F,80.57609,2016-01-24 18:24:25.619000000,72.518481,2016-01-27 13:37:12.943000000,65.266633,2016-01-30 14:34:36.581000000,65.266633,2016-02-03 22:08:49.002000000,65.266633,2016-02-16 14:22:23.664000000,65.266633,2016-02-18 08:35:04.796000000
9,E2C617C2-4654-622C-AB50-1550C4BE42A0,59.270882,2016-03-06 12:06:26.185000000,59.270882,2016-03-13 02:07:25.289000000,53.343794,2016-03-17 07:30:09.241000000,53.343794,2016-03-20 21:45:56.229000000,42.675035,2016-03-27 15:55:04.414000000,38.407532,2016-03-30 20:33:13.554000000


In [36]:
# there are many other ways to get the late_finishers data,
# for example by using a join, or by writing a function and using the apply method

# a pandas DataFrame object has a variety of statistical methods

# Print the mean assignment 1 grade of each group, early finishers first.
for finisher_group in (early_finishers, late_finishers):
    print(finisher_group['assignment1_grade'].mean())

74.94728457024303
74.0450648477065


In [38]:
# To decide whether the above numbers are the same or merely similar, and what "similar" actually means, Student's
# t-test is used. The t-test allows us to form an alternative hypothesis and a null hypothesis. In this case the
# alternative hypothesis is that the means are different, and the null hypothesis, which we test, is that they are not.

### While doing hypothesis testing we have to choose a significance level, a threshold for how much of a chance we are willing to accept. This threshold is generally known as 'alpha'. In this example, an alpha of 0.05 (5%) is used.

In [42]:
# The SciPy library contains a number of statistical tests and forms a basis for hypothesis testing in Python.
# In this example the function 'ttest_ind()' is used, which performs an independent-samples t-test, that is, the
# populations are not related to one another.

# The ttest_ind() function returns a 't-statistic' and a 'p-value'.
# Here the 'p-value' is the most important: it is the probability (between 0 and 1) of observing a difference at
# least this large if the null hypothesis were true. It is not the probability that the null hypothesis itself is true.

# Independent two-sample t-test on the assignment 1 grades of the two groups.
early_assignment1 = early_finishers['assignment1_grade']
late_assignment1 = late_finishers['assignment1_grade']
stats.ttest_ind(early_assignment1, late_assignment1, equal_var=True)

# here we fail to reject the null hypothesis, as the p-value is greater than alpha (0.18 > 0.05)

Ttest_indResult(statistic=1.322354085372139, pvalue=0.18618101101714551)

In [44]:
# Repeat the independent t-test (early vs. late finishers) for assignments 2 to 6.
# Looping over the grade columns replaces five copy-pasted calls; the printed results
# and their order are unchanged.
later_grade_columns = (
    'assignment2_grade',
    'assignment3_grade',
    'assignment4_grade',
    'assignment5_grade',
    'assignment6_grade',
)
for grade_col in later_grade_columns:
    print(stats.ttest_ind(early_finishers[grade_col], late_finishers[grade_col], equal_var=True))

Ttest_indResult(statistic=1.2514717608216366, pvalue=0.2108889627004424)
Ttest_indResult(statistic=1.6133726558705392, pvalue=0.10679998102227865)
Ttest_indResult(statistic=0.049671157386456125, pvalue=0.960388729789337)
Ttest_indResult(statistic=-0.05279315545404755, pvalue=0.9579012739746492)
Ttest_indResult(statistic=-0.11609743352612056, pvalue=0.9075854011989656)


In [45]:
# Two other techniques, 'CONFIDENCE INTERVALS' and 'BAYESIAN ANALYSES', are being used more and more regularly,
# because p-values alone are often deemed insufficient to tell us about the interactions that are happening

In [47]:
# Two independent 100x100 dataframes of uniform random numbers in [0, 1).
# A seeded generator makes both frames, and every test run on them later, reproducible
# after Restart Kernel -> Run All; an unseeded draw gives different results on each run.
rng = np.random.default_rng(42)
df1 = pd.DataFrame(rng.random((100, 100)))
df2 = pd.DataFrame(rng.random((100, 100)))

df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.737742,0.814962,0.640398,0.270835,0.019593,0.297571,0.245998,0.154694,0.971217,0.444036,...,0.69078,0.108613,0.731652,0.903148,0.55923,0.57932,0.568704,0.113806,0.795,0.655051
1,0.480616,0.982333,0.112794,0.644915,0.95518,0.125455,0.369186,0.620408,0.909985,0.501339,...,0.994616,0.475779,0.811612,0.161141,0.61008,0.27346,0.539704,0.186871,0.390267,0.273291
2,0.965802,0.51001,0.957409,0.036173,0.482298,0.547375,0.909192,0.163311,0.361668,0.391502,...,0.367315,0.645841,0.158626,0.849448,0.8571,0.025249,0.147301,0.048466,0.031331,0.418077
3,0.630739,0.837152,0.533006,0.298975,0.440949,0.185318,0.779071,0.592798,0.607459,0.437164,...,0.285946,0.933572,0.969268,0.497714,0.277203,0.628701,0.325656,0.625722,0.846777,0.944855
4,0.937728,0.775191,0.895909,0.398394,0.700462,0.388664,0.497986,0.767387,0.201803,0.079042,...,0.083156,0.45991,0.933277,0.689044,0.268548,0.922013,0.490621,0.476418,0.713129,0.507144


In [49]:
# Preview the first five rows of the second random dataframe.
df2.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.11677,0.151784,0.694046,0.864869,0.662122,0.655894,0.05157,0.761789,0.940058,0.77841,...,0.661513,0.983479,0.705985,0.489412,0.431053,0.243748,0.369409,0.251594,0.714185,0.258346
1,0.994273,0.673079,0.171347,0.103016,0.308203,0.688874,0.690804,0.269502,0.771136,0.233186,...,0.251847,0.573503,0.296776,0.88143,0.926267,0.787446,0.552799,0.687244,0.27016,0.836611
2,0.86671,0.728005,0.854637,0.01504,0.321054,0.468848,0.843314,0.694922,0.223176,0.39403,...,0.795449,0.750958,0.865581,0.264349,0.660378,0.785411,0.932063,0.360363,0.138483,0.181517
3,0.240954,0.972644,0.363679,0.711952,0.908716,0.223679,0.714073,0.880197,0.414422,0.368392,...,0.258303,0.425435,0.247567,0.777389,0.158074,0.157273,0.169306,0.552425,0.757878,0.156713
4,0.322036,0.187015,0.539308,0.289079,0.512673,0.32649,0.31823,0.476281,0.574689,0.104588,...,0.96974,0.302199,0.080677,0.949355,0.7317,0.056254,0.956928,0.240499,0.368092,0.209137


In [53]:
# For a given column in df1, is it the same as the matching column in df2?

def test_columns(alpha=0.1):
    
    num_diff = 0
    
    for col in df1.columns:
        
        teststat, pvalue = stats.ttest_ind(df1[col], df2[col], equal_var=True)
        # to see if the pvalue is less than the alpha
        if pvalue <= alpha:
            print(f'Column {col} is statistically different at aplha = {alpha}, pval = {pvalue}')
            num_diff += 1
# to print summary stats
    (print(f'Total number of columns that were different {num_diff} which was {num_diff/len(df.columns)} %'))

# Run the column-wise comparison at the default significance threshold, spelled out explicitly.
test_columns(alpha=0.1)

Column 12 is statistically different at aplha = 0.1, pval = 0.05542590297232559
Column 20 is statistically different at aplha = 0.1, pval = 0.07819918991068621
Column 23 is statistically different at aplha = 0.1, pval = 0.08218997514350773
Column 44 is statistically different at aplha = 0.1, pval = 0.009375591011109966
Column 49 is statistically different at aplha = 0.1, pval = 0.04393879638772188
Column 56 is statistically different at aplha = 0.1, pval = 0.043845834943461726
Column 65 is statistically different at aplha = 0.1, pval = 0.09671997185560195
Column 82 is statistically different at aplha = 0.1, pval = 0.0425101762416473
Column 89 is statistically different at aplha = 0.1, pval = 0.05629062478349562
Total number of columns that were different 9 which was 0.6923076923076923 %


# IF THE ALPHA IS LARGER, THE TEST IS LESS STRICT: MORE DIFFERENCES ARE DECLARED SIGNIFICANT, INCLUDING MORE FALSE POSITIVES

# THE P-VALUE IS THE PROBABILITY OF OBSERVING DATA AT LEAST AS EXTREME AS OURS IF THE NULL HYPOTHESIS WERE TRUE, AND IF THE P-VALUE IS LARGE, THAT IS CLOSER TO 1, WE HAVE LESS EVIDENCE TO REJECT THE NULL HYPOTHESIS (HENCE WE FAIL TO REJECT THE NULL HYPOTHESIS)

In [111]:
def result(s='ACAABAACAAABACDBADDDFSDDDFFSSSASDAFAAACBAAAFASD'):
    """Return every non-'A' character that is immediately followed by 'AAA'.

    Args:
        s: String to scan, one character per grade. Defaults to the sample
            grade sequence used in this notebook.

    Returns:
        list of str: The matching characters, in the order they occur in ``s``.
    """
    # `[^A\s]` captures one character that is neither 'A' nor whitespace.
    # `(?=A{3})` is a lookahead requiring three 'A's right after it without consuming them.
    # A character class such as `[?=A]` would instead accept literal '?' and '=' characters.
    pattern = r'([^A\s])(?=A{3})'
    return [match.group(1) for match in re.finditer(pattern, s)]

# Call result(); as the last expression in the cell, its return value is shown as the cell output.
result()

['C', 'F', 'B']