In [1]:
import pandas as pd
import random
import numpy as np
import math

In [2]:
import warnings
# Suppress every Python warning from this point on (no category or module
# filter is given, so nothing is exempt).
# NOTE(review): presumably added to hide pandas chained-assignment warnings
# raised by later cells -- confirm, and prefer a narrower filter if so.
warnings.filterwarnings("ignore")

In [3]:
# Load the chick-weight table and discard the 'Unnamed: 0' column that the CSV
# carries alongside the real data columns.
chickweight = pd.read_csv('chick_weight.csv').drop(columns=['Unnamed: 0'])
# Keep only the rows recorded for diet 1; the rest of the notebook works on this subset.
chickweight_one_diet = chickweight.loc[chickweight['Diet'] == 1]

In [4]:
def remove_missing_data_row(data):
    """Return a copy of ``data`` without the rows whose ``weight`` equals 0.

    A weight of exactly 0 is the sentinel this notebook uses to mark a
    measurement as missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``weight`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with only the non-zero-weight rows. Index labels of the
        surviving rows are kept and ``data`` itself is not modified.
    """
    # A single boolean mask selects rows by position, so this also works when
    # index labels repeat (a per-row ``.loc[label]`` lookup would return
    # several rows there, and dropping by label would remove all of them).
    return data.loc[data["weight"] != 0].copy()

In [5]:
def get_slope_from_data(df):
    """Return the slope of the least-squares line of ``weight`` against ``Time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Time`` and ``weight`` columns whose values can be
        converted to float.

    Returns
    -------
    numpy.float64
        Slope of the degree-1 polynomial fitted with ``np.polyfit``.
    """
    times = np.array(df["Time"], dtype=float)
    weights = np.array(df["weight"], dtype=float)

    # np.polyfit returns coefficients highest degree first, so index 0 of a
    # degree-1 fit is the slope (index 1 would be the intercept).
    return np.polyfit(times, weights, 1)[0]

def get_slope_from_x_y(X, Y):
    """Return the slope of the least-squares line through the points (X, Y).

    Parameters
    ----------
    X, Y : array-like
        Coordinates of the points, passed straight to ``np.polyfit``.

    Returns
    -------
    numpy.float64
        Slope of the fitted degree-1 polynomial.
    """
    # np.polyfit returns coefficients highest degree first: [slope, intercept].
    return np.polyfit(X, Y, 1)[0]

def get_slope_diff_from_data(portion_1_data, portion_2_data):
    """Return slope(portion_1_data) minus slope(portion_2_data).

    Columns are read by position rather than by name: column 0 is used as the
    weight and column 1 as the time, so both frames must be laid out that way.
    Each slope comes from ``get_slope_from_x_y``.
    """
    weight_col, time_col = 0, 1

    def _time_and_weight(frame):
        # Pull the two columns out of the raw value matrix as float arrays.
        cells = frame.values
        return (
            np.array(cells[:, time_col], dtype=float),
            np.array(cells[:, weight_col], dtype=float),
        )

    first_x, first_y = _time_and_weight(portion_1_data)
    second_x, second_y = _time_and_weight(portion_2_data)

    first_slope = get_slope_from_x_y(first_x, first_y)
    second_slope = get_slope_from_x_y(second_x, second_y)
    return first_slope - second_slope


In [6]:
def get_data_with_missing_values(data, portion_to_remove):
    """Return a copy of ``data`` with a random subset of weights set to 0.

    Each row is blanked independently with probability ``portion_to_remove``;
    a weight of 0 is the sentinel used to mimic a missing measurement.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``weight`` column. It is not modified.
    portion_to_remove : float
        Per-row probability of blanking the weight, e.g. 0.1, 0.2, etc.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` (same index and columns) with some weights set to 0.
    """
    chickweight_with_missing_values = data.copy()

    # One draw per row, in row order. The mask is positional, so it does not
    # matter what the index labels are (a filtered frame rarely has 0..n-1).
    missing_mask = np.array(
        [random.random() < portion_to_remove
         for _ in range(len(chickweight_with_missing_values))],
        dtype=bool,
    )

    # Write through the frame itself; assigning into a column extracted from
    # the frame is not guaranteed to propagate back to it.
    chickweight_with_missing_values.loc[missing_mask, "weight"] = 0
    return chickweight_with_missing_values

In [7]:
def find_median(data):
    """Return the median of the non-zero entries of ``data``.

    Zeros are treated as missing and ignored. With an even number of remaining
    values the two middle values are averaged. Raises ``IndexError`` when no
    non-zero value is left.
    """
    observed = sorted(value for value in data if value != 0)
    half, is_odd = divmod(len(observed), 2)
    if is_odd:
        return observed[half]
    # Even count: average the two values that straddle the middle.
    return (observed[half] + observed[half - 1]) / 2

def find_mean(data):
    """Return the mean of the non-zero entries of ``data``.

    Zeros count as missing: they add nothing to the total and are left out of
    the divisor. Raises ``ZeroDivisionError`` when every entry is zero.
    """
    present = sum(1 for ele in data if not ele == 0)
    return sum(data) / present
  
def replace(data, replacement):
    """Return ``data`` as an array with every 0 swapped for ``replacement``.

    Parameters
    ----------
    data : array-like
        Values in which 0 marks a missing entry. It is not modified.
    replacement : scalar
        Value written in place of each 0.

    Returns
    -------
    numpy.ndarray
        New array of the same length as ``data``.
    """
    values = np.asarray(data)
    # np.where picks a result dtype that can hold both operands, so a
    # fractional mean/median is not truncated when ``data`` holds integers
    # (element-wise assignment into an int array would silently floor it).
    return np.where(values == 0, replacement, values)

In [8]:
def replace_missing_values(data_per_diet, replacement_parameter):
    """Fill zero (missing) weights with a per-time-point mean or median.

    Rows are grouped by their ``Time`` value; inside each group every weight
    equal to 0 is replaced by the mean or median of that group's non-zero
    weights (``find_mean`` / ``find_median``, applied through ``replace``).

    Parameters
    ----------
    data_per_diet : pandas.DataFrame
        Frame with ``weight`` and ``Time`` columns. It is not modified.
    replacement_parameter : str
        ``"mean"`` or ``"median"``.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by ascending ``Time`` with a fresh 0..n-1 index. The
        columns ``weight``, ``Time``, ``Chick``, ``Diet`` come first, followed
        by any other columns of the input.

    Raises
    ------
    ValueError
        If ``replacement_parameter`` is neither ``"mean"`` nor ``"median"``.
    """
    fill_functions = {"mean": find_mean, "median": find_median}
    if replacement_parameter not in fill_functions:
        # Fail with a clear message instead of hitting an unbound local later.
        raise ValueError(
            "replacement_parameter must be 'mean' or 'median', got %r"
            % (replacement_parameter,)
        )
    compute_fill = fill_functions[replacement_parameter]

    template_columns = ["weight", "Time", "Chick", "Diet"]
    filled_groups = []
    # Walk the time points actually present rather than a fixed 0..21 range,
    # so no row is silently dropped because of its Time value.
    for time_point in sorted(data_per_diet["Time"].dropna().unique()):
        # Explicit copy: the weight column of the group is overwritten below
        # and the caller's frame must stay untouched.
        group = data_per_diet.loc[data_per_diet["Time"] == time_point].copy()
        group["weight"] = replace(group["weight"], compute_fill(group["weight"]))
        filled_groups.append(group)

    if not filled_groups:
        return pd.DataFrame(columns=template_columns)

    # Concatenate once at the end instead of growing a frame inside the loop.
    new_df = pd.concat(filled_groups, ignore_index=True)
    ordered_columns = template_columns + [
        col for col in new_df.columns if col not in template_columns
    ]
    return new_df.reindex(columns=ordered_columns)

In [9]:
def get_new_slope(data, replacement_parameter):
    """Treat the missing weights in ``data`` and return the fitted slope.

    ``"remove_missing"`` drops the rows flagged as missing; any other value is
    forwarded to ``replace_missing_values`` as the fill strategy. The slope is
    then computed by ``get_slope_from_data`` on the treated frame.
    """
    if replacement_parameter != "remove_missing":
        treated = replace_missing_values(data, replacement_parameter)
    else:
        treated = remove_missing_data_row(data)
    return get_slope_from_data(treated)

In [10]:
def shuffle_missing_labels(data):
    """Randomly permute the ``missing`` labels within each ``Time`` group.

    Rows sharing a ``Time`` value swap their ``missing`` labels among
    themselves (via ``np.random.permutation``); every other column is left as
    it was.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Time`` and ``missing`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by ascending ``Time`` with a fresh 0..n-1 index. The
        columns ``weight``, ``Time``, ``Chick``, ``Diet``, ``missing`` come
        first, followed by any other columns of the input.
    """
    template_columns = ["weight", "Time", "Chick", "Diet", "missing"]
    shuffled_groups = []
    # Walk the time points actually present rather than a fixed 0..21 range,
    # so no row is silently dropped because of its Time value.
    for time_point in sorted(data["Time"].dropna().unique()):
        # Explicit copy: the label column of the group is overwritten below
        # and the caller's frame must stay untouched.
        group = data.loc[data["Time"] == time_point].copy()
        group["missing"] = np.random.permutation(group["missing"].values)
        shuffled_groups.append(group)

    if not shuffled_groups:
        return pd.DataFrame(columns=template_columns)

    # Concatenate once at the end instead of growing a frame inside the loop.
    new_df = pd.concat(shuffled_groups, ignore_index=True)
    ordered_columns = template_columns + [
        col for col in new_df.columns if col not in template_columns
    ]
    return new_df.reindex(columns=ordered_columns)

In [11]:
def significance_test_missing_data_slope(portion, replacement_parameter):
    """Shuffle test: does treating missing weights change the growth slope?

    A copy of the module-level ``chickweight_one_diet`` gets a random
    ``portion`` of its weights blanked (set to 0). The observed statistic is

        slope(treated data with missing values) - slope(original data)

    where the treatment is selected by ``replacement_parameter``. The original
    and blanked frames are then stacked, the ``missing`` labels are shuffled
    within each time point 100 times, and the function counts how often the
    shuffled slope difference is at least as extreme as the observed one, in
    the observed direction. Results are printed; nothing is returned.

    Parameters
    ----------
    portion : float
        Fraction of weights to blank out; also used as the group label of the
        blanked rows (the untouched rows are labelled 0.0).
    replacement_parameter : str
        ``"remove_missing"``, ``"mean"`` or ``"median"``; forwarded to
        ``get_new_slope``.

    NOTE(review): the observed statistic is computed on treated data, whereas
    the shuffled groups are fitted on the raw weights (including the 0
    sentinels). Confirm this is the intended null distribution.
    """
    original_label = 0.0
    missing_label = portion

    chickweight_with_missing_values = get_data_with_missing_values(chickweight_one_diet, portion_to_remove=portion)
    # Scalar assignment labels every row of the frame it is applied to.
    chickweight_with_missing_values["missing"] = missing_label

    chickweight_original = chickweight_one_diet.copy()
    chickweight_original["missing"] = original_label

    # Sign convention used throughout: (missing-data group) - (original group).
    observed_slope_diff = get_new_slope(chickweight_with_missing_values, replacement_parameter) - get_slope_from_data(chickweight_original)

    data = pd.concat([chickweight_original, chickweight_with_missing_values])

    count = 0
    num_shuffles = 100

    for i in range(num_shuffles):
        if i != 0 and i % 10 == 0:
            print("Done: ", i)
        new_df = shuffle_missing_labels(data)

        original_group = new_df[new_df.missing == original_label]
        missing_group = new_df[new_df.missing == missing_label]

        # Same orientation as the observed statistic (missing - original), so
        # the comparison below counts the correct tail.
        slope_diff = get_slope_diff_from_data(missing_group, original_group)

        if observed_slope_diff < 0 and slope_diff <= observed_slope_diff:
            count += 1
        elif observed_slope_diff >= 0 and slope_diff >= observed_slope_diff:
            count += 1

    ######################################
    #
    # Output
    #
    ######################################
    if observed_slope_diff < 0:
        direction = "less than or equal to"
    else:
        direction = "greater than or equal to"

    print("**********Removing ", portion*100, "% of data****************")

    print("Observed difference of two slopes: %.2f" % observed_slope_diff)
    print(count, "out of", num_shuffles, "experiments had a difference of two slopes", end=" ")
    print(direction, end=" ")
    print("%.2f" % observed_slope_diff, ".")
    print("The chance of getting a difference of two slopes", end=" ")
    print(direction, end=" ")
    print("%.2f" % observed_slope_diff, "is %.4f" % (count / float(num_shuffles)), "\n")

In [12]:
# Fractions of the weights to blank out in each experiment (10% to 60%).
portion_to_remove = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
# One significance test per fraction, treating missing weights by dropping
# their rows. The test draws random numbers and no seed is set in the visible
# cells, so the printed figures change from run to run.
for portion in portion_to_remove:
    significance_test_missing_data_slope(portion, "remove_missing")

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  10.0 % of data****************
Observed difference of two slopes: 0.18
39 out of 100 experiments had a difference of two slopes greater than or equal to 0.18 .
The chance of getting a difference of two slopes greater than or equal to 0.18 is 0.3900 

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  20.0 % of data****************
Observed difference of two slopes: -0.08
47 out of 100 experiments had a difference of two slopes less than or equal to -0.08 .
The chance of getting a difference of two slopes less than or equal to -0.08 is 0.4700 

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  30.0 % of data****************
Observed difference of two slopes: 0.47
35 out of 100 experiments had a difference of two slopes greater than or equal to 0.47 .
The chance 

In [13]:
# Same sweep over ``portion_to_remove`` (defined in an earlier cell), this time
# filling missing weights with the per-time-point mean.
for portion in portion_to_remove:
    significance_test_missing_data_slope(portion, "mean")

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  10.0 % of data****************
Observed difference of two slopes: -0.31
35 out of 100 experiments had a difference of two slopes less than or equal to -0.31 .
The chance of getting a difference of two slopes less than or equal to -0.31 is 0.3500 

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  20.0 % of data****************
Observed difference of two slopes: -0.32
31 out of 100 experiments had a difference of two slopes less than or equal to -0.32 .
The chance of getting a difference of two slopes less than or equal to -0.32 is 0.3100 

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  30.0 % of data****************
Observed difference of two slopes: -0.30
39 out of 100 experiments had a difference of two slopes less than or equal to -0.30 .
The chance of g

In [14]:
# Same sweep over ``portion_to_remove`` (defined in an earlier cell), this time
# filling missing weights with the per-time-point median.
for portion in portion_to_remove:
    significance_test_missing_data_slope(portion, "median")

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  10.0 % of data****************
Observed difference of two slopes: -0.43
27 out of 100 experiments had a difference of two slopes less than or equal to -0.43 .
The chance of getting a difference of two slopes less than or equal to -0.43 is 0.2700 

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  20.0 % of data****************
Observed difference of two slopes: 0.13
47 out of 100 experiments had a difference of two slopes greater than or equal to 0.13 .
The chance of getting a difference of two slopes greater than or equal to 0.13 is 0.4700 

Done:  10
Done:  20
Done:  30
Done:  40
Done:  50
Done:  60
Done:  70
Done:  80
Done:  90
**********Removing  30.0 % of data****************
Observed difference of two slopes: 0.20
40 out of 100 experiments had a difference of two slopes greater than or equal to 0.20 .
The chance 