In [1]:
import pandas as pd
import random
# random.seed(10000)
import numpy as np
import math

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
chickweight = pd.read_csv('chick_weight.csv')
chickweight = chickweight.drop(['Unnamed: 0'], axis=1)
chickweight_one_diet = chickweight[:][chickweight.Diet == 1]

In [4]:
def remove_missing_data_row(data):
    indices = data.index.values
    indices_to_remove = []
    for idx in indices:
        if data.loc[idx]["weight"] == 0:
            indices_to_remove.append(idx)
            
    data = data.drop(indices_to_remove)
    return data

In [5]:
def get_slope_from_data(df):
    X = np.array(df["Time"], dtype=float)
    Y = np.array(df["weight"], dtype=float)

    #least squares polynomial fit. Fit a polynomial of degree deg to points (x,y) np.polyfit(x, y, deg)
    z = np.polyfit(X, Y, 1)  
    
    #poly1d consructs the 1d polynomial from the given coefficients
    p = np.poly1d(z) 
    
    #linspace returns evenly spaced numbers over a specified interval np.linspace(start, stop, num_of_samples)
    xp = np.linspace(0, 22, 100)
    
    #Plotting
#     plt.plot(X,Y,'.',xp, p(xp),'-')
#     plt.show()
    return z[0]

def get_slope_from_x_y(X, Y):
    z = np.polyfit(X, Y, 1)
    p = np.poly1d(z)
    xp = np.linspace(0, 22, 100)
    return z[0]

def get_slope_diff_from_data(portion_1_data, portion_2_data):
    new_df_portion_1_values = portion_1_data.values
    new_df_portion_2_values = portion_2_data.values

    weight = 0
    time = 1
    
    X1 = np.array(new_df_portion_1_values[:,time], dtype=float)
    Y1 = np.array(new_df_portion_1_values[:,weight], dtype=float)
    X2 = np.array(new_df_portion_2_values[:,time], dtype=float)
    Y2 = np.array(new_df_portion_2_values[:,weight], dtype=float)
    
    return (get_slope_from_x_y(X1, Y1) - get_slope_from_x_y(X2, Y2))


In [6]:
def get_data_with_missing_values(data, portion_to_remove):
#     random.seed(10000)
    # Randomly removing data to create data containing missing values
    # portion_to_remove: fraction of data to be removed, eg. 0.1, 0.2, etc.
    chickweight_with_missing_values = data.copy()
    Y = chickweight_with_missing_values["weight"]
    for i in range(len(Y)):
        if random.random()<portion_to_remove:
            Y[i] = 0  ##Randomly making few weights 0 to mimic missing data
    return chickweight_with_missing_values

In [7]:
def find_median(data):
    data = [x for x in data if x != 0]
    data.sort()
    if(len(data) % 2 != 0):
        mididx = math.floor(len(data)/2)
        median = data[mididx]
    else:
        mididx1 = math.floor(len(data)/2)
        mididx2 = math.floor(len(data)/2)-1
        median = (data[mididx1] + data[mididx2]) /2
    return median

def find_mean(data):
    count = 0
    for ele in data:
        if ele==0:
            continue
        else:
            count += 1
    return sum(data)/count
  
def replace(data, replacement):
    data_array = np.array(data)
    for idx in range(len(data_array)):
        if data_array[idx] ==0:
            data_array[idx] = replacement
        else:
            continue
    return data_array

def replace_interpolation(data, replacement):
    r = 0
    data_array = np.array(data)
    for idx in range(len(data_array)):
        if data_array[idx] == 0:
            data_array[idx] = replacement[r]
            r += 1
        else:
            continue
    return data_array

In [8]:
def replace_missing_values(data_per_diet, replacement_parameter):
    new_df = pd.DataFrame(columns=["weight","Time","Chick","Diet"])
    for i in range(22):
            temp_time = data_per_diet.loc[data_per_diet['Time'] == i]
            count_row = temp_time.shape[0]
            if(count_row==0):
                continue
            else:
                if(replacement_parameter == "mean"):
                    replacement = find_mean(temp_time["weight"])
                elif(replacement_parameter == "median"):
                    replacement = find_median(temp_time["weight"])
                temp_time["weight"] = replace(temp_time["weight"], replacement)
                new_df = pd.concat([new_df, temp_time], ignore_index=True)
    return new_df

In [9]:
def replacement_by_interpolation(data_per_diet):
    new_df = pd.DataFrame(columns=["weight","Time","Chick","Diet"])
    chicks = data_per_diet.Chick.unique()
    for chicknum in chicks:
        temp_chick = data_per_diet.loc[data_per_diet.Chick == chicknum]
        #get the known records (where weights are not missing)
        known = temp_chick.loc[temp_chick.weight != 0]
        #get the unknown records (where chick weight is missing)
        unknown = temp_chick.loc[temp_chick.weight == 0]
        x_known = known['Time'].values.tolist()
        y_known = known['weight'].values.tolist()
        x_unknown = unknown['Time'].values.tolist()
        #use interpolation to find the y_unknowns
        y_unknown = np.interp(x_unknown, x_known, y_known)
        replacement = y_unknown
        temp_chick["weight"] = replace_interpolation(temp_chick["weight"], replacement)
        new_df = pd.concat([new_df, temp_chick], ignore_index=True)
    return new_df

In [10]:
def get_new_slope(data, replacement_parameter):
    if replacement_parameter == "remove_missing":
        temp_diet_new = remove_missing_data_row(data)
    elif replacement_parameter == "linear_interpolation":
        temp_diet_new = replacement_by_interpolation(data)
    else:
        temp_diet_new = replace_missing_values(data, replacement_parameter)
    slope = get_slope_from_data(temp_diet_new)
    return slope

In [11]:
def shuffle_missing_labels(data):
    new_df = pd.DataFrame(columns=["weight","Time","Chick","Diet", "missing"])
    for i in range(22):
        temp = data.loc[data['Time']==i]
        count_row = temp.shape[0]
        if(count_row==0):
            continue
        else:
            temp['missing'] = np.random.permutation(temp['missing'].values)
            new_df = pd.concat([new_df, temp], ignore_index=True)
    return new_df

In [12]:
def significance_test_missing_data_slope(portion, replacement_parameter):

    portion1 = 0.0
    portion2 = portion
    
    chickweight_with_missing_values = get_data_with_missing_values(chickweight_one_diet, portion_to_remove=portion)
    chickweight_with_missing_values["missing"] = [portion]*len(chickweight_with_missing_values.index)
    
    chickweight_original = chickweight_one_diet.copy()
    chickweight_original["missing"] = [0.0]*len(chickweight_with_missing_values.index)
    
    observed_slope_diff = get_new_slope(chickweight_with_missing_values, replacement_parameter) -  get_slope_from_data(chickweight_original)

    
    data = pd.concat([chickweight_original,chickweight_with_missing_values])
        
    count = 0
    num_shuffles = 100
        
    for i in range(num_shuffles):
#         if(i!=0 and i%20 ==0):
#             print("Done: ", i)
        new_df = shuffle_missing_labels(data)

        new_df_portion_1 = new_df[:][new_df.missing == portion1]
        new_df_portion_2 = new_df[:][new_df.missing == portion2]
        
        slope_diff = get_slope_diff_from_data(new_df_portion_1,new_df_portion_2)   
        
        if observed_slope_diff < 0 and slope_diff <= observed_slope_diff:
            count += 1
        elif observed_slope_diff >= 0 and slope_diff >= observed_slope_diff:
            count += 1
    ######################################
    #
    # Output
    #
    ######################################
    print("**********Removing ", portion*100, "% of data****************")

    print ("Observed difference of two slopes: %.2f" % observed_slope_diff)
    print (count, "out of", num_shuffles, "experiments had a difference of two slopes", end=" ")
    if observed_slope_diff < 0:
        print ("less than or equal to", end=" ")
    else:
        print ("greater than or equal to", end=" ")
    print ("%.2f" % observed_slope_diff, ".")
    print ("The chance of getting a difference of two slopes", end=" ")
    if observed_slope_diff < 0:
        print ("less than or equal to", end=" ")
    else:
        print ("greater than or equal to", end=" ")
    print ("%.2f" % observed_slope_diff, "is %.4f"%(count / float(num_shuffles)), "\n")

In [13]:
portion_to_remove = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
for portion in portion_to_remove:
    significance_test_missing_data_slope(portion, "remove_missing")

**********Removing  10.0 % of data****************
Observed difference of two slopes: -0.06
46 out of 100 experiments had a difference of two slopes less than or equal to -0.06 .
The chance of getting a difference of two slopes less than or equal to -0.06 is 0.4600 

**********Removing  20.0 % of data****************
Observed difference of two slopes: 0.26
29 out of 100 experiments had a difference of two slopes greater than or equal to 0.26 .
The chance of getting a difference of two slopes greater than or equal to 0.26 is 0.2900 

**********Removing  30.0 % of data****************
Observed difference of two slopes: -0.14
43 out of 100 experiments had a difference of two slopes less than or equal to -0.14 .
The chance of getting a difference of two slopes less than or equal to -0.14 is 0.4300 

**********Removing  40.0 % of data****************
Observed difference of two slopes: 0.05
52 out of 100 experiments had a difference of two slopes greater than or equal to 0.05 .
The chance of

In [14]:
portion_to_remove = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
for portion in portion_to_remove:
    significance_test_missing_data_slope(portion, "mean")

**********Removing  10.0 % of data****************
Observed difference of two slopes: 0.09
44 out of 100 experiments had a difference of two slopes greater than or equal to 0.09 .
The chance of getting a difference of two slopes greater than or equal to 0.09 is 0.4400 

**********Removing  20.0 % of data****************
Observed difference of two slopes: -0.06
49 out of 100 experiments had a difference of two slopes less than or equal to -0.06 .
The chance of getting a difference of two slopes less than or equal to -0.06 is 0.4900 

**********Removing  30.0 % of data****************
Observed difference of two slopes: -0.00
48 out of 100 experiments had a difference of two slopes less than or equal to -0.00 .
The chance of getting a difference of two slopes less than or equal to -0.00 is 0.4800 

**********Removing  40.0 % of data****************
Observed difference of two slopes: 0.08
47 out of 100 experiments had a difference of two slopes greater than or equal to 0.08 .
The chance of

In [15]:
portion_to_remove = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
for portion in portion_to_remove:
    significance_test_missing_data_slope(portion, "median")

**********Removing  10.0 % of data****************
Observed difference of two slopes: 0.03
52 out of 100 experiments had a difference of two slopes greater than or equal to 0.03 .
The chance of getting a difference of two slopes greater than or equal to 0.03 is 0.5200 

**********Removing  20.0 % of data****************
Observed difference of two slopes: -0.14
35 out of 100 experiments had a difference of two slopes less than or equal to -0.14 .
The chance of getting a difference of two slopes less than or equal to -0.14 is 0.3500 

**********Removing  30.0 % of data****************
Observed difference of two slopes: 0.14
40 out of 100 experiments had a difference of two slopes greater than or equal to 0.14 .
The chance of getting a difference of two slopes greater than or equal to 0.14 is 0.4000 

**********Removing  40.0 % of data****************
Observed difference of two slopes: 0.49
32 out of 100 experiments had a difference of two slopes greater than or equal to 0.49 .
The chance

### Linear Interpolation

In mathematics, linear interpolation is a method of curve fitting using linear polynomials to construct new data points within the range of a discrete set of known data points. In order to use Linear interpolation, we use the individual chick data and impute the missing values for that chick. We repeat this process for all the chicks to impute all the missing data.

In [17]:
portion_to_remove = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
for portion in portion_to_remove:
    significance_test_missing_data_slope(portion, "linear_interpolation")

**********Removing  10.0 % of data****************
Observed difference of two slopes: -0.02
49 out of 100 experiments had a difference of two slopes less than or equal to -0.02 .
The chance of getting a difference of two slopes less than or equal to -0.02 is 0.4900 

**********Removing  20.0 % of data****************
Observed difference of two slopes: -0.00
48 out of 100 experiments had a difference of two slopes less than or equal to -0.00 .
The chance of getting a difference of two slopes less than or equal to -0.00 is 0.4800 



ValueError: array of sample points is empty