In [1]:
from sklearn.linear_model import LassoCV,Lasso,RidgeCV,Ridge,ElasticNetCV,ElasticNet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def mod_ada_lasso(X_train,y_train,X_test):
    
    # apply ridge regression to determine the penalty weights
    rcv = RidgeCV()
    rcv.fit(X_train,y_train)
    ridge_coefs = rcv.coef_
    
    # apply lasso with penalization
    enet = ElasticNet(l1_ratio=1/abs(min(ridge_coefs)))
    enet.fit(X_train,y_train)
    impute_values = enet.predict(X_test)

    return impute_values

In [3]:
def mod_elastic_net(X_train,y_train,X_test):
    enet = ElasticNet(alpha=0.5)
    enet.fit(X_train,y_train)
    
    impute_values = enet.predict(X_test)
    
    return impute_values

In [4]:
def mod_scad(X_train,y_train,X_test):
    enet = ElasticNet(l1_ratio=3.7) # SCAD penalty default value
    enet.fit(X_train,y_train)
    impute_values = enet.predict(X_test)
    
    return impute_values

In [5]:
X = pd.read_excel('./clean_pls_predictors.xlsx')

In [6]:
X.head(2)

Unnamed: 0,glucan,xylan,lignin,ash,Volatiles db,Ash db,Carbon db,Nitrogen db,ADSCI-growing season,ADSCI-30 days prior to harvest,...,50-100 Site SOM,50-100 Site Soil total organic carbon,50-100 Site Total Soil nitrogen,50-100 Site Extractable P mg kg-1,50-100 Site Extractable K mg kg-1,50-100 Site Extractable Ca mg kg-1,50-100 Site Extractable Mg mg kg-1,50-100 Site Extractable S mg kg-1,50-100 Site BD g cm-3,Nitrogen Trt Categorical
0,36.1935,21.764114,19.238295,6.586029,81.223333,5.776667,47.813333,0.393333,0.0,0.0,...,1.9,1.11,0.11,47.1,144,1613,154,17.1,1.69,1
1,40.410339,22.226731,21.094115,3.712055,82.44,3.943333,48.763333,0.156667,0.0,0.0,...,1.9,1.08,0.11,39.5,130,1686,177,17.4,1.66,1


In [7]:
temp_X = np.array(X)

In [8]:
NAs_found = np.where(np.isnan(temp_X[:,4])==True)[0].tolist()

In [9]:
np.delete(temp_X[:,4],NAs_found).mean()

83.26533959305165

In [10]:
temp_missing_data = {}

In [11]:
for col in range(temp_X.shape[1]):
    NAs_found = np.where(np.isnan(temp_X[:,col])==True)[0].tolist()
    if len(NAs_found) != 0:
        current_mean = np.delete(temp_X[:,col],NAs_found).mean()
        temp_list = {i:current_mean for i in NAs_found}
        temp_missing_data.update({str(col):temp_list})

In [16]:
class mouse:
    
    def __init__(self,X,prediciton_algorithm,max_iterations,stopping_criterion):
        self.X = X
        self.prediciton_algorithm = prediciton_algorithm
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.X_mean = None
        self.X_sd = None
        self.y_mean = None
        self.y_sd = None
        self.current_column_name = None
        self.current_column_index = None
        self.max_iterations=max_iterations
        self.stopping_criterion = stopping_criterion
        self.iterations_performed = None,
        
        temp_X = np.array(self.X)
#         print(self.X.shape)
        self.n = self.X.shape[0]
        self.p = self.X.shape[1]
        
        temp_missing_data = {}
    
        for col in range(temp_X.shape[1]):
            NAs_found = np.where(np.isnan(temp_X[:,col])==True)[0].tolist()
            if len(NAs_found) != 0:
                current_mean = np.delete(temp_X[:,col],NAs_found).mean()
                temp_list = {i:current_mean for i in NAs_found}
                temp_X[NAs_found,col] = current_mean
                temp_missing_data.update({str(col):temp_list})
                
        self.X = temp_X
        self.q = len(temp_missing_data)
        self.missing_data = temp_missing_data
#         print(self.missing_data)
        self.X_old = temp_X*100
        
        
# Set_column_response function used for checking datatype and assing the value based on the datatype
        
    def set_column_response(self,input_index):
        if type(input_index) == int:
            self.current_column_name = str(input_index) 
            self.current_column_index = input_index
#             print(self.current_column_name)
            
        else:
            self.current_column_name = input_index
            self.current_column_index = int(input_index)
#             print(self.current_column_name)
        
            
    
# Dividing the data into training,test and finding mean and standard deviation
    def set_data_split(self):
        
#         print(self.missing_data[self.current_column_name].keys())
        
        row_missing_index = self.missing_data[self.current_column_name].keys()
                
        ### X train ###
        temp_X_train = np.delete(self.X,self.current_column_index,axis=1)
        temp_X_train = np.delete(temp_X_train,list(row_missing_index),axis=0)

        ### X test ###
        temp_X_test = np.delete(self.X,self.current_column_index,axis=1)
        temp_X_test = temp_X_test[list(row_missing_index),:]
        
        ### Y_train ###
        temp_Y_train = np.delete(self.X,list(row_missing_index),axis=0)
        temp_Y_train = temp_Y_train[:,self.current_column_index]

        ### Y_test ###
        temp_Y_test = self.X[list(row_missing_index),self.current_column_index]
        
        # center the data to the training set
        # this will change with each iteration of imputation
        self.X_mean = temp_X_train.mean()
        self.X_sd = temp_X_train.std()

        self.y_mean = temp_Y_train.mean()
        self.y_sd = temp_Y_train.std()

        self.y_train = (temp_Y_train - self.y_mean) / self.y_sd
        self.y_test = (temp_Y_test - self.y_mean) / self.y_sd

        temp_X_train = temp_X_train - self.X_mean
        tem_X_test = temp_X_test - self.X_mean
        
        self.X_train = temp_X_train/self.X_sd
        self.X_test = temp_X_test/self.X_sd
        
    
    def update_X(self):
        
        row_missing_index =  self.missing_data[self.current_column_name].keys()
        
        for i in row_missing_index:
            
            self.missing_data[self.current_column_name][i] = self.y_test*self.y_sd*self.y_mean
#         print(self.missing_data)
        
#         print(self.missing_data[self.current_column_name])
        self.X[list(self.missing_data[self.current_column_name].keys()),self.current_column_index] = self.y_test
      
    
# Update_X function updates the X matrix missing data wth y_pred.
    
    def run(self):
        
        for iters in range(self.max_iterations):
            self.iterations_performed = iters
            # 2. Regressing the missing values
            # random starting point for the missing data column
            
            for k in self.missing_data.keys():
                
                self.set_column_response(k)
                self.set_data_split()
                self.y_test = self.prediciton_algorithm(self.X_train,self.y_train,self.X_test)
                self.update_X()
                
            if np.linalg.norm(self.X - self.X_old) < self.stopping_criterion:
                
                break
                
            else:
                self.X_old = self.X
            
        return self.missing_data

In [17]:
mouse_class = mouse(X,prediciton_algorithm=mod_ada_lasso,
                    max_iterations=30,
                    stopping_criterion=1e-5)
mouse_class.run()

{'4': {195: array([73.14021642, 73.14021642, 73.14021642, 73.14021642, 73.14021642,
         73.14021642, 73.14021642]),
  199: array([73.14021642, 73.14021642, 73.14021642, 73.14021642, 73.14021642,
         73.14021642, 73.14021642]),
  200: array([73.14021642, 73.14021642, 73.14021642, 73.14021642, 73.14021642,
         73.14021642, 73.14021642]),
  206: array([73.14021642, 73.14021642, 73.14021642, 73.14021642, 73.14021642,
         73.14021642, 73.14021642]),
  212: array([73.14021642, 73.14021642, 73.14021642, 73.14021642, 73.14021642,
         73.14021642, 73.14021642]),
  213: array([73.14021642, 73.14021642, 73.14021642, 73.14021642, 73.14021642,
         73.14021642, 73.14021642]),
  214: array([73.14021642, 73.14021642, 73.14021642, 73.14021642, 73.14021642,
         73.14021642, 73.14021642])},
 '5': {195: array([-2.10500757, -2.10500757, -2.10500757, -2.20634006, -2.31917545,
         -2.31917545, -2.31917545]),
  199: array([-2.10500757, -2.10500757, -2.10500757, -2.20634

In [87]:
class mice:
    
    def __init__(self,X,prediction_algorithm):
        
        self.X = X
        self.prediction_algorithm = prediction_algorithm
        self.num_replicates = 10
        self.max_iterations = 30
        self.stopping_criterion = 1e-5
        self.list_of_replicates = {}
        self.imputation_results = {}
        self.imputation_metrics = {}
        
    def run(self):
        mouse_class = mouse(X=self.X,prediciton_algorithm=self.prediction_algorithm,
                            max_iterations=self.max_iterations,
                            stopping_criterion=self.stopping_criterion)
        for m in range(self.num_replicates):
            self.list_of_replicates.update({m:mouse_class.run()})
            
#         print(self.list_of_replicates)
    
    # Double check with this code with Ross
    
    def collect(self):
        
        for i in range(self.num_replicates):
            for j in self.list_of_replicates[i].keys():
#                 print(j)
                if len(self.imputation_results) == 0:
                    
                    temp_index = list(self.list_of_replicates[i][j].keys())
                    temp_matrix = np.repeat(0,len(temp_index)*self.num_replicates).reshape(self.num_replicates,len(temp_index))
                    temp_matrix = pd.DataFrame(temp_matrix,columns=temp_index)
                    self.imputation_results.update({j:temp_matrix})
                    
                temp_value = self.list_of_replicates[i][j].values()
                self.imputation_results.update({j:{i:temp_value}})
                
#         print(self.imputation_results)
                
    def set_summary_stats(self):
        
        if len(self.imputation_results) == 0:
            self.collect()
            
        for j in self.imputation_results:
#             print(list(list(self.imputation_results[j].values())[0])[0])
            temp_result = {}
            num_samples = len(self.imputation_results[j])
            temp_result['mean'] = np.mean(list(list(self.imputation_results[j].values())[0])[0])
            temp_result['sd'] = np.std(list(list(self.imputation_results[j].values())[0])[0])
            
            standard_error = 1.96*temp_result['sd']/np.sqrt(num_samples)
            temp_result['upper_ci'] = temp_result['mean'] + standard_error
            temp_result['lower_ci'] = temp_result['mean'] - standard_error
            
            self.imputation_metrics.update({j:temp_result})
            
        print(self.imputation_metrics)
                     

In [22]:
X = pd.read_excel('./clean_pls_predictors.xlsx')

In [88]:
mice_class = mice(X,prediction_algorithm=mod_ada_lasso)
mice_class.run()
mice_class.set_summary_stats()

{'4': {'mean': 73.14021642320411, 'sd': 0.0, 'upper_ci': 73.14021642320411, 'lower_ci': 73.14021642320411}, '5': {'mean': -2.2112698761697493, 'sd': 0.09916089018128679, 'upper_ci': -2.016914531414427, 'lower_ci': -2.4056252209250717}, '6': {'mean': -5.531880862498007, 'sd': 1.245096138082086, 'upper_ci': -3.091492431857119, 'lower_ci': -7.972269293138895}, '7': {'mean': -0.022869689588973525, 'sd': 3.469446951953614e-18, 'upper_ci': -0.022869689588973518, 'lower_ci': -0.022869689588973532}}


In [43]:
dict1 = {}

In [44]:
len(dict1)

0