In [None]:
#Import Libraries
import pandas as pd
import numpy as np
import math
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [None]:
# Load the input tables
# `data` is the table that every imputation strategy below starts from;
# `samp_sol` is the submission template whose `id` column selects the rows to report.
data = pd.read_csv(filepath_or_buffer="data.csv")
samp_sol = pd.read_csv(filepath_or_buffer="sample_submission.csv")

In [None]:
#Create imputation function
def impute_values(dataframe):
    """Fill missing 'x_e_out [-]' values with linear-regression predictions.

    A ``LinearRegression`` model is fitted on the rows where 'x_e_out [-]' is
    present, using the six feature columns listed below as predictors. Its
    predictions then replace the missing target values. The R**2 score, the
    fitted equation and the RMSE on the fitted rows are printed.

    Side effects
    ------------
    * ``dataframe`` is modified in place; the same object is returned.
    * 'complete_data.csv' and 'missing_data.csv' are (over)written in the
      current working directory on every call.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the six feature columns and 'x_e_out [-]'. This function
        does not impute the feature columns; the callers in this notebook
        fill them before calling.

    Returns
    -------
    tuple
        ``(dataframe, RMSE)`` - the updated frame and the root-mean-square
        error of the model on the rows it was fitted on (a Python float).
    """
    target = 'x_e_out [-]'
    features = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']

    # Split the dataframe into complete and missing subsets
    is_missing = dataframe[target].isnull()
    complete_data = dataframe[~is_missing]
    complete_data.to_csv('complete_data.csv')
    missing_data = dataframe[is_missing]
    missing_data.to_csv('missing_data.csv')

    # Create and fit the regression model on the rows with a known target
    reg_model = LinearRegression()
    X = complete_data[features]
    y = complete_data[target]
    reg_model.fit(X, y)

    # Provide R**2 score
    print('R**2:' , reg_model.score(X, y))

    # Replace missing values with predicted values.
    # Guarded: predict() rejects an input with zero rows, so skip the step
    # when there is nothing to fill.
    if not missing_data.empty:
        dataframe.loc[is_missing, target] = reg_model.predict(missing_data[features])

    # Report the regression equation
    intercept = reg_model.intercept_
    coeff = reg_model.coef_
    print('regression equation:', intercept , '+' , coeff[0] , 'P +' , coeff[1] , 'j +' , coeff[2] , 'D_e +' , coeff[3] , 'D_h +' , coeff[4] , 'L +' , coeff[5] , 'CHF')

    # RMSE on the fitted rows, using the model's own predictions instead of
    # re-deriving them from the coefficients by hand
    residuals = y - reg_model.predict(X)
    RMSE = math.sqrt((residuals ** 2).mean())
    print('RMSE:' , RMSE)

    # Return the updated dataframe and the RMSE
    return dataframe , RMSE

In [None]:
#Max
#Fill missing feature values with the maximum of each column
feature_cols = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
data_max = data.copy()
for col in feature_cols:
    data_max[col] = data_max[col].fillna(data_max[col].max())
data_max.to_csv('data_max.csv')

#Solve for x_e_out
# impute_values returns (dataframe, RMSE); unpack it so data_max stays a
# DataFrame - passing the raw tuple to pd.merge raises a TypeError.
data_max, rmse_max = impute_values(data_max)

#Put solution in new df
max_sol = samp_sol.copy()
# Left merge keeps one row per submission id, in submission order
filtered_data_max = pd.merge(max_sol['id'], data_max, on='id', how='left')
filtered_data_max = filtered_data_max.dropna(subset=['x_e_out [-]'])
max_sol['x_e_out [-]'] = filtered_data_max['x_e_out [-]']
# NOTE(review): to_csv also writes the row index as an unnamed first column - confirm the consumer expects that
max_sol.to_csv('max_sol.csv')

In [None]:
#Min
#Fill missing feature values with the minimum of each column
feature_cols = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
data_min = data.copy()
for col in feature_cols:
    data_min[col] = data_min[col].fillna(data_min[col].min())

#Solve for x_e_out
# impute_values returns (dataframe, RMSE); unpack it so data_min stays a
# DataFrame - passing the raw tuple to pd.merge raises a TypeError.
data_min, rmse_min = impute_values(data_min)

#Put solutions in new df
min_sol = samp_sol.copy()
# Left merge keeps one row per submission id, in submission order
filtered_data_min = pd.merge(min_sol['id'], data_min, on='id', how='left')
filtered_data_min = filtered_data_min.dropna(subset=['x_e_out [-]'])
min_sol['x_e_out [-]'] = filtered_data_min['x_e_out [-]']
# NOTE(review): to_csv also writes the row index as an unnamed first column - confirm the consumer expects that
min_sol.to_csv('min_sol.csv')

In [None]:
#Mean
#Fill missing feature values with the mean of each column
feature_cols = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
data_mean = data.copy()
for col in feature_cols:
    data_mean[col] = data_mean[col].fillna(data_mean[col].mean())

#Solve for x_e_out
# impute_values returns (dataframe, RMSE); unpack it so data_mean stays a
# DataFrame - passing the raw tuple to pd.merge raises a TypeError.
data_mean, rmse_mean = impute_values(data_mean)

#Add solutions to new df
mean_sol = samp_sol.copy()
# Left merge keeps one row per submission id, in submission order
filtered_data_mean = pd.merge(mean_sol['id'], data_mean, on='id', how='left')
filtered_data_mean = filtered_data_mean.dropna(subset=['x_e_out [-]'])
mean_sol['x_e_out [-]'] = filtered_data_mean['x_e_out [-]']
# NOTE(review): to_csv also writes the row index as an unnamed first column - confirm the consumer expects that
mean_sol.to_csv('mean_sol.csv')

In [None]:
#Median
#Fill missing feature values with the median of each column
feature_cols = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
data_median = data.copy()
for col in feature_cols:
    data_median[col] = data_median[col].fillna(data_median[col].median())

#Solve for x_e_out
# impute_values returns (dataframe, RMSE); unpack it so data_median stays a
# DataFrame - passing the raw tuple to pd.merge raises a TypeError.
data_median, rmse_median = impute_values(data_median)

#Add solutions to new df
median_sol = samp_sol.copy()
# how='left' (as in the other strategy cells) keeps one row per submission id.
# The default inner join drops unmatched ids and renumbers the result, which
# would shift predictions onto the wrong rows in the index-aligned assignment below.
filtered_data_median = pd.merge(median_sol['id'], data_median, on='id', how='left')
filtered_data_median = filtered_data_median.dropna(subset=['x_e_out [-]'])
median_sol['x_e_out [-]'] = filtered_data_median['x_e_out [-]']
# NOTE(review): to_csv also writes the row index as an unnamed first column - confirm the consumer expects that
median_sol.to_csv('median_sol.csv')

In [None]:
#Mode
#Fill missing feature values with the mode of each column
feature_cols = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
data_mode = data.copy()
for col in feature_cols:
    # mode() can return several values; [0] takes the first one
    data_mode[col] = data_mode[col].fillna(data_mode[col].mode()[0])

#Solve for x_e_out
# impute_values returns (dataframe, RMSE); unpack it so data_mode stays a
# DataFrame - passing the raw tuple to pd.merge raises a TypeError.
data_mode, rmse_mode = impute_values(data_mode)

#Add solutions to new df
mode_sol = samp_sol.copy()
# Left merge keeps one row per submission id, in submission order
filtered_data_mode = pd.merge(mode_sol['id'], data_mode, on='id', how='left')
filtered_data_mode = filtered_data_mode.dropna(subset=['x_e_out [-]'])
mode_sol['x_e_out [-]'] = filtered_data_mode['x_e_out [-]']
# NOTE(review): to_csv also writes the row index as an unnamed first column - confirm the consumer expects that
mode_sol.to_csv('mode_sol.csv')

In [None]:
#Fixed Value (0)
#FV_0
#Fill missing feature values with the constant 0
feature_cols = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
data_FV_0 = data.copy()
for col in feature_cols:
    data_FV_0[col] = data_FV_0[col].fillna(0)

#Solve for x_e_out
# impute_values returns (dataframe, RMSE); unpack it so data_FV_0 stays a
# DataFrame - passing the raw tuple to pd.merge raises a TypeError.
data_FV_0, rmse_FV_0 = impute_values(data_FV_0)

#Put solution in new df
FV_0_sol = samp_sol.copy()
# Left merge keeps one row per submission id, in submission order
filtered_data_FV_0 = pd.merge(FV_0_sol['id'], data_FV_0, on='id', how='left')
filtered_data_FV_0 = filtered_data_FV_0.dropna(subset=['x_e_out [-]'])
FV_0_sol['x_e_out [-]'] = filtered_data_FV_0['x_e_out [-]']
# NOTE(review): to_csv also writes the row index as an unnamed first column - confirm the consumer expects that
FV_0_sol.to_csv('FV_0_sol.csv')

In [None]:
#Fixed Value (1)
#FV_1
#Fill missing feature values with the constant 1
feature_cols = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
data_FV_1 = data.copy()
for col in feature_cols:
    data_FV_1[col] = data_FV_1[col].fillna(1)

#Solve for x_e_out
# impute_values returns (dataframe, RMSE); unpack it so data_FV_1 stays a
# DataFrame - passing the raw tuple to pd.merge raises a TypeError.
data_FV_1, rmse_FV_1 = impute_values(data_FV_1)

#Put solution in new df
FV_1_sol = samp_sol.copy()
# Left merge keeps one row per submission id, in submission order
filtered_data_FV_1 = pd.merge(FV_1_sol['id'], data_FV_1, on='id', how='left')
filtered_data_FV_1 = filtered_data_FV_1.dropna(subset=['x_e_out [-]'])
FV_1_sol['x_e_out [-]'] = filtered_data_FV_1['x_e_out [-]']
# NOTE(review): to_csv also writes the row index as an unnamed first column - confirm the consumer expects that
FV_1_sol.to_csv('FV_1_sol.csv')

In [None]:
#Fixed Value (-1)
#FV_-1
#Fill missing feature values with the constant -1
feature_cols = ['pressure [MPa]', 'mass_flux [kg/m2-s]', 'D_e [mm]', 'D_h [mm]', 'length [mm]', 'chf_exp [MW/m2]']
data_FV_neg = data.copy()
for col in feature_cols:
    data_FV_neg[col] = data_FV_neg[col].fillna(-1)

#Solve for x_e_out
# impute_values returns (dataframe, RMSE); unpack it so data_FV_neg stays a
# DataFrame - passing the raw tuple to pd.merge raises a TypeError.
data_FV_neg, rmse_FV_neg = impute_values(data_FV_neg)

#Put solution in new df
FV_neg_sol = samp_sol.copy()
# Left merge keeps one row per submission id, in submission order
filtered_data_FV_neg = pd.merge(FV_neg_sol['id'], data_FV_neg, on='id', how='left')
filtered_data_FV_neg = filtered_data_FV_neg.dropna(subset=['x_e_out [-]'])
FV_neg_sol['x_e_out [-]'] = filtered_data_FV_neg['x_e_out [-]']
# NOTE(review): to_csv also writes the row index as an unnamed first column - confirm the consumer expects that
FV_neg_sol.to_csv('FV_neg_sol.csv')