In [26]:
import pandas as pd
import pickle
import numpy as np
import xgboost as xgb
from scipy import stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [43]:
# Folder that holds the competition CSV files (absolute path on the author's machine).
dataset = 'C:/Users/Pranay/Desktop/Regression with a Crab Age Dataset/Dataset/'

# Read the test split; the `id` column becomes the row index so it can be
# carried through to the submission file.
test_csv_path = dataset + 'test.csv'
test = pd.read_csv(test_csv_path, index_col='id')

In [44]:
# Sanity check: display the (rows, columns) dimensions of the loaded test frame.
test.shape

(49368, 8)

In [45]:
# Box-Cox (applied to Height in a later cell) needs strictly positive input,
# so swap exact-zero heights for a small positive placeholder.
height = test['Height']
test['Height'] = height.mask(height == 0, 0.01)

In [46]:
def boxcox_clip(values, p):
    """Box-Cox transform ``values`` and clip the result to its own percentiles.

    Parameters
    ----------
    values : array-like
        Strictly positive data; ``scipy.stats.boxcox`` raises ``ValueError``
        if any value is zero or negative.
    p : float
        Lower percentile on ``np.percentile``'s 0-100 scale. The upper bound
        is the ``100 - p`` percentile, so ``p = 0.8`` trims 0.8% from each
        tail (not 80%).

    Returns
    -------
    numpy.ndarray
        Transformed values, clipped to ``[percentile(p), percentile(100 - p)]``.
    """
    # The fitted lambda is discarded: with no ``lmbda`` argument, boxcox
    # estimates it from the data it is given.
    transformed, _fitted_lambda = stats.boxcox(values)
    lower_threshold = np.percentile(transformed, p)
    upper_threshold = np.percentile(transformed, 100 - p)
    return np.clip(transformed, lower_threshold, upper_threshold)


# Raw column -> clipping percentile. Dict order sets the order in which the
# new ``*_transform`` columns are appended to ``test``.
CLIP_PERCENTILES = {
    'Length': 0.85,
    'Diameter': 0.8,
    'Height': 0.8,
    'Weight': 0.8,
    'Shucked Weight': 0.8,
    'Viscera Weight': 0.8,
    'Shell Weight': 0.8,
}

# NOTE(review): both the Box-Cox lambda and the clipping thresholds are
# estimated from the test set here. If training fitted its own values, the
# features will not be on the same scale as the ones the model saw -- confirm,
# and reuse the training-time parameters if they were saved.
for source_column, p in CLIP_PERCENTILES.items():
    # 'Shucked Weight' -> 'Shucked_Weight_transform', 'Length' -> 'Length_transform', ...
    target_column = source_column.replace(' ', '_') + '_transform'
    test[target_column] = boxcox_clip(test[source_column], p)

In [47]:
# Short aliases for the Box-Cox-transformed, clipped measurements.
length_t = test['Length_transform']
diameter_t = test['Diameter_transform']
height_t = test['Height_transform']
weight_t = test['Weight_transform']
shucked_t = test['Shucked_Weight_transform']
viscera_t = test['Viscera_Weight_transform']
shell_t = test['Shell_Weight_transform']

# Cylinder-style volume (pi approximated as 3.14) and shape/size ratios.
radius_t = diameter_t / 2
test['Volume'] = 3.14 * radius_t * radius_t * height_t
test['Density'] = weight_t / test['Volume']
test['Aspect_ratio'] = length_t / diameter_t
test['BMI'] = weight_t / (height_t * height_t)

# Each component weight as a ratio of the whole-body weight.
test['Shucked_by_wt'] = shucked_t / weight_t
test['Viscera_by_wt'] = viscera_t / weight_t
test['Shell_by_wt'] = shell_t / weight_t

# Pairwise interaction terms with length.
test['Length_into_Height'] = length_t * height_t
test['Length_into_Diameter'] = length_t * diameter_t
test['Length_into_Weight'] = length_t * weight_t

# Euclidean (L2) norm of the three component weights.
sum_of_squares = shucked_t**2 + viscera_t**2 + shell_t**2
test['L2_weights'] = np.sqrt(sum_of_squares)

In [48]:
# Rename column with space
# Replace the spaces in the raw weight column names with underscores.
space_free_names = {
    'Shucked Weight': 'Shucked_Weight',
    'Viscera Weight': 'Viscera_Weight',
    'Shell Weight': 'Shell_Weight',
}
test = test.rename(columns=space_free_names)

In [49]:
# One-hot encode `Sex`, then drop the `Sex_M` indicator so the remaining
# dummy columns are not redundant with it.
X_test = pd.get_dummies(test, columns=['Sex']).drop(columns=['Sex_M'])

In [50]:
# Load the pickled list of column names and restrict/reorder X_test to it.
# (presumably the feature order saved at training time -- verify against the
# training notebook). Only unpickle files you trust: pickle can run arbitrary code.
columns_path = 'C:/Users/Pranay/Desktop/Regression with a Crab Age Dataset/mk/test_columns.pkl'
with open(columns_path, 'rb') as columns_file:
    test_columns = pickle.load(columns_file)

# Raises KeyError if any expected column is missing from X_test.
X_test = X_test[test_columns]

In [51]:
# Create an instance of the StandardScaler
scaler = MinMaxScaler()

# Fit and transform the data
X_transformed = scaler.fit_transform(X_test)
# Convert the transformed array back to a DataFrame
X_transformed_df = pd.DataFrame(X_transformed, columns=X_test.columns)

In [52]:
# Assuming you have trained and obtained your model object called 'model'
with open('C:/Users/Pranay/Desktop/Regression with a Crab Age Dataset/models/model_xgboost.pkl', 'rb') as file:
    model = pickle.load(file)
    
# Make predictions on the testing data
y_pred = model.predict(X_transformed_df)

In [53]:
# Pair each prediction with its row id, then move the id out of the index into
# a regular column. The rename only has an effect when the index is unnamed
# (reset_index then calls the new column 'index').
prediction = (
    pd.DataFrame(data=y_pred, index=X_test.index, columns=['Age'])
    .reset_index()
    .rename(columns={'index': 'id'})
)

In [54]:
# Save the predictions as CSV; index=False omits the positional row index.
output_path = 'C:/Users/Pranay/Desktop/Regression with a Crab Age Dataset/output/prediction_004.csv'
prediction.to_csv(output_path, index=False)

In [55]:
# Display the final prediction frame (columns `id` and `Age`) for a visual check.
prediction

Unnamed: 0,id,Age
0,74051,7.805009
1,74052,8.056784
2,74053,9.418727
3,74054,9.706512
4,74055,7.725410
...,...,...
49363,123414,8.742044
49364,123415,7.718430
49365,123416,10.033973
49366,123417,9.560497
