#### Script to Predict the output on new observations

In [31]:
# Pinned package versions for this notebook. Uncomment to install them;
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install Flask==3.0.3
# %pip install matplotlib==3.9.2
# %pip install numpy==2.1.2
# %pip install pandas==2.2.3
# %pip install scikit_learn==1.5.2
# %pip install seaborn==0.13.2

In [32]:
import pandas as pd
import pickle
import numpy as np

# Session-wide pandas configuration: turn on the copy_on_write mode option
# before any DataFrame is created or modified in the cells below.
pd.options.mode.copy_on_write = True 

In [33]:
from zipfile import ZipFile
import urllib.request
from io import BytesIO

In [34]:
# Alternative source (disabled): stream the archive from S3 instead of using the local copy.
# folder = urllib.request.urlopen('https://s3.amazonaws.com/hackerday.datascience/50/dataset.zip')

# Read the whole archive into memory so it stays usable after the file is closed.
with open('data/dataset.zip', 'rb') as folder:
    archive_bytes = folder.read()

# NOTE: the name `zipfile` shadows the standard-library module of the same name;
# it is kept because a later cell reads the CSV through this variable.
zipfile = ZipFile(BytesIO(archive_bytes))

# List the archive members.
zipfile.namelist()

['test.csv',
 '__MACOSX/',
 '__MACOSX/._test.csv',
 'train.csv',
 '__MACOSX/._train.csv',
 'test_data_subset.csv',
 '__MACOSX/._test_data_subset.csv']

In [35]:
# Read the observations to score from the archive. The context manager closes
# the archive member handle once pandas has parsed it.
with zipfile.open("test_data_subset.csv") as csv_member:
    test_data = pd.read_csv(csv_member)

In [36]:
# Load the list of column names to drop (read from the `colnames` column of the CSV).
columns_to_drop = pd.read_csv("data/columns_to_drop.csv")
dropped_names = set(columns_to_drop.colnames.values)

# Keep every remaining column in its original CSV order. Iterating over a set
# difference would give an arbitrary order that can change between kernel
# sessions, making the intermediate frame non-reproducible.
columns_to_Retain = [col for col in test_data.columns if col not in dropped_names]
test_data = test_data[columns_to_Retain]

# Columns with dtype "object" are treated as categorical and encoded in the next cell.
column_datatypes = test_data.dtypes
categorical_columns = list(column_datatypes[column_datatypes == "object"].index.values)

In [37]:
# Transform the categorical columns by loading the encodings fitted on the training data.
# One pickled encoder per column is expected at "<column name>.sav" in the working directory.
# SECURITY NOTE: pickle.load can execute arbitrary code; only load .sav files from a trusted source.

for cf1 in categorical_columns:
    filename = cf1 + ".sav"
    # Use a context manager so the file handle is closed after unpickling.
    with open(filename, 'rb') as encoder_file:
        le = pickle.load(encoder_file)  # presumably a fitted sklearn LabelEncoder — confirm

    # Map every known class to its encoded value; a class that was not seen
    # when the encoder was fitted is encoded as -1.
    le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
    test_data[cf1] = test_data[cf1].map(lambda x: le_dict.get(x, -1))

In [38]:
# Set the record ids aside so they can be re-attached to the predictions later,
# then remove the id column from the frame.
test_data_id = test_data.loc[:, 'id']
test_data = test_data.drop(columns=['id'])

In [39]:
# Find the int64 columns (this includes the categorical columns encoded above).
Column_datatypes = test_data.dtypes
Integer_columns = list(test_data.select_dtypes(include="int64").columns)

# Convert the int64 columns to categorical. No `copy=False` is passed: with
# copy_on_write enabled the keyword is ignored, and pandas plans to remove it.
test_data[Integer_columns] = test_data[Integer_columns].astype('category')

In [40]:
# Load the saved model (prediction happens in a later cell).
# SECURITY NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
# The context manager closes the file handle after unpickling.
with open("trained_models/tunedmodel_rf", 'rb') as model_file:
    tunedmodel_rf = pickle.load(model_file)

In [41]:
# Feature names recorded on the fitted model; `array` is a copy of them that
# the next cell uses to put the test columns into the same order.
array = np.array(tunedmodel_rf.feature_names_in_)
expected_features = tunedmodel_rf.feature_names_in_
print("Expected features:", array)

Expected features: ['cat1' 'cat15' 'cat22' 'cat32' 'cat73' 'cat75' 'cat77' 'cat88' 'cat89'
 'cat90' 'cat93' 'cat94' 'cat95' 'cat96' 'cat99' 'cat100' 'cat101'
 'cat102' 'cat103' 'cat104' 'cat105' 'cat108' 'cat109' 'cat110' 'cat112'
 'cat113' 'cat115' 'cat116' 'cont1' 'cont2' 'cont3' 'cont4' 'cont5'
 'cont7' 'cont8' 'cont9' 'cont11' 'cont13' 'cont14']


In [42]:
# Fail loudly if the test data lacks a feature the model expects: reindex alone
# would silently create an all-NaN column for every missing name.
missing_features = [name for name in array if name not in test_data.columns]
if missing_features:
    raise KeyError(f"test data is missing features expected by the model: {missing_features}")

# Select the model's features, in the order the model expects them.
test_data = test_data.reindex(array, axis=1)
test_data.columns

Index(['cat1', 'cat15', 'cat22', 'cat32', 'cat73', 'cat75', 'cat77', 'cat88',
       'cat89', 'cat90', 'cat93', 'cat94', 'cat95', 'cat96', 'cat99', 'cat100',
       'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat108', 'cat109',
       'cat110', 'cat112', 'cat113', 'cat115', 'cat116', 'cont1', 'cont2',
       'cont3', 'cont4', 'cont5', 'cont7', 'cont8', 'cont9', 'cont11',
       'cont13', 'cont14'],
      dtype='object')

In [43]:
# Preview the prepared feature frame; head() keeps the output small for larger inputs.
test_data.head()

Unnamed: 0,cat1,cat15,cat22,cat32,cat73,cat75,cat77,cat88,cat89,cat90,...,cont2,cont3,cont4,cont5,cont7,cont8,cont9,cont11,cont13,cont14
0,0,0,0,0,0,0,3,0,0,0,...,0.299102,0.246911,0.402922,0.281143,0.317681,0.61229,0.34365,0.377724,0.704052,0.392562
1,0,0,0,0,0,1,3,0,0,0,...,0.620805,0.65431,0.946616,0.836443,0.44376,0.7133,0.5189,0.689039,0.453468,0.208045
2,0,0,0,0,0,0,3,0,1,0,...,0.737068,0.711159,0.412789,0.718531,0.325779,0.29758,0.34365,0.24541,0.258586,0.297232
3,0,0,0,0,0,0,3,0,0,0,...,0.681761,0.592681,0.354893,0.397069,0.342355,0.40028,0.33237,0.348867,0.592264,0.555955
4,1,0,0,0,0,0,3,0,0,0,...,0.299102,0.26357,0.696873,0.302678,0.391833,0.23688,0.43731,0.359572,0.301535,0.825823


In [29]:
# Score the prepared feature frame with the loaded model.
# NOTE(review): the displayed predictions lie roughly between 7 and 9 — presumably the
# target was log-transformed during training; confirm whether an inverse transform is needed.
Y_test_predict = tunedmodel_rf.predict(test_data)

# Attach the predictions and the ids set aside earlier, then keep only those two columns.
test_data = test_data.assign(predict_loss=Y_test_predict, id=test_data_id)[['id', 'predict_loss']]

In [30]:
# Show the final result: one row per observation with its id and predict_loss.
test_data

Unnamed: 0,id,predict_loss
0,4,7.791687
1,6,7.679563
2,9,9.01509
3,12,8.160236
4,15,7.049665
