In [1]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the train and test CSVs, using the first column of each file as the row index
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("/kaggle/input/tfugkol/train.csv", "/kaggle/input/tfugkol/test.csv")
)

# Show the first rows of the training frame as the cell output
train.head()

Unnamed: 0_level_0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.877618,0.724417,0.895799,0.42165,0.281421,0.124454,0.923191,0.719903,0.701915,0.802461,...,A,B,A,A,I,E,D,A,B,6.994023
2,0.326679,0.613252,0.593413,0.34623,0.282354,0.357438,0.437627,0.808464,0.741289,0.546056,...,A,B,A,B,F,E,B,A,A,8.071256
3,0.869133,0.264104,0.86562,0.369602,0.293756,0.454644,0.732209,0.828352,0.695561,0.825251,...,A,B,A,A,N,B,D,C,A,5.760456
4,0.809799,0.494269,0.868099,0.57893,0.769785,0.153735,0.705142,0.614766,0.698125,0.794402,...,A,B,A,A,K,E,D,C,A,7.806457
6,0.343457,0.724447,0.440967,0.70594,0.279105,0.496212,0.486063,0.297743,0.683073,0.462146,...,A,B,A,A,F,E,B,A,B,6.868974


In [3]:
# Split the training frame: every column except 'target' is a feature,
# and the 'target' column is what the model will learn to predict
features = train.drop(columns=['target'])
y = train['target']

# Show the first rows of the feature frame as the cell output
features.head()

Unnamed: 0_level_0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.877618,0.724417,0.895799,0.42165,0.281421,0.124454,0.923191,0.719903,0.701915,0.802461,...,C,A,B,A,A,I,E,D,A,B
2,0.326679,0.613252,0.593413,0.34623,0.282354,0.357438,0.437627,0.808464,0.741289,0.546056,...,A,A,B,A,B,F,E,B,A,A
3,0.869133,0.264104,0.86562,0.369602,0.293756,0.454644,0.732209,0.828352,0.695561,0.825251,...,C,A,B,A,A,N,B,D,C,A
4,0.809799,0.494269,0.868099,0.57893,0.769785,0.153735,0.705142,0.614766,0.698125,0.794402,...,G,A,B,A,A,K,E,D,C,A
6,0.343457,0.724447,0.440967,0.70594,0.279105,0.496212,0.486063,0.297743,0.683073,0.462146,...,C,A,B,A,A,F,E,B,A,B


In [4]:
# Categorical columns are the ones whose name contains 'cat'
object_cols = list(features.filter(like='cat').columns)

# Learn the category -> integer mapping from the training features only
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(features[object_cols])

# Work on copies so the original `features` and `test` frames stay untouched,
# then overwrite just the categorical columns with their encoded values
X = features.copy()
X[object_cols] = ordinal_encoder.transform(features[object_cols])

# The test set reuses the mapping fitted on the training data
X_test = test.copy()
X_test[object_cols] = ordinal_encoder.transform(test[object_cols])

# Show the first rows of the encoded training features as the cell output
X.head()

Unnamed: 0_level_0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.877618,0.724417,0.895799,0.42165,0.281421,0.124454,0.923191,0.719903,0.701915,0.802461,...,2.0,0.0,1.0,0.0,0.0,8.0,4.0,3.0,0.0,1.0
2,0.326679,0.613252,0.593413,0.34623,0.282354,0.357438,0.437627,0.808464,0.741289,0.546056,...,0.0,0.0,1.0,0.0,1.0,5.0,4.0,1.0,0.0,0.0
3,0.869133,0.264104,0.86562,0.369602,0.293756,0.454644,0.732209,0.828352,0.695561,0.825251,...,2.0,0.0,1.0,0.0,0.0,13.0,1.0,3.0,2.0,0.0
4,0.809799,0.494269,0.868099,0.57893,0.769785,0.153735,0.705142,0.614766,0.698125,0.794402,...,6.0,0.0,1.0,0.0,0.0,10.0,4.0,3.0,2.0,0.0
6,0.343457,0.724447,0.440967,0.70594,0.279105,0.496212,0.486063,0.297743,0.683073,0.462146,...,2.0,0.0,1.0,0.0,0.0,5.0,4.0,1.0,0.0,1.0


In [5]:
# Hold out part of the encoded training data for validation; the split sizes are
# train_test_split's defaults and random_state pins the shuffle for reproducibility
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=0,
)

In [6]:
# Define the model (fixed random_state so repeated runs build the same forest)
model = RandomForestRegressor(random_state=1)

# Train the model (will take about 10 minutes to run)
model.fit(X_train, y_train)

# Score the fitted model on the held-out validation rows.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# is deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form works
# on every release and yields the same number for a single-output target.
preds_valid = model.predict(X_valid)
print(np.sqrt(mean_squared_error(y_valid, preds_valid)))

0.8600095831820258


In [7]:
# Predict the target for every row of the encoded test set
predictions = model.predict(X_test)

# Build the submission table: one row per test index value with its prediction.
# NOTE(review): the frame previews show the index named `id`, while the header
# written here is 'Id' — confirm which spelling the submission consumer expects.
submission_columns = {'Id': X_test.index, 'target': predictions}
output = pd.DataFrame(submission_columns)

# Write the table to disk; index=False keeps the DataFrame's own row index out of the file
output.to_csv('submission.csv', index=False)