In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Read the two competition tables: the labelled training set and the
# unlabelled test set that predictions are produced for later on.
train_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/competition/HD_AI_Challenge/train.csv',
)
test_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/competition/HD_AI_Challenge/test.csv',
)

In [None]:
# Show the training frame as this cell's output for a quick visual check
# of its columns and values before any preprocessing is defined.
train_df

In [None]:
# Split the training columns into two feature groups by dtype.
# "SAMPLE_ID" is kept out of the categorical group and the target
# "CI_HOUR" is kept out of the numerical group, so neither becomes a feature.
categorical_cols = [
    col_name
    for col_name, col_dtype in train_df.dtypes.items()
    if col_name != "SAMPLE_ID" and col_dtype == "object"
]
numerical_cols = [
    col_name
    for col_name, col_dtype in train_df.dtypes.items()
    if col_name != "CI_HOUR" and col_dtype in ['int64', 'float64']
]

In [None]:
# Numerical branch: fill missing values with the column mean, then
# standardise each column.
numerical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Custom transformer for frequency encoding
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Encode each category as the number of times it occurred during ``fit``.

    Parameters
    ----------
    columns : list of str
        Column labels applied to ``X`` in ``fit`` and ``transform``. They are
        required because ``X`` may arrive as an unlabeled array; when ``X`` is
        already a DataFrame these labels select the columns to encode.
    unseen_value : scalar or None, default=None
        Value written where a category has no count from ``fit``. ``None``
        keeps the original behaviour and leaves those cells as NaN. When a
        value is given, missing inputs (NaN) receive it as well, because they
        have no count either.

    Attributes
    ----------
    freq_map : dict
        Maps each column label to a Series of ``category -> count`` built by
        the most recent ``fit`` call. Empty until ``fit`` has run.
    """

    def __init__(self, columns, unseen_value=None):
        self.columns = columns
        self.unseen_value = unseen_value
        # Kept as an empty dict before fitting so existing code that reads
        # ``freq_map`` on an unfitted encoder keeps working.
        self.freq_map = {}

    def fit(self, X, y=None):
        """Count category occurrences per column and store them in ``freq_map``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        X_df = pd.DataFrame(X, columns=self.columns)
        # Build a fresh dict on every fit instead of mutating the existing one:
        # a refit after ``columns`` changed must not keep stale entries, and
        # shallow copies of the encoder must not share fitted state.
        self.freq_map = {col: X_df[col].value_counts() for col in self.columns}
        return self

    def transform(self, X, y=None):
        """Return a DataFrame in which every category is replaced by its count.

        Raises ``KeyError`` if a column in ``columns`` was not seen by ``fit``.
        """
        # Work on a copy so the caller's data is never modified.
        X_transformed = pd.DataFrame(X, columns=self.columns).copy()
        for col in self.columns:
            encoded = X_transformed[col].map(self.freq_map[col])
            if self.unseen_value is not None:
                # Categories without a training count come back as NaN from
                # ``map``; replace them only when the caller asked for it.
                encoded = encoded.fillna(self.unseen_value)
            X_transformed[col] = encoded
        return X_transformed

In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then frequency-encode. The encoder is given the column names explicitly
# so it can label the data it receives from the imputer step.
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('frequency', FrequencyEncoder(columns=categorical_cols)),
    ]
)

In [None]:
# Route each column group through its own branch: numerical columns go to
# the 'num' pipeline and categorical columns to the 'cat' pipeline.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [None]:
# Define the model: a histogram-based gradient boosting regressor.
# No arguments are passed, so the library's default hyperparameters apply.
model = HistGradientBoostingRegressor()

In [None]:
# Chain preprocessing and the regressor into a single estimator so that
# fit/predict always apply the same transformations before the model.
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

In [None]:
# Target is "CI_HOUR"; the predictors are every remaining column except
# the "SAMPLE_ID" identifier.
y = train_df["CI_HOUR"]
X = train_df.drop(columns=["SAMPLE_ID", "CI_HOUR"])

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes
# the split repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit the whole pipeline (preprocessing steps, then the model) on the
# training split only; the validation split is not seen here.
pipeline.fit(X_train, y_train)

In [None]:
# Run the fitted pipeline on the held-out validation split to get
# predictions; the pipeline applies its preprocessing before the model.
preds = pipeline.predict(X_valid)

In [None]:
# Score the validation predictions with mean absolute error
# (same units as the target).
score = mean_absolute_error(y_true=y_valid, y_pred=preds)
print('MAE:', score)

In [None]:
# Summary statistics of the target column, for judging the size of the
# validation MAE against the target's spread.
ci_hour_stats = train_df.loc[:, "CI_HOUR"].describe()
print(ci_hour_stats)

In [None]:
# Build the test feature frame by removing the identifier column, then
# predict with the fitted pipeline.
X_test = test_df.drop(columns="SAMPLE_ID")
test_preds = pipeline.predict(X_test)

In [None]:
# Pair every test SAMPLE_ID with its predicted CI_HOUR and write the
# result as a CSV without the index column.
# NOTE(review): the target file is named sample_submission.csv; if a
# provided template with that name lives in this folder it will be
# overwritten. Confirm that this is intended.
output = pd.DataFrame(
    {
        'SAMPLE_ID': test_df["SAMPLE_ID"],
        'CI_HOUR': test_preds,
    }
)
output.to_csv(
    '/content/drive/MyDrive/competition/HD_AI_Challenge/sample_submission.csv',
    index=False,
)