<a href="https://colab.research.google.com/github/Samiiee37/Dino/blob/main/catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Display all columns: passing None lifts pandas' limit on how many columns a
# DataFrame repr shows, so wide frames are not elided in cell output.
pd.set_option('display.max_columns', None)


In [None]:
# Ask for the competition CSVs through Colab's upload widget.
uploaded = files.upload()

# Load data from the files that were just uploaded.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_solution.csv")

# Print the dimensions of each frame as a quick sanity check.
for label, frame in (
    ("Train", train),
    ("Test", test),
    ("Sample Submission", sample_submission),
):
    print(f"{label} shape:", frame.shape)


Saving sample_solution.csv to sample_solution.csv
Saving test.csv to test.csv
Saving train.csv to train.csv
Train shape: (2000, 65)
Test shape: (500, 56)
Sample Submission shape: (500, 11)


In [None]:
# Identify property columns: every column of `train` whose name contains the
# substring "_Property".
property_cols = [col for col in train.columns if "_Property" in col]

def add_aggregate_features(df):
    """Add row-wise summary statistics over the raw component property columns.

    The statistics are computed across every column of ``df`` named
    ``Component<i>_Property<j>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the component property columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``prop_mean``, ``prop_std``, ``prop_min``,
        ``prop_max``, ``prop_range`` and ``prop_median`` added (or overwritten).
    """
    # Select the columns from the frame being processed rather than from a
    # notebook-level list, and anchor the pattern so derived columns such as
    # `Component1_Property1_rank` are not folded into the statistics when the
    # function is applied to an already-engineered frame.
    raw_props = df.filter(regex=r"^Component\d+_Property\d+$")

    df["prop_mean"] = raw_props.mean(axis=1)
    df["prop_std"] = raw_props.std(axis=1)
    df["prop_min"] = raw_props.min(axis=1)
    df["prop_max"] = raw_props.max(axis=1)
    df["prop_range"] = df["prop_max"] - df["prop_min"]
    df["prop_median"] = raw_props.median(axis=1)
    return df

# Add the row-level aggregate columns to both frames, rebinding each name to the
# frame the function returns.
train = add_aggregate_features(train)
test = add_aggregate_features(test)

In [None]:
def add_cross_component_stats(df):
    """Summarise each of the ten properties across the five components.

    For property ``k`` the columns ``Component1_Property<k>`` ..
    ``Component5_Property<k>`` are reduced row by row into
    ``Property<k>_mean``, ``Property<k>_std``, ``Property<k>_min``,
    ``Property<k>_max`` and ``Property<k>_range`` (max minus min).

    ``df`` is modified in place and also returned.
    """
    component_ids = range(1, 6)
    for k in range(1, 11):
        across = df[[f"Component{c}_Property{k}" for c in component_ids]]
        lowest = across.min(axis=1)
        highest = across.max(axis=1)

        df[f"Property{k}_mean"] = across.mean(axis=1)
        df[f"Property{k}_std"] = across.std(axis=1)
        df[f"Property{k}_min"] = lowest
        df[f"Property{k}_max"] = highest
        df[f"Property{k}_range"] = highest - lowest
    return df

# Add the per-property cross-component statistics to both frames.
train = add_cross_component_stats(train)
test = add_cross_component_stats(test)

In [None]:
# Clean old if exists: drop weighted columns left by an earlier run of this cell
# so that re-running it does not produce duplicate column names.
stale_weighted = r"^Weighted_Property|^C[1-5]_P(?:[1-9]|10)_weighted"
train = train.loc[:, ~train.columns.str.contains(stale_weighted)]
test = test.loc[:, ~test.columns.str.contains(stale_weighted)]

# Generate weighted interaction features (component fraction x component property).
# The 50 columns per frame are collected in dicts and attached with one concat
# below; inserting them one at a time fragments the frame, which appears to be
# what the repeated lines echoed in this cell's saved output were warning about.
train_interaction_cols = {}
test_interaction_cols = {}
for i in range(1, 6):
    for j in range(1, 11):
        name = f"C{i}_P{j}_weighted"
        train_interaction_cols[name] = train[f"Component{i}_fraction"] * train[f"Component{i}_Property{j}"]
        test_interaction_cols[name] = test[f"Component{i}_fraction"] * test[f"Component{i}_Property{j}"]

train_interactions = pd.DataFrame(train_interaction_cols)
test_interactions = pd.DataFrame(test_interaction_cols)

# Aggregate weighted properties: for each property, sum its five weighted
# component columns.
weighted_train_props = {}
weighted_test_props = {}

for j in range(1, 11):
    cols = [f"C{i}_P{j}_weighted" for i in range(1, 6)]
    weighted_train_props[f"Weighted_Property{j}"] = train_interactions[cols].sum(axis=1)
    weighted_test_props[f"Weighted_Property{j}"] = test_interactions[cols].sum(axis=1)

# Column order matches the original cell: existing columns, the C{i}_P{j}_weighted
# interactions, then the Weighted_Property{j} sums.
train = pd.concat([train, train_interactions, pd.DataFrame(weighted_train_props)], axis=1)
test = pd.concat([test, test_interactions, pd.DataFrame(weighted_test_props)], axis=1)


  test[f"C{i}_P{j}_weighted"] = test[f"Component{i}_fraction"] * test[f"Component{i}_Property{j}"]
  train[f"C{i}_P{j}_weighted"] = train[f"Component{i}_fraction"] * train[f"Component{i}_Property{j}"]
  test[f"C{i}_P{j}_weighted"] = test[f"Component{i}_fraction"] * test[f"Component{i}_Property{j}"]
  train[f"C{i}_P{j}_weighted"] = train[f"Component{i}_fraction"] * train[f"Component{i}_Property{j}"]
  test[f"C{i}_P{j}_weighted"] = test[f"Component{i}_fraction"] * test[f"Component{i}_Property{j}"]
  train[f"C{i}_P{j}_weighted"] = train[f"Component{i}_fraction"] * train[f"Component{i}_Property{j}"]
  test[f"C{i}_P{j}_weighted"] = test[f"Component{i}_fraction"] * test[f"Component{i}_Property{j}"]
  train[f"C{i}_P{j}_weighted"] = train[f"Component{i}_fraction"] * train[f"Component{i}_Property{j}"]
  test[f"C{i}_P{j}_weighted"] = test[f"Component{i}_fraction"] * test[f"Component{i}_Property{j}"]
  train[f"C{i}_P{j}_weighted"] = train[f"Component{i}_fraction"] * train[f"Component{i}_Property{

In [None]:
# Per-property spread across the five components (largest minus smallest value),
# written to both frames.
# NOTE(review): this is the same quantity add_cross_component_stats stores as
# Property{j}_range, so the two columns look redundant - confirm before keeping both.
for j in range(1, 11):
    cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
    for frame in (train, test):
        component_values = frame[cols]
        frame[f"Property{j}_diversity"] = component_values.max(axis=1) - component_values.min(axis=1)

In [None]:
# Rank features
def add_rank_features(df):
    """Rank each property's value among the five components, row by row.

    For every property the five ``Component<i>_Property<j>`` columns are ranked
    along the row (ties share the lowest rank, ``method='min'``) and the result
    is stored in matching ``Component<i>_Property<j>_rank`` columns.

    ``df`` is modified in place and also returned.
    """
    for prop in range(1, 11):
        source = [f"Component{comp}_Property{prop}" for comp in range(1, 6)]
        ranked = df[source].rank(axis=1, method='min')
        # Copy the ranks over one column at a time, keeping the component order.
        for original_name in source:
            df[f"{original_name}_rank"] = ranked[original_name]
    return df

# Add the within-row rank columns to both frames.
train = add_rank_features(train)
test = add_rank_features(test)

# Entropy features
from scipy.stats import entropy

def add_entropy_features(df):
    """Add the entropy of each property's magnitude profile across components.

    For every property ``j`` the absolute values of the five
    ``Component<i>_Property<j>`` columns are treated as an unnormalised
    distribution per row, and its entropy (as computed by
    ``scipy.stats.entropy`` with default settings) is stored in
    ``Property<j>_entropy``.

    Rows whose five values are all zero have no defined distribution; they are
    assigned an entropy of ``0.0`` instead of the NaN (and divide-by-zero
    warning) that the 0/0 normalisation would otherwise produce. Rows that
    contain NaN still yield NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the component property columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ten ``Property<j>_entropy`` columns added.
    """
    for j in range(1, 11):
        cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
        magnitudes = df[cols].abs().to_numpy(dtype=float)
        zero_rows = magnitudes.sum(axis=1) == 0
        # Swap all-zero rows for a placeholder so scipy never divides 0 by 0;
        # their result is overwritten with 0.0 below.
        safe = np.where(zero_rows[:, None], 1.0, magnitudes)
        # scipy normalises each row to sum to one itself, so no manual division
        # is needed, and axis=1 handles every row in one call.
        row_entropy = entropy(safe, axis=1)
        df[f"Property{j}_entropy"] = np.where(zero_rows, 0.0, row_entropy)
    return df

# Add the per-property entropy columns to both frames.
train = add_entropy_features(train)
test = add_entropy_features(test)

In [None]:
# Names of the ten blend targets to predict.
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]

# Targets first, then features: every train column that is not a target.
y = train[target_cols]
X = train.drop(columns=target_cols)

# Give the test frame exactly the training feature columns, in the same order.
X_test = test.loc[:, X.columns]

In [None]:
!pip install catboost --quiet
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor

# Hyper-parameters for the CatBoost regressor wrapped below; verbose=100 is what
# produces a progress line every 100 iterations in the cell output.
catboost_params = dict(
    iterations=1500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100,
    task_type='CPU',
)

# Wrap the regressor so all ten target columns in `y` are handled by one estimator.
cat_model = MultiOutputRegressor(CatBoostRegressor(**catboost_params))
cat_model.fit(X, y)

# Predictions for the test features (not referenced again in the visible cells).
cat_preds = cat_model.predict(X_test)


0:	learn: 0.9625451	total: 107ms	remaining: 2m 39s
100:	learn: 0.1605403	total: 5.57s	remaining: 1m 17s
200:	learn: 0.0766349	total: 10.3s	remaining: 1m 6s
300:	learn: 0.0520430	total: 16.4s	remaining: 1m 5s
400:	learn: 0.0401643	total: 21.2s	remaining: 58s
500:	learn: 0.0322700	total: 27.3s	remaining: 54.4s
600:	learn: 0.0259149	total: 32.1s	remaining: 48s
700:	learn: 0.0213980	total: 36.9s	remaining: 42s
800:	learn: 0.0176235	total: 42.9s	remaining: 37.5s
900:	learn: 0.0145711	total: 49.1s	remaining: 32.6s
1000:	learn: 0.0121127	total: 55.2s	remaining: 27.5s
1100:	learn: 0.0100012	total: 60s	remaining: 21.7s
1200:	learn: 0.0083490	total: 1m 6s	remaining: 16.4s
1300:	learn: 0.0069404	total: 1m 10s	remaining: 10.8s
1400:	learn: 0.0058258	total: 1m 15s	remaining: 5.34s
1499:	learn: 0.0047863	total: 1m 21s	remaining: 0us
0:	learn: 0.9699741	total: 47.9ms	remaining: 1m 11s
100:	learn: 0.1597918	total: 4.85s	remaining: 1m 7s
200:	learn: 0.0752179	total: 10.9s	remaining: 1m 10s
300:	learn: 

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor

# Identify all numerical columns (should be all of them)
numerical_cols = list(X.columns)

# Define column transformer to scale all features
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_cols)]
)

# Define the CatBoost pipeline: scaling first, then a multi-output wrapper around
# a single CatBoost configuration.
# NOTE(review): the log saved under this cell runs past iteration 1000, so it
# seems to come from a run with a larger `iterations` value - re-run to refresh it.
catboost_base = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100,
    task_type='CPU',
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MultiOutputRegressor(catboost_base)),
])

# Fit the pipeline on training data
pipeline.fit(X, y)

# Predict on test data
final_preds = pipeline.predict(X_test)

# Create submission DataFrame: an ID column followed by the ten predicted blend
# properties.
blend_columns = [f"BlendProperty{i}" for i in range(1, 11)]
submission = pd.DataFrame(final_preds, columns=blend_columns)
submission.insert(0, "ID", test["ID"].values)
submission.to_csv("submission_catboost_pipeline.csv", index=False)

# Download submission
from google.colab import files
files.download("submission_catboost_pipeline.csv")


0:	learn: 0.9625451	total: 73ms	remaining: 2m 26s
100:	learn: 0.1605403	total: 4.81s	remaining: 1m 30s
200:	learn: 0.0766349	total: 10.9s	remaining: 1m 37s
300:	learn: 0.0520430	total: 15.6s	remaining: 1m 28s
400:	learn: 0.0401643	total: 20.5s	remaining: 1m 21s
500:	learn: 0.0323159	total: 26.5s	remaining: 1m 19s
600:	learn: 0.0262520	total: 31.2s	remaining: 1m 12s
700:	learn: 0.0213507	total: 37.3s	remaining: 1m 9s
800:	learn: 0.0174893	total: 42.1s	remaining: 1m 2s
900:	learn: 0.0144144	total: 47.8s	remaining: 58.3s
1000:	learn: 0.0119036	total: 52.9s	remaining: 52.8s
1100:	learn: 0.0099038	total: 57.7s	remaining: 47.1s
1200:	learn: 0.0082084	total: 1m 3s	remaining: 42.4s
1300:	learn: 0.0068514	total: 1m 8s	remaining: 36.8s
1400:	learn: 0.0056797	total: 1m 14s	remaining: 31.9s
1500:	learn: 0.0047969	total: 1m 19s	remaining: 26.4s
1600:	learn: 0.0040471	total: 1m 24s	remaining: 21s
1700:	learn: 0.0034019	total: 1m 30s	remaining: 15.9s
1800:	learn: 0.0029109	total: 1m 34s	remaining: 10

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew, entropy
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
from google.colab import files

# Define feature engineering functions
def add_aggregate_features(df):
    """Add row-wise summary statistics over the raw component property columns.

    The statistics are computed across every column of ``df`` named
    ``Component<i>_Property<j>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the component property columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``prop_mean``, ``prop_std``, ``prop_min``,
        ``prop_max``, ``prop_range`` and ``prop_median`` added (or overwritten).
    """
    # Anchor the pattern to the raw column names. A plain `"_Property" in col`
    # test also matches derived columns such as `Component1_Property1_rank`, so
    # applying the function to an already-engineered frame would mix ranks into
    # the statistics.
    property_values = df.filter(regex=r"^Component\d+_Property\d+$")

    df["prop_mean"] = property_values.mean(axis=1)
    df["prop_std"] = property_values.std(axis=1)
    df["prop_min"] = property_values.min(axis=1)
    df["prop_max"] = property_values.max(axis=1)
    df["prop_range"] = df["prop_max"] - df["prop_min"]
    df["prop_median"] = property_values.median(axis=1)
    return df

def add_cross_component_stats(df):
    """Add mean/std/min/max/range of every property taken across the five components.

    The new columns are named ``Property<k>_mean``, ``Property<k>_std``,
    ``Property<k>_min``, ``Property<k>_max`` and ``Property<k>_range`` for
    ``k`` from 1 to 10. ``df`` is modified in place and returned.
    """
    for prop_idx in range(1, 11):
        values = df[[f"Component{i}_Property{prop_idx}" for i in range(1, 6)]]
        # The reducers are looked up by name so the column suffix and the
        # statistic cannot drift apart.
        for stat in ("mean", "std", "min", "max"):
            df[f"Property{prop_idx}_{stat}"] = getattr(values, stat)(axis=1)
        df[f"Property{prop_idx}_range"] = (
            df[f"Property{prop_idx}_max"] - df[f"Property{prop_idx}_min"]
        )
    return df

def add_rank_features(df):
    """Attach, for every component property column, its rank within the row.

    Ranking is done per property across the five components with
    ``method='min'`` (tied values share the lowest rank). Each result lands in
    a column named after its source with a ``_rank`` suffix. ``df`` is modified
    in place and returned.
    """
    for j in range(1, 11):
        sources = [f"Component{i}_Property{j}" for i in range(1, 6)]
        ranks = df[sources].rank(axis=1, method='min')
        for source_name, rank_values in ranks.items():
            df[f"{source_name}_rank"] = rank_values
    return df

def add_entropy_features(df):
    """Add the entropy of each property's magnitude profile across components.

    For every property ``j`` the absolute values of the five
    ``Component<i>_Property<j>`` columns are treated as an unnormalised
    distribution per row, and its entropy (as computed by
    ``scipy.stats.entropy`` with default settings) is stored in
    ``Property<j>_entropy``.

    Rows whose five values are all zero are assigned ``0.0``. Adding a small
    constant to the denominator does not achieve that: scipy renormalises its
    input, so an all-zero row is still 0/0 and comes back as NaN. Rows that
    contain NaN still yield NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the component property columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ten ``Property<j>_entropy`` columns added.
    """
    for j in range(1, 11):
        cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
        magnitudes = df[cols].abs().to_numpy(dtype=float)
        all_zero = magnitudes.sum(axis=1) == 0
        # Placeholder values for all-zero rows keep scipy's internal
        # normalisation finite; those rows are set to 0.0 afterwards.
        finite_input = np.where(all_zero[:, None], 1.0, magnitudes)
        # One vectorised call over all rows replaces the per-row apply.
        per_row = entropy(finite_input, axis=1)
        df[f"Property{j}_entropy"] = np.where(all_zero, 0.0, per_row)
    return df

def add_weighted_stats(df):
    """Add the fraction-weighted variance and the skewness of every property.

    For each property ``j`` the five ``Component<i>_Property<j>`` values of a
    row are weighted by the matching ``Component<i>_fraction`` values:

    * ``Property<j>_weighted_variance`` is the weighted mean squared deviation
      from the weighted mean.
    * ``Property<j>_skewness`` is ``scipy.stats.skew`` of the five unweighted
      values.

    NaN inputs propagate to the corresponding outputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the fraction and component property columns. Modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the twenty new columns added.
    """
    fraction_cols = [f"Component{i}_fraction" for i in range(1, 6)]
    weights = df[fraction_cols].to_numpy(dtype=float)
    # The small constant keeps the divisions finite if a row's fractions sum to zero.
    weight_total = weights.sum(axis=1) + 1e-6

    for j in range(1, 11):
        cols = [f"Component{i}_Property{j}" for i in range(1, 6)]
        values = df[cols].to_numpy(dtype=float)
        # Multiply as plain arrays so component i's property meets component i's
        # fraction. DataFrame * DataFrame aligns on column names, and the
        # property and fraction columns share none, so every product was NaN and
        # the variance column came out as a constant 0.
        weighted_mean = (values * weights).sum(axis=1) / weight_total
        squared_dev = (values - weighted_mean[:, None]) ** 2
        df[f"Property{j}_weighted_variance"] = (squared_dev * weights).sum(axis=1) / weight_total
        df[f"Property{j}_skewness"] = skew(values, axis=1)
    return df

def add_ratio_features(df):
    """Add per-property and neighbouring-fraction ratio columns.

    * ``Property<j>_to_mean_ratio``: mean of property ``j`` across the five
      components divided by the row's ``prop_mean`` column.
    * ``C<i>_to_C<i+1>_fraction_ratio``: fraction of component ``i`` divided by
      the fraction of component ``i + 1``, for ``i`` from 1 to 4.

    Every denominator has ``1e-6`` added to it. ``df`` must already contain
    ``prop_mean``; it is modified in place and returned.
    """
    eps = 1e-6
    overall_mean = df["prop_mean"] + eps
    for j in range(1, 11):
        per_property_mean = df[[f"Component{i}_Property{j}" for i in range(1, 6)]].mean(axis=1)
        df[f"Property{j}_to_mean_ratio"] = per_property_mean / overall_mean

    for left in range(1, 5):
        right = left + 1
        numerator = df[f"Component{left}_fraction"]
        denominator = df[f"Component{right}_fraction"] + eps
        df[f"C{left}_to_C{right}_fraction_ratio"] = numerator / denominator
    return df

# Combined preprocess_features function (without clustering)
def preprocess_features(df):
    """Run every feature-engineering step on ``df`` in a fixed order and return the result."""
    # Order matters: add_ratio_features reads the "prop_mean" column that
    # add_aggregate_features creates.
    steps = (
        add_aggregate_features,
        add_cross_component_stats,
        add_rank_features,
        add_entropy_features,
        add_weighted_stats,
        add_ratio_features,
    )
    for step in steps:
        df = step(df)
    return df

# Reload data to ensure clean state (replace with your actual file paths)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Apply feature engineering to train and test
train = preprocess_features(train)
test = preprocess_features(test)

# Prepare features and target: the ten blend columns are the targets, every other
# train column is a feature, and test is aligned to the same feature columns.
target_cols = [f"BlendProperty{i}" for i in range(1, 11)]
y = train[target_cols]
X = train.drop(columns=target_cols)
X_test = test.loc[:, X.columns]

# Identify all numerical columns
numerical_cols = list(X.columns)

# Define column transformer to scale all features
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_cols)]
)

# Define the CatBoost pipeline: scaling followed by a multi-output wrapper around
# one CatBoost configuration.
catboost_settings = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100,
    task_type='CPU',
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MultiOutputRegressor(CatBoostRegressor(**catboost_settings))),
])

# Fit the pipeline on training data
pipeline.fit(X, y)

# Predict on test data
final_preds = pipeline.predict(X_test)

# Create submission DataFrame: ID first, then the predicted blend properties
# (target_cols holds exactly the BlendProperty1..BlendProperty10 names).
submission = pd.DataFrame(final_preds, columns=target_cols)
submission.insert(0, "ID", test["ID"].values)
submission.to_csv("submission_catboost_pipeline.csv", index=False)

# Download submission
files.download("submission_catboost_pipeline.csv")

  df[[f"{col}_rank" for col in cols]] = df[cols].rank(axis=1, method='min')
  df[[f"{col}_rank" for col in cols]] = df[cols].rank(axis=1, method='min')
  df[[f"{col}_rank" for col in cols]] = df[cols].rank(axis=1, method='min')
  df[[f"{col}_rank" for col in cols]] = df[cols].rank(axis=1, method='min')
  df[[f"{col}_rank" for col in cols]] = df[cols].rank(axis=1, method='min')
  df[[f"{col}_rank" for col in cols]] = df[cols].rank(axis=1, method='min')
  df[[f"{col}_rank" for col in cols]] = df[cols].rank(axis=1, method='min')
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply(
  df[f"Property{j}_entropy"] = df[cols].apply

0:	learn: 0.9669755	total: 49ms	remaining: 49s
100:	learn: 0.2512002	total: 5.08s	remaining: 45.2s
200:	learn: 0.1416537	total: 8.72s	remaining: 34.6s
300:	learn: 0.0885929	total: 12.4s	remaining: 28.8s
400:	learn: 0.0660042	total: 17.4s	remaining: 25.9s
500:	learn: 0.0515918	total: 21s	remaining: 21s
600:	learn: 0.0405304	total: 24.7s	remaining: 16.4s
700:	learn: 0.0324201	total: 29.7s	remaining: 12.7s
800:	learn: 0.0265726	total: 33.3s	remaining: 8.28s
900:	learn: 0.0220015	total: 37s	remaining: 4.06s
999:	learn: 0.0181605	total: 41.9s	remaining: 0us
0:	learn: 0.9762720	total: 35.8ms	remaining: 35.7s
100:	learn: 0.2394279	total: 3.68s	remaining: 32.7s
200:	learn: 0.1289299	total: 7.32s	remaining: 29.1s
300:	learn: 0.0841033	total: 12.3s	remaining: 28.5s
400:	learn: 0.0625932	total: 16s	remaining: 23.9s
500:	learn: 0.0489894	total: 19.6s	remaining: 19.5s
600:	learn: 0.0396006	total: 24.5s	remaining: 16.3s
700:	learn: 0.0321938	total: 28.2s	remaining: 12s
800:	learn: 0.0262366	total: 3

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>