In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import randint, uniform

In [2]:
# Load the training table; the path is resolved relative to the working directory.
data = pd.read_csv("cattle_data_train.csv")

# The last column is used as the regression target. Column 0 is skipped
# (presumably an identifier column -- TODO confirm) and everything in between
# is kept as candidate features.
yields = data.iloc[:, -1]
features = data.iloc[:, 1:-1]

for part in (features, yields):
    print(part.shape)

(210000, 34)
(210000,)


In [3]:
from sklearn.preprocessing import LabelEncoder

def drop_high_corr_and_missing(df, corr_threshold=0.95, missing_threshold=0.1):
    """Drop columns that are near-duplicates of an earlier column or mostly missing.

    Text-like columns (object / string / category) are integer-coded on a
    temporary copy so they can take part in the correlation matrix; the
    returned frame keeps the original, un-encoded values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input features. Not modified.
    corr_threshold : float, default 0.95
        A column is dropped when its absolute correlation with any column
        that appears *before* it strictly exceeds this value.
    missing_threshold : float, default 0.1
        A column is dropped when its fraction of missing values strictly
        exceeds this value.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The reduced frame and the list of dropped column names. The list is
        ordered deterministically: correlation-based drops first (in column
        order), then missing-value drops, without duplicates.
    """
    encoded = df.copy()
    for col in encoded.columns:
        dtype = encoded[col].dtype
        is_text_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_text_like:
            # Cast to str first so mixed-type object columns can be sorted,
            # then use the (sorted) category codes as integer labels.
            encoded[col] = encoded[col].astype(str).astype("category").cat.codes

    corr = encoded.corr().abs()
    # Keep only the strict upper triangle: every pair is looked at once and
    # it is always the later column of a correlated pair that gets flagged.
    upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    upper = corr.where(upper_mask)
    flagged = (upper > corr_threshold).any(axis=0).to_numpy()
    to_drop_corr = upper.columns[flagged].tolist()

    missing_share = df.isna().mean()
    to_drop_missing = missing_share[missing_share > missing_threshold].index.tolist()

    # dict.fromkeys de-duplicates while preserving order; a set would make the
    # order of the reported columns vary between interpreter runs.
    to_drop = list(dict.fromkeys(to_drop_corr + to_drop_missing))

    df_reduced = df.drop(columns=to_drop)
    return df_reduced, to_drop

# Prune with a stricter missing-value cut-off (5%) than the function default (10%).
reduced, dropped = drop_high_corr_and_missing(
    features,
    corr_threshold=0.95,
    missing_threshold=0.05,
)
print(f"Dropped columns: {dropped}")
print(f"Reduced shape: {reduced.shape}")
reduced.head()



Dropped columns: ['Feed_Quantity_lb', 'Previous_Week_Avg_Yield']
Reduced shape: (210000, 32)


Unnamed: 0,Breed,Climate_Zone,Management_System,Age_Months,Weight_kg,Parity,Lactation_Stage,Days_in_Milk,Feed_Type,Feed_Quantity_kg,...,BQ_Vaccine,Anthrax_Vaccine,IBR_Vaccine,BVD_Vaccine,Rabies_Vaccine,Body_Condition_Score,Milking_Interval_hrs,Date,Farm_ID,Mastitis
0,Holstein,Tropical,Intensive,114,544.8,4,Mid,62,Concentrates,16.363455,...,0,0,1,0,1,3.0,12,2024-01-15,FARM_0301,1
1,Holstein,Arid,Mixed,136,298.9,4,Mid,213,Crop_Residues,,...,1,0,0,0,0,4.0,12,2023-10-31,FARM_0219,0
2,Holstein,Tropical,Semi_Intensive,64,336.6,4,Late,16,Hay,7.198607,...,1,0,1,1,0,3.5,12,2024-05-20,FARM_0802,0
3,Jersey,Mediterranean,Intensive,58,370.5,1,Early,339,Crop_Residues,18.694344,...,0,1,0,0,0,3.0,24,2024-07-22,FARM_0034,0
4,Guernsey,Subtropical,Intensive,84,641.5,6,Early,125,Mixed_Feed,14.779198,...,1,1,0,1,1,3.0,12,2023-01-03,FARM_0695,1


In [4]:
def fill_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Numeric columns (any integer or float width, not only ``int64`` /
    ``float64``) are filled with the column median; all other columns are
    filled with the most frequent value.

    Columns that contain no observed value at all are left untouched, because
    neither a median nor a mode exists for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    df_filled = df.copy()

    for col in df_filled.columns:
        series = df_filled[col]
        if not series.isna().any():
            continue  # nothing to impute

        dtype = series.dtype
        is_numeric = (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )
        if is_numeric:
            fill_value = series.median()
            if pd.isna(fill_value):
                continue  # column is entirely missing
        else:
            modes = series.mode()
            if modes.empty:
                continue  # column is entirely missing
            # mode() is sorted, so ties resolve to the smallest value.
            fill_value = modes.iloc[0]

        df_filled[col] = series.fillna(fill_value)

    return df_filled

features_filled = fill_missing_values(reduced)

# The estimators used below cannot handle NaN, so fail early and explicitly
# instead of relying on a discarded mid-cell expression.
n_missing = int(features_filled.isna().sum().sum())
assert n_missing == 0, f"{n_missing} missing values remain after imputation"

# Derive the column groups from the frame that is actually fed to the models.
cat_cols = [c for c in features_filled.columns if features_filled[c].dtype == "object"]
num_cols = [c for c in features_filled.columns if c not in cat_cols]

from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

def rmse(y_true, y_pred):
    """Root-mean-squared error: the square root of ``mean_squared_error``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# greater_is_better=False makes the scorer return the *negated* RMSE, so that
# "higher is better" still holds for model selection.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Both encoders must tolerate categories that are absent from a training fold;
# otherwise transforming the held-out fold raises during cross-validation.
encoders = {
    "onehot": OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    # Unseen categories are mapped to -1 instead of raising.
    "ordinal": OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
}

In [5]:
from scipy.stats import uniform

def make_pipeline(encoder, use_pca=False, n_components=10, random_state=42,
                  numeric_columns=None, categorical_columns=None):
    """Build the preprocessing + MLP regression pipeline.

    Parameters
    ----------
    encoder : transformer
        Encoder applied to the categorical columns.
    use_pca : bool, default False
        When True, a PCA step is inserted between preprocessing and the MLP.
    n_components : int, default 10
        Number of PCA components; only used when ``use_pca`` is True.
    random_state : int, default 42
        Seed passed to the MLP and, when enabled, to PCA.
    numeric_columns, categorical_columns : list, optional
        Columns routed to the scaler / encoder. When omitted, the
        notebook-level ``num_cols`` / ``cat_cols`` are used.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps are named "preprocess", optionally "pca", and "mlp".
    """
    if numeric_columns is None:
        numeric_columns = num_cols
    if categorical_columns is None:
        categorical_columns = cat_cols

    preprocess = ColumnTransformer(
        [
            ("num", StandardScaler(), numeric_columns),
            ("cat", encoder, categorical_columns),
        ],
        remainder="drop",  # columns in neither list are discarded
    )

    steps = [("preprocess", preprocess)]
    if use_pca:
        steps.append(("pca", PCA(n_components=n_components, random_state=random_state)))
    steps.append((
        "mlp",
        MLPRegressor(max_iter=200, early_stopping=True, tol=1e-3, random_state=random_state),
    ))
    return Pipeline(steps)

# Search space for RandomizedSearchCV; the "mlp__" prefix routes each entry to
# the pipeline step named "mlp". Lists are sampled from directly, while
# scipy's uniform(loc, scale) draws from the interval [loc, loc + scale].
param_space = dict(
    mlp__hidden_layer_sizes=[(32,), (64,), (64, 32)],
    mlp__activation=["relu", "tanh"],
    mlp__alpha=uniform(loc=1e-5, scale=1e-3),
    mlp__learning_rate_init=uniform(loc=1e-4, scale=1e-2),
)

# Maps encoder name -> best cross-validated score as returned by the scorer.
# rmse_scorer is built with greater_is_better=False, so these values are the
# NEGATED RMSE (closer to zero is better).
results = {}

for enc_name, enc in encoders.items():
    pipe = make_pipeline(enc, use_pca=False)
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_space,
        n_iter=10,
        scoring=rmse_scorer,
        cv=3,
        random_state=42,
        verbose=2,
        n_jobs=1
    )
    search.fit(features_filled, yields)
    results[enc_name] = search.best_score_
    # Flip the sign back so the value printed under "RMSE" is the actual error.
    best_rmse = -search.best_score_
    print(f"Best RMSE for {enc_name}: {best_rmse}")
    print(f"Best params: {search.best_params_}\n")

print(results)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END mlp__activation=relu, mlp__alpha=0.0008065429868602329, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.007419939418114052; total time=  59.1s
[CV] END mlp__activation=relu, mlp__alpha=0.0008065429868602329, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.007419939418114052; total time= 1.6min
[CV] END mlp__activation=relu, mlp__alpha=0.0008065429868602329, mlp__hidden_layer_sizes=(64, 32), mlp__learning_rate_init=0.007419939418114052; total time= 2.5min
[CV] END mlp__activation=relu, mlp__alpha=0.000606850157946487, mlp__hidden_layer_sizes=(64,), mlp__learning_rate_init=0.0016599452033620266; total time=  47.3s
[CV] END mlp__activation=relu, mlp__alpha=0.000606850157946487, mlp__hidden_layer_sizes=(64,), mlp__learning_rate_init=0.0016599452033620266; total time=  46.6s
[CV] END mlp__activation=relu, mlp__alpha=0.000606850157946487, mlp__hidden_layer_sizes=(64,), mlp__learning_rate_init=0.0016