In [19]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time

In [20]:
# Read the raw training table from disk.
data = pd.read_csv("cattle_data_train.csv")

# Provisional positional split: the last column is taken as the target and
# every column except the first and the last as inputs.
yields = data.iloc[:, -1]
features = data.iloc[:, 1:-1]



In [21]:
# Feature Removal and Preprocessing
# Based on correlation analysis and data quality issues

# Features to remove (9 active entries).
# The commented-out entries are low-correlation candidates that were considered
# for removal but are currently KEPT in the dataset; uncomment one to drop it.
features_to_remove = [
    'Feed_Quantity_lb',      # Duplicate of Feed_Quantity_kg (99.99% correlation)
    'Cattle_ID',             # Unique identifier, no predictive value
    'Rumination_Time_hrs',   # 55% negative values - data quality issue
    'HS_Vaccine',            # Very low correlation (0.000034)
    'BQ_Vaccine',            # Very low correlation (0.000466)
    'BVD_Vaccine',           # Very low correlation (0.000491)
    'Brucellosis_Vaccine',   # Very low correlation (0.002089)
    'FMD_Vaccine',           # Very low correlation (0.002477)
    # 'Resting_Hours',         # Nearly zero correlation (0.001653)
    'Housing_Score',         # Low correlation (0.004) + 3% missing values
    # 'Previous_Week_Avg_Yield',  # Similarity to another col
    # 'Feeding_Frequency',     # No correlation (0.000380)
    # 'Walking_Distance_km',   # No correlation (0.001538)
    # 'Body_Condition_Score',  # No correlation (0.001647)
    # 'Humidity_percent',      # Very low correlation (0.002153)
    # 'Grazing_Duration_hrs',  # Very low correlation (0.004350)
    # 'Milking_Interval_hrs'   # Very low correlation (0.014734)
]

# Drop the listed columns into a new frame; `data` itself is left untouched.
data_cleaned = data.drop(columns=features_to_remove)

# Report the column count before/after (both shapes still include the target).
print(f"Original shape: {data.shape}")
print(f"Cleaned shape: {data_cleaned.shape}")
print(f"Removed {len(features_to_remove)} features")

Original shape: (210000, 36)
Cleaned shape: (210000, 27)
Removed 9 features


In [22]:
# Derive a Season feature from the Date column.
# Seasonal averages of milk yield found during analysis:
#   - Spring: 16.59 L (+6.4% vs average) - BEST season
#   - Winter: 16.12 L (+3.4% vs average)
#   - Fall:   15.70 L (+0.7% vs average)
#   - Summer: 13.94 L (-10.6% vs average) - WORST season (heat stress)
#   - Range: 2.65 L difference between best and worst seasons!

# Parse Date into timestamps and, in the same step, add a temporary Month
# column. `assign` evaluates its keyword arguments in order, so the Month
# callable already sees the parsed Date column.
data_cleaned = data_cleaned.assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Month=lambda frame: frame['Date'].dt.month,
)

# Create Season feature (meteorological seasons)
def get_season(month):
    """Map a calendar month number to its meteorological season.

    Parameters
    ----------
    month : int or float
        Month number in 1..12. Float values such as ``3.0`` are accepted,
        which is what ``Series.dt.month`` produces when the column contains
        missing dates.

    Returns
    -------
    str or None
        ``'Winter'`` (12, 1, 2), ``'Spring'`` (3, 4, 5), ``'Summer'``
        (6, 7, 8) or ``'Fall'`` (9, 10, 11). ``None`` when ``month`` is
        missing (NaN/NaT/None), so a missing date is not silently labelled
        as a season.

    Raises
    ------
    ValueError
        If ``month`` is present but not a valid month number.
    """
    # A missing date must stay missing instead of falling through to 'Fall'.
    if pd.isna(month):
        return None
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    raise ValueError(f"Invalid month: {month!r}")

# Label every row with its season, derived element-wise from the Month column.
data_cleaned['Season'] = data_cleaned['Month'].map(get_season)

# Date and the helper Month column are no longer needed; only Season is kept.
data_cleaned = data_cleaned.drop(['Date', 'Month'], axis=1)

# Summary of the replacement and the resulting class balance.
print("Replaced Date with Season:")
print("  - Season (Winter/Spring/Summer/Fall)")
print("\nSeason distribution:")
print(data_cleaned['Season'].value_counts().sort_index())
print(f"\nFinal shape: {data_cleaned.shape}")

Replaced Date with Season:
  - Season (Winter/Spring/Summer/Fall)

Season distribution:
Season
Fall      52425
Spring    53061
Summer    52663
Winter    51851
Name: count, dtype: int64

Final shape: (210000, 27)


In [23]:
# Re-derive the model inputs and the target from the cleaned frame:
# Milk_Yield_L is the target, every other remaining column is an input.
yields = data_cleaned['Milk_Yield_L']
features = data_cleaned.drop('Milk_Yield_L', axis=1)

print(f"Features shape: {features.shape}")
print(f"Target shape: {yields.shape}")

Features shape: (210000, 26)
Target shape: (210000,)


## Summary of Feature Selection

**Removed 9 features:**
1. Feed_Quantity_lb - duplicate of Feed_Quantity_kg (99.99% correlation)
2. Cattle_ID - unique identifier, no predictive value
3. Rumination_Time_hrs - data quality issue (55% negative values)
4-8. Low-correlation vaccines: HS, BQ, BVD, Brucellosis, FMD
9. Housing_Score - low correlation (0.004) and 3% missing values

**Considered for removal but kept (zero/near-zero correlation):** Resting_Hours, Feeding_Frequency, Walking_Distance_km, Body_Condition_Score, Humidity_percent, Grazing_Duration_hrs, Milking_Interval_hrs (0.015)

**Replaced Date with Season:**
- Removed: Date (raw timestamp)
- Added: Season (Winter/Spring/Summer/Fall)
- Rationale: Strong seasonal effect on milk yield (Spring: 16.59L vs Summer: 13.94L = 2.65L range)
- Month was NOT kept (redundant with Season - only 0.1L variation within seasons)

**Final: 26 features (down from 35 = 26% reduction)**

**Categorical (7):**
- Breed, Climate_Zone, Management_System, Lactation_Stage, Feed_Type, Farm_ID, Season

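**Numeric (19):**
- Age_Months (corr: 0.31), Weight_kg (0.30), Parity (0.24), Days_in_Milk (0.06), Feed_Quantity_kg (0.22), Water_Intake_L (0.12), Ambient_Temperature_C (0.04), Anthrax_Vaccine (0.07), IBR_Vaccine (0.07), Rabies_Vaccine (0.07), Previous_Week_Avg_Yield (0.09), Mastitis (0.12), Resting_Hours (0.002), Feeding_Frequency (0.0004), Walking_Distance_km (0.002), Body_Condition_Score (0.002), Humidity_percent (0.002), Grazing_Duration_hrs (0.004), Milking_Interval_hrs (0.015)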

**Why this works:**
- Removed noisy, low-correlation features
- Kept strong predictors (Age, Weight, Parity, Feed)
- Captured seasonal patterns without overfitting to specific months
- Cleaner data (dropped the corrupted Rumination_Time_hrs column)

In [24]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

# Split the feature columns by dtype: object/string columns are treated as
# categorical, everything else as numeric (original column order is kept).
cat_cols = list(features.select_dtypes(include=["object", "string"]).columns)
print(f"Categorical columns: {cat_cols}")
num_cols = features.columns.drop(cat_cols).tolist()

Categorical columns: ['Breed', 'Climate_Zone', 'Management_System', 'Lactation_Stage', 'Feed_Type', 'Farm_ID', 'Season']


In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace categorical values by a statistic of how often they occur.

    Parameters
    ----------
    cols : list of str or None, default=None
        Columns to encode. ``None`` encodes every column of the frame passed
        to ``fit``.
    mode : {"freq", "count", "logfreq", "smooth"}, default="freq"
        Encoding written for each category:

        - ``"freq"``: occurrences divided by the number of rows in ``X``.
        - ``"count"``: raw number of occurrences.
        - ``"logfreq"``: ``log1p`` of the ``"freq"`` value.
        - ``"smooth"``: ``(count + m * prior) / (n_non_null + m)`` where
          ``prior`` is the non-null fraction of the column.
    m : float, default=5
        Smoothing weight; only used when ``mode="smooth"``.

    Attributes
    ----------
    maps : dict
        Set by ``fit``. Maps each encoded column name to a Series that is
        indexed by category and holds the encoded value.
    """

    _MODES = ("freq", "count", "logfreq", "smooth")

    def __init__(self, cols=None, mode="freq", m=5):
        self.cols = cols
        self.mode = mode
        self.m = m

    def fit(self, X, y=None):
        """Learn the per-category encoding of every selected column.

        ``y`` is accepted for pipeline compatibility and is not used.
        Raises ``ValueError`` when ``mode`` is not one of the supported modes.
        """
        # Validate once up front instead of inside the per-column loop.
        if self.mode not in self._MODES:
            raise ValueError(f"Unknown mode: {self.mode}")

        # cols=None (the constructor default) means "encode every column".
        cols = list(X.columns) if self.cols is None else list(self.cols)
        total = len(X)  # row count, including rows where the column is missing
        self.maps = {}

        for col in cols:
            freq = X[col].value_counts()  # missing values are not counted

            if self.mode == "freq":
                enc = freq / total
            elif self.mode == "count":
                enc = freq
            elif self.mode == "logfreq":
                enc = np.log1p(freq / total)
            else:  # "smooth"
                # NOTE(review): this prior is the non-null fraction of the
                # column (close to 1), not a per-category prior such as
                # 1 / n_categories - confirm that this is intended.
                prior = freq.sum() / total
                enc = (freq + self.m * prior) / (freq.sum() + self.m)

            self.maps[col] = enc

        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column encoded.

        Values that have no learned encoding (categories not seen during
        ``fit`` and missing values) are encoded as 0.
        """
        X = X.copy()
        for col, mapping in self.maps.items():
            X[col] = X[col].map(mapping).fillna(0)
        return X

In [26]:
import pandas as pd
def fill_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Floating-point columns of any width (float64, float32, nullable Float64,
    ...) are filled with the column median. Every other column is filled with
    its most frequent value; on ties the first value returned by
    ``Series.mode`` is used. A column that consists only of missing values is
    left unchanged because it has no median/mode to fill with.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the imputed values.
    """
    df_filled = df.copy()

    for col in df_filled.columns:
        column = df_filled[col]
        if not column.isna().any():
            continue  # nothing to impute in this column

        if pd.api.types.is_float_dtype(column):
            fill_value = column.median()
        else:
            modes = column.mode()
            if modes.empty:
                # Entirely missing column: there is no mode to fill with.
                continue
            fill_value = modes.iloc[0]

        df_filled[col] = column.fillna(fill_value)

    return df_filled

from sklearn.impute import KNNImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pandas as pd

def fill_with_similar_fast(df):
    """Impute missing values with scikit-learn's ``IterativeImputer``.

    The imputer is started from the most frequent value of each column and
    runs for at most 10 rounds with a fixed random seed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. NOTE(review): presumably has to be fully numeric
        (i.e. already encoded) - confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        Imputed values with the same column labels and the same index as
        ``df``.
    """
    imputer = IterativeImputer(
        random_state=42,
        max_iter=10,
        initial_strategy="most_frequent"
    )
    arr = imputer.fit_transform(df)
    # Keep the original index so the rows stay aligned with the target and
    # with any other frame derived from the same data.
    return pd.DataFrame(arr, columns=df.columns, index=df.index)



def one_hot_encode(X, cat_cols):
    """Return a new frame in which each column of ``cat_cols`` is expanded
    into one indicator column per category (no category is dropped).

    ``X`` itself is not modified.
    """
    return pd.get_dummies(X.copy(), columns=cat_cols, drop_first=False)

# Impute first, then expand the categorical columns into indicator columns.
# NOTE: `features` is overwritten, so this cell is not re-runnable on its own:
# after one run the columns named in `cat_cols` no longer exist.
features = one_hot_encode(fill_missing_values(features), cat_cols)
print(features.shape)
# features = fill_with_similar_fast(features)

(210000, 1052)


In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import uniform

def rmse(y_true, y_pred):
    """Root mean squared error between the true and the predicted targets."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# RMSE is an error (lower is better), so greater_is_better=False makes the
# scorer report the negated RMSE; the sign has to be flipped back when the
# search score is read.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

def make_pipeline(use_pca=True, n_components=None):
    """Build the modelling pipeline: scaler -> optional PCA -> MLP regressor.

    Parameters
    ----------
    use_pca : bool, default=True
        Whether a ``"pca"`` step is inserted between scaling and the network.
    n_components : default=None
        Forwarded to ``PCA(n_components=...)``; ignored when ``use_pca`` is
        false.

    Returns
    -------
    Pipeline
        Steps are named ``"scaler"``, ``"pca"`` (optional) and ``"mlp"``.
    """
    scaler_step = ("scaler", StandardScaler())
    mlp_step = (
        "mlp",
        MLPRegressor(max_iter=200, early_stopping=True, tol=1e-3, random_state=42),
    )

    if not use_pca:
        return Pipeline([scaler_step, mlp_step])

    pca_step = ("pca", PCA(n_components=n_components))
    return Pipeline([scaler_step, pca_step, mlp_step])

# Search space for the randomized search. Keys use the `<step>__<param>` form
# and refer to the "pca" and "mlp" steps of the pipeline.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_space = {
    "pca__n_components": [50, 150, 300],
    "mlp__hidden_layer_sizes": [(32,), (64,), (64,32)],
    "mlp__activation": ["relu", "tanh", "logistic"],
    "mlp__alpha": uniform(loc=1e-5, scale=1e-3),  # [1e-5, 1.01e-3]
    "mlp__learning_rate_init": uniform(loc=1e-4, scale=1e-2)  # [1e-4, 1.01e-2]
}

# Candidate model: scaler -> PCA -> MLP; n_components is set by the search.
pipe = make_pipeline(use_pca=True)

# 20 random draws from param_space, each scored with 3-fold CV on all cores.
search = RandomizedSearchCV(
    pipe,
    param_space,
    n_iter=20,
    cv=3,
    scoring=rmse_scorer,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

search.fit(features, yields)

# The scorer reports negated RMSE, hence the sign flip.
print("Best RMSE:", -search.best_score_)
print("Best params:", search.best_params_)


RMSE: 4.203492862909782
RMSE: 4.142938294772711
