In [29]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time

In [30]:
data = pd.read_csv("cattle_data_train.csv")

features = data.iloc[:, 1:-1]
yields = data.iloc[:, -1]



In [31]:
# Feature Removal and Preprocessing
# Based on correlation analysis and data quality issues

# Features to remove:
features_to_remove = [
    'Feed_Quantity_lb',      # Duplicate of Feed_Quantity_kg (99.99% correlation)
    'Cattle_ID',             # Unique identifier, no predictive value
    'Rumination_Time_hrs',   # 55% negative values - data quality issue
    'HS_Vaccine',            # Very low correlation (0.000034)
    'BQ_Vaccine',            # Very low correlation (0.000466)
    'BVD_Vaccine',           # Very low correlation (0.000491)
    'Brucellosis_Vaccine',   # Very low correlation (0.002089)
    'FMD_Vaccine',           # Very low correlation (0.002477)
    'Anthrax_Vaccine', 
    'IBR_Vaccine', 
    'Rabies_Vaccine',
    'Walking_Distance_km',
    'Climate_Zone',
    'Farm_ID',
    'Resting_Hours',         # Nearly zero correlation (0.001653)
    'Housing_Score',         # Low correlation (0.004) + 3% missing values
    'Feeding_Frequency',     # No correlation (0.000380)
    'Body_Condition_Score',  # No correlation (0.001647)
    'Humidity_percent',      # Very low correlation (0.002153)
    'Grazing_Duration_hrs',  # Very low correlation (0.004350)
    'Milking_Interval_hrs'   # Very low correlation (0.014734)
]

# Remove features
data_cleaned = data.drop(columns=features_to_remove)

print(f"Original shape: {data.shape}")
print(f"Cleaned shape: {data_cleaned.shape}")
print(f"Removed {len(features_to_remove)} features")

Original shape: (210000, 36)
Cleaned shape: (210000, 15)
Removed 21 features


In [32]:
# Extract Season from Date column
# Analysis shows seasons have strong effect on milk yield:
#   - Spring: 16.59 L (+6.4% vs average) - BEST season
#   - Winter: 16.12 L (+3.4% vs average)
#   - Fall:   15.70 L (+0.7% vs average)
#   - Summer: 13.94 L (-10.6% vs average) - WORST season (heat stress)
#   - Range: 2.65 L difference between best and worst seasons!

# Convert Date to datetime
data_cleaned['Date'] = pd.to_datetime(data_cleaned['Date'])

# Extract month temporarily to create seasons
data_cleaned['Month'] = data_cleaned['Date'].dt.month

# Create Season feature (meteorological seasons)
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:  # 9, 10, 11
        return 'Fall'

data_cleaned['Season'] = data_cleaned['Month'].apply(get_season)

# Drop both Date and Month (we only keep Season)
data_cleaned = data_cleaned.drop(columns=['Date', 'Month'])

print("Replaced Date with Season:")
print("  - Season (Winter/Spring/Summer/Fall)")
print(f"\nSeason distribution:")
print(data_cleaned['Season'].value_counts().sort_index())
print(f"\nFinal shape: {data_cleaned.shape}")

Replaced Date with Season:
  - Season (Winter/Spring/Summer/Fall)

Season distribution:
Season
Fall      52425
Spring    53061
Summer    52663
Winter    51851
Name: count, dtype: int64

Final shape: (210000, 15)


In [33]:
# Update features and target using cleaned data
features = data_cleaned.drop(columns=['Milk_Yield_L'])
yields = data_cleaned['Milk_Yield_L']

print(f"Features shape: {features.shape}")
print(f"Target shape: {yields.shape}")

Features shape: (210000, 14)
Target shape: (210000,)


## Summary of Feature Selection

**Removed 16 features:**
1. Feed_Quantity_lb - duplicate of Feed_Quantity_kg (99.99% correlation)
2. Cattle_ID - unique identifier, no predictive value
3. Rumination_Time_hrs - data quality issue (55% negative values)
4-8. Low-correlation vaccines: HS, BQ, BVD, Brucellosis, FMD
9-15. Zero/near-zero correlation: Resting_Hours, Housing_Score, Feeding_Frequency, Walking_Distance_km, Body_Condition_Score, Humidity_percent, Grazing_Duration_hrs
16. Milking_Interval_hrs - very low correlation (0.015)

**Replaced Date with Season:**
- Removed: Date (raw timestamp)
- Added: Season (Winter/Spring/Summer/Fall)
- Rationale: Strong seasonal effect on milk yield (Spring: 16.59L vs Summer: 13.94L = 2.65L range)
- Month was NOT kept (redundant with Season - only 0.1L variation within seasons)

**Final: 19 features (down from 35 = 46% reduction)**

**Categorical (7):**
- Breed, Climate_Zone, Management_System, Lactation_Stage, Feed_Type, Farm_ID, Season

**Numeric (12):**
- Age_Months (corr: 0.31), Weight_kg (0.30), Parity (0.24), Days_in_Milk (0.06), Feed_Quantity_kg (0.22), Water_Intake_L (0.12), Ambient_Temperature_C (0.04), Anthrax_Vaccine (0.07), IBR_Vaccine (0.07), Rabies_Vaccine (0.07), Previous_Week_Avg_Yield (0.09), Mastitis (0.12)

**Why this works:**
- Removed noisy, low-correlation features
- Kept strong predictors (Age, Weight, Parity, Feed)
- Captured seasonal patterns without overfitting to specific months
- Cleaner data (fixed Rumination_Time_hrs corruption)

In [34]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

# Update categorical columns from cleaned data
cat_cols = features.select_dtypes(include=["object", "string"]).columns.tolist()
print(f"Categorical columns: {cat_cols}")

Categorical columns: ['Breed', 'Management_System', 'Lactation_Stage', 'Feed_Type', 'Season']


In [35]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None, mode="freq", m=5):
        self.cols = cols
        self.mode = mode
        self.m = m

    def fit(self, X, y=None):
        X = X.copy()
        self.maps = {}

        for col in self.cols:
            freq = X[col].value_counts()
            total = len(X)

            if self.mode == "freq":
                enc = freq / total
            elif self.mode == "count":
                enc = freq
            elif self.mode == "logfreq":
                enc = np.log1p(freq / total)
            elif self.mode == "smooth":
                prior = freq.sum() / total
                enc = (freq + self.m * prior) / (freq.sum() + self.m)
            else:
                raise ValueError("Unknown mode: " + self.mode)

            self.maps[col] = enc

        return self

    def transform(self, X):
        X = X.copy()
        for col in self.cols:
            X[col] = X[col].map(self.maps[col]).fillna(0)
        return X

In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.impute import SimpleImputer
from scipy.stats import uniform

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmse, greater_is_better=False)

num_cols = features.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = features.select_dtypes(include=["object", "category"]).columns.tolist()

num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols)
])

pipe = Pipeline([
    ("preprocess", preprocessor),
    ("pca", PCA()),
    ("mlp", MLPRegressor(max_iter=200, early_stopping=True, tol=1e-3, random_state=42))
])

param_distributions = {
    "pca__n_components": [15, 20, 25],
    "mlp__hidden_layer_sizes": [(32,), (64,), (64,32)],
    "mlp__activation": ["relu", "tanh"],
    "mlp__alpha": uniform(loc=1e-5, scale=1e-3),
    "mlp__learning_rate_init": uniform(loc=1e-4, scale=1e-2)
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=10,
    scoring=rmse_scorer,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, yields, test_size=0.2, random_state=42
)

search.fit(X_train, y_train)

for i, params in enumerate(search.cv_results_['params']):
    fold_scores = search.cv_results_['split0_test_score'][i], search.cv_results_['split1_test_score'][i], \
                  search.cv_results_['split2_test_score'][i], search.cv_results_['split3_test_score'][i], \
                  search.cv_results_['split4_test_score'][i]
    fold_scores = [-s for s in fold_scores]  # convert to positive RMSE
    print(f"Params: {params}")
    print(f"RMSE for each fold: {fold_scores}")
    print(f"Mean RMSE: {np.mean(fold_scores)}\n")

print("Best RMSE:", -search.best_score_)
print("Best params:", search.best_params_)

print("Best RMSE:", -search.best_score_)
print("Best params:", search.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Params: {'mlp__activation': 'relu', 'mlp__alpha': 0.0008065429868602329, 'mlp__hidden_layer_sizes': (64, 32), 'mlp__learning_rate_init': 0.007419939418114052, 'pca__n_components': 15}
RMSE for each fold: [4.173784891283524, 4.165428799238827, 4.178452504586921, 4.181477895758046, 4.1837574221060505]
Mean RMSE: 4.176580302594674

Params: {'mlp__activation': 'relu', 'mlp__alpha': 0.00016601864044243653, 'mlp__hidden_layer_sizes': (64, 32), 'mlp__learning_rate_init': 0.0010997491581800289, 'pca__n_components': 25}
RMSE for each fold: [4.197304631618844, 4.219727144643255, 4.20507239570264, 4.232050734069129, 4.2059231030443245]
Mean RMSE: 4.212015601815638

Params: {'mlp__activation': 'tanh', 'mlp__alpha': 0.00034370861113902185, 'mlp__hidden_layer_sizes': (64, 32), 'mlp__learning_rate_init': 0.0003058449429580245, 'pca__n_components': 20}
RMSE for each fold: [4.199397589908928, 4.190496047333664, 4.180258300721461, 4.1780624260

In [37]:
import joblib

best_params = search.best_params_

final_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("pca", PCA(n_components=best_params["pca__n_components"])),
    ("mlp", MLPRegressor(
        hidden_layer_sizes=best_params["mlp__hidden_layer_sizes"],
        activation=best_params["mlp__activation"],
        alpha=best_params["mlp__alpha"],
        learning_rate_init=best_params["mlp__learning_rate_init"],
        early_stopping=True,
        max_iter=200,
        random_state=42
    ))
])

final_pipe.fit(X_train, y_train)

y_pred = final_pipe.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

joblib.dump(final_pipe, "no_farm_mlp_cattle_model.pkl")

RMSE: 4.191633002956586


['no_farm_mlp_cattle_model.pkl']

In [38]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
import joblib

num_cols = features.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = features.select_dtypes(include=["object", "category"]).columns.tolist()

num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols)
])

best_params = search.best_params_

pipe = Pipeline([
    ("preprocess", preprocessor),
    ("pca", PCA(n_components=best_params["pca__n_components"])),
    ("mlp", MLPRegressor(
        hidden_layer_sizes=best_params["mlp__hidden_layer_sizes"],
        activation=best_params["mlp__activation"],
        alpha=best_params["mlp__alpha"],
        learning_rate_init=best_params["mlp__learning_rate_init"],
        early_stopping=True,
        max_iter=200,
        random_state=42
    ))
])

pipe.fit(features, yields)

joblib.dump(pipe, "mlp_cattle_model_final2.pkl")
print("Saved mlp_cattle_model_final2.pkl")


Saved mlp_cattle_model_final2.pkl


In [39]:
test_data = pd.read_csv("cattle_data_test.csv")
test_data_cleaned = test_data.drop(columns=features_to_remove)

test_data_cleaned['Date'] = pd.to_datetime(test_data_cleaned['Date'])
test_data_cleaned['Month'] = test_data_cleaned['Date'].dt.month
test_data_cleaned['Season'] = test_data_cleaned['Month'].apply(get_season)
test_data_cleaned = test_data_cleaned.drop(columns=['Date', 'Month'])

test_features = test_data_cleaned

print(test_features.shape)
# test_features.head()


(40000, 14)


In [40]:
import joblib
import pandas as pd

pipeline = joblib.load("mlp_cattle_model_final2.pkl")

predictions = pipeline.predict(test_features)

output = pd.DataFrame({
    "Cattle_ID": test_data["Cattle_ID"],
    "Milk_Yield_L": predictions
})

output.to_csv("milk_yield_predictions4.csv", index=False)
print("Predictions saved to milk_yield_predictions4.csv")


Predictions saved to milk_yield_predictions4.csv
