In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time

In [2]:
# Load the training set; the relative path is resolved against the current
# working directory.
data = pd.read_csv("cattle_data_train.csv")

# Initial positional split: every column except the first and the last becomes
# a feature, and the last column becomes the target.
# NOTE(review): presumably the first column is an identifier and the last one
# is the milk yield - confirm against the CSV header.
# NOTE: `features` and `yields` are reassigned from `data_cleaned` in a later
# cell, so this split only matters to code that runs before that cell.
features = data.iloc[:, 1:-1]
yields = data.iloc[:, -1]



In [None]:
# Feature pruning
# Drop the columns flagged by the correlation analysis or by data-quality checks.

# Columns to drop, each annotated with the reason it was flagged.
features_to_remove = [
    # Redundant or non-informative columns
    'Feed_Quantity_lb',      # mirrors Feed_Quantity_kg (99.99% correlation)
    'Cattle_ID',             # unique identifier, no predictive value
    # Data-quality problem
    'Rumination_Time_hrs',   # 55% of the values are negative
    # Vaccine flags with very low correlation with the target
    'HS_Vaccine',            # 0.000034
    'BQ_Vaccine',            # 0.000466
    'BVD_Vaccine',           # 0.000491
    'Brucellosis_Vaccine',   # 0.002089
    'FMD_Vaccine',           # 0.002477
    # Other columns with (near-)zero correlation with the target
    'Resting_Hours',         # 0.001653
    'Housing_Score',         # 0.004, plus 3% missing values
    'Feeding_Frequency',     # 0.000380
    'Walking_Distance_km',   # 0.001538
    'Body_Condition_Score',  # 0.001647
    'Humidity_percent'       # 0.002153
]
# 'Date' is deliberately NOT in the list above, so it stays in the frame: the
# raw date adds noise, but temporal features could still be extracted from it.

# Build a new frame without the flagged columns; `data` itself is left untouched.
data_cleaned = data.drop(features_to_remove, axis=1)

# Report the effect of the pruning.
print(
    f"Original shape: {data.shape}",
    f"Cleaned shape: {data_cleaned.shape}",
    f"Removed {len(features_to_remove)} features",
    f"\nRemaining columns: {data_cleaned.columns.tolist()}",
    sep="\n",
)

In [None]:
# Re-derive the target and the model inputs from the pruned frame.
yields = data_cleaned['Milk_Yield_L']
features = data_cleaned.drop('Milk_Yield_L', axis=1)

# Show both shapes so the split can be checked at a glance.
print(
    f"Features shape: {features.shape}",
    f"Target shape: {yields.shape}",
    sep="\n",
)

## Summary of Feature Selection

**Removed 14 features:**
1. Feed_Quantity_lb - duplicate of Feed_Quantity_kg
2. Cattle_ID - unique identifier
3. Rumination_Time_hrs - data quality issue (55% negative values)
4-8. Low-correlation vaccines: HS, BQ, BVD, Brucellosis, FMD
9-14. Zero/near-zero correlation: Resting_Hours, Housing_Score, Feeding_Frequency, Walking_Distance_km, Body_Condition_Score, Humidity_percent

**Kept 21 features:**
- **Categorical (7)**: Breed, Climate_Zone, Management_System, Lactation_Stage, Feed_Type, Farm_ID, Date
- **Numeric (14)**: Age_Months, Weight_kg, Parity, Days_in_Milk, Feed_Quantity_kg, Water_Intake_L, Grazing_Duration_hrs, Ambient_Temperature_C, Anthrax_Vaccine, IBR_Vaccine, Rabies_Vaccine, Previous_Week_Avg_Yield, Milking_Interval_hrs, Mastitis

**Note**: Farm_ID could potentially be removed if you want to generalize across farms, but it may capture farm-specific management practices.

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

# Recompute the list of text-typed (object / string dtype) feature columns
# from the current `features` frame.
text_typed = features.select_dtypes(include=["object", "string"])
cat_cols = list(text_typed.columns)
print(f"Categorical columns: {cat_cols}")

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Encode columns by how often each value occurs in the fitting data.

    Parameters
    ----------
    cols : list of column labels, or None, default=None
        Columns to encode. When None, every column of the frame passed to
        ``fit`` whose dtype is object or string is encoded.
    mode : {"freq", "count", "logfreq", "smooth"}, default="freq"
        Statistic that replaces each value, where ``count`` is the number of
        fit rows holding the value, ``total`` is the number of fit rows and
        ``n_present`` is the number of fit rows where the column is not
        missing:

        - ``"freq"``: ``count / total``
        - ``"count"``: ``count``
        - ``"logfreq"``: ``log1p(count / total)``
        - ``"smooth"``: ``(count + m * prior) / (n_present + m)`` with
          ``prior = n_present / total``
    m : number, default=5
        Smoothing weight; only used by ``mode="smooth"``.

    Attributes
    ----------
    maps : dict
        Set by ``fit``. Maps each encoded column label to a pandas Series
        indexed by the values seen during ``fit``.
    """

    def __init__(self, cols=None, mode="freq", m=5):
        # Parameters are stored untouched; all validation happens in ``fit``.
        self.cols = cols
        self.mode = mode
        self.m = m

    def _resolve_cols(self, X):
        """Return the labels to encode, defaulting to text-typed columns."""
        if self.cols is None:
            # Without this fallback the default constructor argument would
            # fail with "'NoneType' object is not iterable".
            return X.select_dtypes(include=["object", "string"]).columns.tolist()
        return list(self.cols)

    def _encode_counts(self, counts, total):
        """Turn per-value counts of one column into the configured statistic."""
        if self.mode == "freq":
            return counts / total
        if self.mode == "count":
            return counts
        if self.mode == "logfreq":
            return np.log1p(counts / total)
        if self.mode == "smooth":
            # ``counts`` comes from value_counts(), which skips missing values,
            # so its sum is the number of non-missing rows.
            n_present = counts.sum()
            prior = n_present / total
            return (counts + self.m * prior) / (n_present + self.m)
        # f-string so that a non-string mode still yields a ValueError.
        raise ValueError(f"Unknown mode: {self.mode}")

    def fit(self, X, y=None):
        """Learn the value -> statistic mapping of every encoded column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns to encode. It is not modified.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``mode`` is not one of the supported modes (raised while
            encoding the first column).
        KeyError
            If a label in ``cols`` is not a column of ``X``.
        """
        total = len(X)  # includes rows where a column is missing
        self.maps = {}

        for col in self._resolve_cols(X):
            self.maps[col] = self._encode_counts(X[col].value_counts(), total)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by statistics.

        Values that were not seen during ``fit`` (missing values included) are
        encoded as 0.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column that was encoded during ``fit``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X``; the input frame is left untouched.
        """
        X = X.copy()
        # Iterate over the fitted mapping rather than ``self.cols``: this works
        # when ``cols`` is None and encodes a duplicated label only once.
        for col, mapping in self.maps.items():
            X[col] = X[col].map(mapping).fillna(0)
        return X