In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import joblib

In [2]:
d=pd.read_csv('supply_chain.csv')

In [3]:
d.head()

Unnamed: 0,Farm_ID,Crop_Type,Planting_Date,Harvest_Date,Crop_Yield,Soil_Nitrogen,Soil_Phosphorus,Soil_Potassium,Fertilizer_Application,Market_ID,...,Storage_Capacity,Supply_Delay,Supply_Reliability,Fuel_Price,Economic_Index,Weather_Conditions,Crop_Suitability,Profit_Margin,Demand_Forecast,Market_Conditions
0,Farm_0,Wheat,2022-09-08,2022-06-02,433.120629,1.744877,0.788684,2.624458,High,Market_C,...,4273.209959,9.364963,0.381843,2.528511,0.525406,Sunny,High,70.995972,695.551245,Stable
1,Farm_1,Rice,2022-04-22,2022-08-19,171.645462,0.842294,0.534043,1.344645,Moderate,Market_D,...,7004.902069,0.360879,0.249128,1.629674,1.545662,Sunny,Low,26.66801,147.828407,Stable
2,Farm_2,Wheat,2022-10-12,2022-09-22,280.387939,0.313875,1.378451,2.943863,Moderate,Market_B,...,8324.904882,1.586866,0.075848,3.097474,0.650862,Rainy,Medium,57.249056,471.582811,Stable
3,Farm_3,Carrot,2022-10-16,2022-12-16,450.311426,0.256435,1.188903,2.284497,High,Market_D,...,8941.633242,4.006723,0.244438,4.575228,1.36285,Sunny,Low,67.382109,783.359511,Volatile
4,Farm_4,Soybean,2022-09-16,2022-06-21,119.50413,1.288418,1.386113,2.394433,,Market_D,...,1296.127019,3.671201,0.498452,2.671725,1.832056,Cloudy,Medium,43.168408,381.449613,Stable


In [4]:
df=d

In [5]:
df['Planting_Date'] = pd.to_datetime(df['Planting_Date'])
df['Harvest_Date'] = pd.to_datetime(df['Harvest_Date'])

# Feature Engineering: Calculate growing season duration
df['Growing_Season_Duration'] = (df['Harvest_Date'] - df['Planting_Date']).dt.days

In [6]:
df = df.drop(columns=['Planting_Date', 'Harvest_Date'])


In [7]:
df=df.dropna()

In [8]:
# Interaction Feature: Transport Cost per Kilometer
df['Transport_Cost_per_km'] = df['Transport_Cost'] / df['Distance']

# Interaction Feature: Estimated Revenue
df['Revenue_Estimate'] = df['Crop_Yield'] * df['Market_Price']


In [9]:
# Target Encoding for Crop_Type
df['Crop_Type_Target_Enc'] = df.groupby('Crop_Type')['Profit_Margin'].transform('mean')

# Target Encoding for Market_ID
df['Market_ID_Target_Enc'] = df.groupby('Market_ID')['Profit_Margin'].transform('mean')


In [10]:
# Calculate the mean and standard deviation of the Transport_Cost
mean_transport_cost = df['Transport_Cost'].mean()
std_transport_cost = df['Transport_Cost'].std()

# Define a threshold for detecting anomalies (e.g., 3 standard deviations from the mean)
threshold = 3 * std_transport_cost

# Anomaly Detection: Flag for high transport cost
df['High_Transport_Cost_Flag'] = np.where(df['Transport_Cost'] > mean_transport_cost + threshold, 1, 0)

# Remove outliers based on the flag
df = df[df['High_Transport_Cost_Flag'] == 0].copy()

# Drop the flag column if no longer needed
df = df.drop(columns=['High_Transport_Cost_Flag'])


In [11]:
from scipy import stats

# Calculate Z-scores for each numeric feature
z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))

# Identify outliers (where Z-score > threshold, e.g., 3)
outliers = (z_scores > 3).any(axis=1)

# Remove outliers
df = df[~outliers].copy()


In [12]:
X = df.drop(columns=['Profit_Margin','Farm_ID','Market_ID','Route_ID','Crop_Type'])
y = df['Profit_Margin']


In [13]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()


In [14]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [15]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [17]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=150, random_state=42))
])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
pipeline.fit(X_train, y_train)

In [20]:
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

In [21]:
print(f"Mean Squared Error: {mse}")
print("Feature Importances:", pipeline.named_steps['model'].feature_importances_)

Mean Squared Error: 688.5930293441199
Feature Importances: [0.03931334 0.04606677 0.04532752 0.04583428 0.04526935 0.04156923
 0.0456312  0.04010461 0.04074572 0.04544048 0.04571049 0.04546205
 0.04520742 0.04566283 0.04551043 0.04588011 0.04576963 0.04584934
 0.04367128 0.0370765  0.03708471 0.01616331 0.01139721 0.00406431
 0.00400601 0.00408936 0.00398735 0.00418086 0.00400506 0.00400439
 0.00419705 0.00408205 0.00402766 0.00402879 0.00395681 0.00281635
 0.00280617]


In [23]:
joblib.dump(pipeline, 'pipeline_model.joblib')

['pipeline_model.joblib']