In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [30]:
# Load the raw cost table.
# NOTE(review): the absolute /content/ path looks Colab-specific — adjust it when running elsewhere.
df = pd.read_csv("/content/Cost_data.csv")

In [31]:
# Peek at the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,Time_Period,Crop_type,State,Cost,Cost_of_Production,Operational_Cost,Fixed_Cost
0,2021_22,Paddy,Maharashtra,97597.0,4376,81228,26184
1,2021_22,Maize,Maharashtra,92343.0,1428,66158,26184
2,2021_22,Wheat,Maharashtra,69000.0,2814,53559,15360
3,2021_22,Jowar,Maharashtra,51536.0,3714,40158,11378
4,2021_22,Bajra,Maharashtra,72926.0,2696,58926,14000


In [32]:
# Order the rows by crop so each crop's records sit together; sort_values returns a new frame, df is unchanged.
sorted_df = df.sort_values(by="Crop_type")

In [33]:
# Preview the first rows of the crop-sorted table.
sorted_df.head()

Unnamed: 0,Time_Period,Crop_type,State,Cost,Cost_of_Production,Operational_Cost,Fixed_Cost
24,2019_20,Bajra,Maharashtra,72926.0,2145,42362,15130
44,2017_18,Bajra,Maharashtra,48500.0,2114,34787,13698
34,2018_19,Bajra,Maharashtra,61070.0,1680,44632,16380
4,2021_22,Bajra,Maharashtra,72926.0,2696,58926,14000
14,2020_21,Bajra,Maharashtra,62228.0,2475,49475,12752


In [34]:
# (rows, columns) of the crop-sorted table.
sorted_df.shape

(50, 7)

In [35]:
# Remove pandas' row-display limit so the next cell can show every row.
# This is a global option and stays in effect for the rest of the session.
pd.set_option('display.max_rows', None)

In [36]:
# Display the whole crop-sorted table (row limit was lifted in the previous cell).
sorted_df

Unnamed: 0,Time_Period,Crop_type,State,Cost,Cost_of_Production,Operational_Cost,Fixed_Cost
24,2019_20,Bajra,Maharashtra,72926.0,2145,42362,15130
44,2017_18,Bajra,Maharashtra,48500.0,2114,34787,13698
34,2018_19,Bajra,Maharashtra,61070.0,1680,44632,16380
4,2021_22,Bajra,Maharashtra,72926.0,2696,58926,14000
14,2020_21,Bajra,Maharashtra,62228.0,2475,49475,12752
39,2018_19,Cotton,Maharashtra,89570.0,6155,62408,22333
29,2019_20,Cotton,Maharashtra,84103.0,6126,62831,18368
19,2020_21,Cotton,Maharashtra,80900.0,6445,63840,79372
9,2021_22,Cotton,Maharashtra,98962.0,8450,76303,18406
49,2017_18,Cotton,Maharashtra,82930.0,3495,63275,19657


In [37]:
# Count missing values per column to see what needs imputing.
sorted_df.isna().sum()

Unnamed: 0,0
Time_Period,0
Crop_type,0
State,0
Cost,1
Cost_of_Production,0
Operational_Cost,0
Fixed_Cost,0


In [38]:
# Fill each missing Cost with the mean Cost of the same crop type.
cost_by_crop = sorted_df.groupby('Crop_type')['Cost']
sorted_df['Cost'] = cost_by_crop.transform(lambda crop_costs: crop_costs.fillna(crop_costs.mean()))

In [39]:
# Data Preprocessing
def preprocess_data(df):
    """Build the model inputs from the cost table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns 'Time_Period', 'Crop_type', 'Cost',
        'Cost_of_Production', 'Operational_Cost' and 'Fixed_Cost'.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix with the columns 'Year', 'Crop_type_encoded',
        'Cost_of_Production', 'Operational_Cost', 'Fixed_Cost' (in that order).
    y : pandas.Series
        The 'Cost' column, used as the regression target.
    le : LabelEncoder
        Encoder fitted on 'Crop_type'; reuse it to encode crop names at
        prediction time.
    """
    # Work on a copy so the caller's frame does not silently gain helper columns.
    prepared = df.copy()

    # Time_Period is split on '_' and the leading part is kept as the integer year
    # (e.g. "2021_22" -> 2021).
    prepared['Year'] = prepared['Time_Period'].str.split('_', n=1).str[0].astype('int64')

    # Encode categorical variables
    le = LabelEncoder()
    prepared['Crop_type_encoded'] = le.fit_transform(prepared['Crop_type'])

    # Create feature matrix X and target variable y
    feature_columns = ['Year', 'Crop_type_encoded', "Cost_of_Production", "Operational_Cost", "Fixed_Cost"]
    X = prepared[feature_columns]
    y = prepared['Cost']

    return X, y, le

In [40]:
# Build the feature matrix, the Cost target, and the fitted crop-name encoder from the sorted table.
X, y, label_encoder = preprocess_data(sorted_df)

In [41]:
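# Hold-out split (currently disabled): the model below is fitted on every row,
# so the R² it reports is an in-sample score. Re-enable this line to evaluate on unseen data.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)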

In [47]:
def Hypertuning(train_inputs, train_targets, cv=5, random_state=42):
    """Grid-search RandomForestRegressor hyperparameters and return the best set.

    Parameters
    ----------
    train_inputs : array-like
        Feature matrix used for the cross-validated search.
    train_targets : array-like
        Regression target aligned with ``train_inputs``.
    cv : int or cross-validation splitter, default 5
        An integer is turned into a shuffled ``KFold`` with that many splits;
        any other value is passed to ``GridSearchCV`` unchanged.
    random_state : int or None, default 42
        Seed for both the forest and the fold shuffling, so repeated runs
        select the same parameters.

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search (lowest mean squared error).
    """
    params = {
        'n_estimators': [75, 100],
        'max_depth': [None, 1, 2, 3],
        'min_samples_split': [2, 3, 4, 5]
    }

    # Seed the estimator; an unseeded forest can pick a different winner on every run.
    rf_regressor = RandomForestRegressor(random_state=random_state)

    # A bare integer makes GridSearchCV use KFold without shuffling. This notebook
    # sorts its rows by Crop_type, so contiguous folds would validate on crops that
    # are absent from the matching training split. Shuffle the folds instead.
    if isinstance(cv, int):
        cv = KFold(n_splits=cv, shuffle=True, random_state=random_state)

    rf_grid = GridSearchCV(
        estimator=rf_regressor,
        param_grid=params,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )

    rf_grid.fit(train_inputs, train_targets)

    return rf_grid.best_params_

In [48]:
def train_and_evaluate_model(X, y, random_state=369):
    """Tune, fit, and score a random-forest regressor on the full dataset.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Regression target aligned with ``X``.
    random_state : int, default 369
        Seed for the final RandomForestRegressor.

    Returns
    -------
    rf_model : RandomForestRegressor
        Model fitted on the standardised features.
    scaler : StandardScaler
        Scaler fitted on ``X``; apply it to new rows before predicting.
    train_r2 : float
        R² computed on the same rows the model was fitted on. This is an
        in-sample score, not an estimate of performance on unseen data.
    """
    # Perform hyperparameter tuning
    best_params = Hypertuning(X, y)
    print(f"Best parameters: {best_params}")

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X)

    # Train model using best parameters
    rf_model = RandomForestRegressor(**best_params, random_state=random_state)
    rf_model.fit(X_train_scaled, y)

    # Score the fit on the training rows (no hold-out set is used here).
    y_train_pred = rf_model.predict(X_train_scaled)
    train_r2 = r2_score(y, y_train_pred)

    return rf_model, scaler, train_r2

In [50]:
# Tune and fit on all rows; r2 is the R² measured on those same training rows.
model, scaler, r2 = train_and_evaluate_model(X, y)

Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 75}


In [54]:
def analyze_feature_importance(model, feature_names):
    """Rank features by the importance scores stored on a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str
        Names matching the order of ``model.feature_importances_``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'Importance', sorted from most to least important.
    """
    ranking = pd.DataFrame(
        {
            'Feature': feature_names,
            'Importance': model.feature_importances_,
        }
    )
    return ranking.sort_values(by='Importance', ascending=False)

In [55]:
def predict_crop_cost(model, scaler, le, year, crop_type, Cost_of_Production, Operational_Cost, Fixed_Cost):
    """Predict the cost for a single crop/year record.

    Parameters
    ----------
    model : object
        Fitted regressor exposing ``predict``.
    scaler : object
        Fitted scaler exposing ``transform``; applied before prediction.
    le : object
        Fitted label encoder exposing ``transform``; maps the crop name to its code.
    year, Cost_of_Production, Operational_Cost, Fixed_Cost : number or numeric str
        Feature values; each is converted with ``float`` so text typed at a
        prompt (e.g. "2019") is accepted.
    crop_type : str
        Crop name; must be one the encoder was fitted on.

    Returns
    -------
    float
        The model's prediction for the single input row.

    Raises
    ------
    ValueError
        If a numeric field cannot be converted to float. The encoder may also
        raise for an unknown crop name.
    """
    crop_encoded = le.transform([crop_type])[0]

    # Coerce every field explicitly: mixing a string year with numbers would make
    # np.array build a string-dtype array, which the scaler cannot treat as numeric.
    # Feature order matches the columns built in preprocess_data.
    row = [
        float(year),
        float(crop_encoded),
        float(Cost_of_Production),
        float(Operational_Cost),
        float(Fixed_Cost),
    ]
    features = np.array([row], dtype=float)

    features_scaled = scaler.transform(features)
    prediction = model.predict(features_scaled)[0]

    return prediction

In [58]:
# input() always returns text, so convert each numeric field up front.
# The year is cast to int to match the integer 'Year' feature used in training.
year = int(input("Enter the year: "))
# Strip stray whitespace so the name matches the labels the encoder was fitted on.
crop_type = input("Enter the crop type: ").strip()
Cost_of_Production = float(input("Enter the cost of production: "))
Operational_Cost = float(input("Enter the operational cost: "))
Fixed_Cost = float(input("Enter the fixed cost: "))

Enter the year: 2019
Enter the crop type: Paddy
Enter the cost of production: 3268
Enter the operational cost: 74000
Enter the fixed cost: 14565


In [59]:
# Run the fitted pipeline on the values entered above and report the estimate.
prediction = predict_crop_cost(
    model,
    scaler,
    label_encoder,
    year,
    crop_type,
    Cost_of_Production,
    Operational_Cost,
    Fixed_Cost,
)

print(f"\nPredicted cost for {crop_type} in {year}: ₹{prediction:.2f}")


Predicted cost for Paddy in 2019: ₹89540.73
