In [None]:
import gdown
import pandas as pd
# Google Drive identifier of the crop dataset and the local file name it is saved under
file_id = '1MCmntA3BOquC7BWxLcYDItohDgTPRKx8'
output = 'Crop_Data_Final.csv'
url = 'https://drive.google.com/uc?id=' + file_id

# Download the CSV (progress shown), then load the downloaded copy into a DataFrame
gdown.download(url, output, quiet=False)
df = pd.read_csv(output)

Downloading...
From: https://drive.google.com/uc?id=1MCmntA3BOquC7BWxLcYDItohDgTPRKx8
To: /content/Crop_Data_Final.csv
100%|██████████| 214k/214k [00:00<00:00, 4.22MB/s]


In [None]:
# Display the column labels of the downloaded dataset
df.columns

Index(['Year', 'Dist Name', 'RICE AREA (1000 ha)',
       'RICE PRODUCTION (1000 tons)', 'RICE YIELD (Kg per ha)',
       'WHEAT AREA (1000 ha)', 'WHEAT PRODUCTION (1000 tons)',
       'WHEAT YIELD (Kg per ha)', 'SORGHUM AREA (1000 ha)',
       'SORGHUM PRODUCTION (1000 tons)', 'SORGHUM YIELD (Kg per ha)',
       'PEARL MILLET AREA (1000 ha)', 'PEARL MILLET PRODUCTION (1000 tons)',
       'PEARL MILLET YIELD (Kg per ha)', 'MAIZE AREA (1000 ha)',
       'MAIZE PRODUCTION (1000 tons)', 'MAIZE YIELD (Kg per ha)',
       'CHICKPEA AREA (1000 ha)', 'CHICKPEA PRODUCTION (1000 tons)',
       'CHICKPEA YIELD (Kg per ha)', 'PIGEONPEA AREA (1000 ha)',
       'PIGEONPEA PRODUCTION (1000 tons)', 'PIGEONPEA YIELD (Kg per ha)',
       'MINOR PULSES AREA (1000 ha)', 'MINOR PULSES PRODUCTION (1000 tons)',
       'MINOR PULSES YIELD (Kg per ha)', 'GROUNDNUT AREA (1000 ha)',
       'GROUNDNUT PRODUCTION (1000 tons)', 'GROUNDNUT YIELD (Kg per ha)',
       'SESAMUM AREA (1000 ha)', 'SESAMUM PRODUCTION (10

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

# Read the crop dataset from its downloaded location
file_path = '/content/Crop_Data_Final.csv'
data = pd.read_csv(file_path)

# Forecast horizon: every year from 2017 through 2023, then 2025 and five-year steps up to 2050
future_years = [*range(2017, 2024), 2025, 2030, 2035, 2040, 2045, 2050]

# Distinct district names, in order of first appearance in the dataset
districts = data['Dist Name'].unique()

# Target columns grouped by crop / category.
# The field crops all expose the same AREA / PRODUCTION / YIELD triple, so those
# entries are generated from the crop name (upper-cased to match the CSV headers).
crop_columns = {
    crop: [
        f'{crop.upper()} AREA (1000 ha)',
        f'{crop.upper()} PRODUCTION (1000 tons)',
        f'{crop.upper()} YIELD (Kg per ha)',
    ]
    for crop in (
        'Rice', 'Wheat', 'Sorghum', 'Pearl Millet', 'Maize', 'Chickpea', 'Pigeonpea',
        'Minor Pulses', 'Groundnut', 'Sesamum', 'Oilseeds', 'Sugarcane', 'Cotton',
    )
}

# Groups that do not follow the three-measure pattern are spelled out explicitly
crop_columns.update({
    'Fruits and Vegetables': ['FRUITS AND VEGETABLES AREA (1000 ha)'],
    'Fertilizers': [
        'NITROGEN SHARE IN NPK (Percent)',
        'PHOSPHATE SHARE IN NPK (Percent)',
        'POTASH SHARE IN NPK (Percent)',
    ],
    'Soil Nutrients': [
        'NITROGEN PER HA OF NCA (Kg per ha)',
        'NITROGEN PER HA OF GCA (Kg per ha)',
        'PHOSPHATE PER HA OF NCA (Kg per ha)',
        'PHOSPHATE PER HA OF GCA (Kg per ha)',
        'POTASH PER HA OF NCA (Kg per ha)',
        'POTASH PER HA OF GCA (Kg per ha)',
    ],
    'Weather': [
        'Min Temp (Centigrate)',
        'Max Temp (Centigrate)',
        'Precipitation (mm)',
        'Irrigated Area',
        'Annual Rainfall',
    ],
})

# Candidate values for each hyperparameter explored during tuning
param_distributions = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 3],
    subsample=[0.8, 1.0],
)

# Function to calculate additional evaluation metrics (MAE, MAPE, RMSE)
def calculate_metrics(y_true, y_pred):
    """Score predictions against observed values.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    tuple
        ``(mse, r2, mae, mape, rmse)``. ``mape`` is expressed in percent and is
        averaged only over samples whose observed value is non-zero; it is NaN
        when every observed value is zero.
    """
    # Work on plain float arrays so the arithmetic below is positional,
    # independent of any pandas index carried by y_true
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    # A zero observation would turn the percentage error into inf/NaN and
    # poison the mean, so those samples are left out of MAPE
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = np.nan

    rmse = np.sqrt(mse)
    return mse, r2, mae, mape, rmse

# Function to process each district and target column
def process_district(district):
    """Tune, evaluate and forecast one model per crop target column of a district.

    For every column listed in ``crop_columns`` a ``GradientBoostingRegressor`` is
    tuned with ``RandomizedSearchCV`` on an 80:20 train/test split of the district's
    rows. The features are ``'Year'`` plus the *other* columns of the same group;
    the target itself is never part of the feature matrix.

    Parameters
    ----------
    district
        District name; rows whose ``'Dist Name'`` equals it are modelled.

    Returns
    -------
    dict
        ``{'fine_tuning': [...], 'forecasting': [...]}``. ``'fine_tuning'`` holds one
        record per modelled column (best parameters plus MSE, R², MAE, MAPE and RMSE
        on the held-out rows). ``'forecasting'`` holds one record per modelled column
        and entry of ``future_years``.

    Notes
    -----
    Reads the module-level ``data``, ``crop_columns``, ``param_distributions`` and
    ``future_years``. Columns missing from the dataset, and districts with fewer
    than five rows, are skipped. Future feature values are unknown, so every
    forecast row reuses the district's most recent observed feature values and only
    ``'Year'`` changes (a persistence assumption).
    """
    results = {'fine_tuning': [], 'forecasting': []}
    # Chronological order makes the last row the most recent observation
    district_data = data[data['Dist Name'] == district].sort_values('Year')

    # Iterate over each crop
    for crop, columns in crop_columns.items():
        # Columns of this group that the dataset actually contains
        available = [col for col in columns if col in district_data.columns]

        for target_column in available:
            # Year plus the sibling columns of the group; the target is left out
            # so the model cannot simply read back the value it has to predict
            feature_columns = ['Year'] + [col for col in available if col != target_column]
            X = district_data[feature_columns]
            y = district_data[target_column]

            # Skip columns with insufficient data
            if len(y) < 5:
                continue

            # Split the data into train and test sets (80:20 split)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Randomized search for hyperparameter tuning
            gbr = GradientBoostingRegressor(random_state=42)
            random_search = RandomizedSearchCV(
                estimator=gbr,
                param_distributions=param_distributions,
                n_iter=10,  # Reducing iterations for speed
                cv=2,
                scoring='neg_mean_squared_error',
                n_jobs=-1,
                verbose=0,
                random_state=42  # seed the parameter sampling so reruns pick the same candidates
            )
            random_search.fit(X_train, y_train)

            # Best parameters and model
            best_params = random_search.best_params_
            best_model = random_search.best_estimator_

            # Evaluate the model on test data
            y_pred = best_model.predict(X_test)
            mse, r2, mae, mape, rmse = calculate_metrics(y_test, y_pred)

            # Store fine-tuning results
            results['fine_tuning'].append({
                'District': district,
                'Crop': crop,
                'Target Column': target_column,
                'Best Parameters': best_params,
                'MSE': mse,
                'R²': r2,
                'MAE': mae,
                'MAPE': mape,
                'RMSE': rmse
            })

            # Forecasting future years: repeat the latest observed feature row once
            # per future year and overwrite only 'Year'. This works for any number
            # of historical rows, unlike slicing the first len(future_years) rows.
            future_X = X.iloc[[-1] * len(future_years)].copy()
            future_X['Year'] = future_years
            future_predictions = best_model.predict(future_X)

            # Store forecasted results
            for year, prediction in zip(future_years, future_predictions):
                results['forecasting'].append({
                    'District': district,
                    'Year': year,
                    'Crop': crop,
                    'Target Column': target_column,
                    'Forecasted Value': prediction
                })

    return results

# Queue one job per district and let joblib spread them over all available cores
district_jobs = [delayed(process_district)(name) for name in districts]
all_results = Parallel(n_jobs=-1)(district_jobs)

# Flatten the per-district dictionaries into two flat record lists
fine_tuning_results = []
forecasted_results = []
for district_result in all_results:
    fine_tuning_results.extend(district_result['fine_tuning'])
    forecasted_results.extend(district_result['forecasting'])

# Write the tuning metrics to disk
metrics_csv_path = '/content/fine_tuning_results.csv'
metrics_df = pd.DataFrame(fine_tuning_results)
metrics_df.to_csv(metrics_csv_path, index=False)
print(f"Fine-tuning results saved to {metrics_csv_path}")

# Write the forecasts to disk
forecasted_csv_path = '/content/forecasted_future_values.csv'
forecasted_df = pd.DataFrame(forecasted_results)
forecasted_df.to_csv(forecasted_csv_path, index=False)
print(f"Forecasted values saved to {forecasted_csv_path}")


Fine-tuning results saved to /content/fine_tuning_results.csv
Forecasted values saved to /content/forecasted_future_values.csv


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for all figures
sns.set(style="whitegrid")

# One figure per (district, target column): observed history against the forecast
for district in districts:
    district_data = data[data['Dist Name'] == district]
    forecasted_district_data = forecasted_df[forecasted_df['District'] == district]

    for crop, columns in crop_columns.items():
        for target_column in columns:
            # Nothing to draw when the dataset lacks this column
            if target_column not in district_data.columns:
                continue

            # Observed series, without missing values
            actual_data = district_data[['Year', target_column]].dropna()

            # Forecast rows belonging to this target column
            is_target = forecasted_district_data['Target Column'] == target_column
            forecasted_data = forecasted_district_data[is_target]

            # Draw both series on a single axes
            fig, ax = plt.subplots(figsize=(12, 6))
            ax.plot(
                actual_data['Year'], actual_data[target_column],
                label="Actual Values", marker='o',
            )
            ax.plot(
                forecasted_data['Year'], forecasted_data['Forecasted Value'],
                label="Forecasted Values", linestyle='--', marker='x',
            )

            # Title, axis labels, legend and grid
            ax.set_title(f"{district} - {target_column} (Actual vs Forecasted)", fontsize=14)
            ax.set_xlabel("Year", fontsize=12)
            ax.set_ylabel(target_column, fontsize=12)
            ax.legend(loc="best", fontsize=10)
            ax.grid(True)
            fig.tight_layout()
            plt.show()


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

# Read the five-district dataset and discard the 'TOTAL AREA (1000 ha)' column
file_path = '/content/Major_5_District.csv'
data = pd.read_csv(file_path).drop(columns=['TOTAL AREA (1000 ha)'])  # Adjust as per actual column name

# Forecast horizon: every year from 2018 through 2025, then five-year steps up to 2045
future_years = [*range(2018, 2026), 2030, 2035, 2040, 2045]

# Districts to process
districts = ['Akola', 'Kolhapur', 'Pune', 'Ratnagiri', 'Wardha']

# Every column except the two identifier columns is modelled as a target
target_columns = data.columns.drop(['Year', 'Dist Name'])

# Candidate values for each hyperparameter explored during tuning
param_distributions = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 3],
    subsample=[0.8, 1.0],
)

# Function to process each district and target column
def process_district(district):
    """Tune one model per target column of a district and forecast future years.

    For every column in ``target_columns`` a ``GradientBoostingRegressor`` is tuned
    with ``RandomizedSearchCV`` on all of the district's rows. The features are
    ``'Year'`` plus every remaining column except ``'Dist Name'`` and the target.

    Parameters
    ----------
    district
        District name; rows whose ``'Dist Name'`` equals it are modelled.

    Returns
    -------
    dict
        ``{'fine_tuning': [...], 'forecasting': [...]}``. ``'fine_tuning'`` holds one
        record per target column with the best parameters and the MSE / R² of the
        refitted model. ``'forecasting'`` holds one record per target column and
        entry of ``future_years``.

    Notes
    -----
    Reads the module-level ``data``, ``target_columns``, ``param_distributions`` and
    ``future_years``. MSE and R² are computed on the same rows the model was fitted
    on, so they describe goodness of fit, not out-of-sample accuracy. Future feature
    values are unknown, so every forecast row reuses the district's most recent
    observed feature values and only ``'Year'`` changes (a persistence assumption).
    """
    results = {'fine_tuning': [], 'forecasting': []}
    # Chronological order makes the last row the most recent observation
    district_data = data[data['Dist Name'] == district].sort_values('Year')

    for target_column in target_columns:
        # Prepare features (X) and target (y); 'Year' stays in as the time feature
        X = district_data.drop(columns=['Dist Name', target_column], errors='ignore')
        y = district_data[target_column]

        # Skip columns with insufficient data
        if len(y) < 5:
            continue

        # Randomized search for hyperparameter tuning on all historical rows
        gbr = GradientBoostingRegressor(random_state=42)
        random_search = RandomizedSearchCV(
            estimator=gbr,
            param_distributions=param_distributions,
            n_iter=10,  # Reducing iterations for speed
            cv=2,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=0,
            random_state=42  # seed the parameter sampling so reruns pick the same candidates
        )
        random_search.fit(X, y)

        # Best parameters and model
        best_params = random_search.best_params_
        best_model = random_search.best_estimator_

        # Forecasting future years: repeat the latest observed feature row once per
        # future year and overwrite only 'Year'. This works for any number of
        # historical rows, unlike slicing the first len(future_years) rows.
        future_X = X.iloc[[-1] * len(future_years)].copy()
        future_X['Year'] = future_years
        future_predictions = best_model.predict(future_X)

        # Store fine-tuning results (in-sample fit; predictions computed once)
        fitted_values = best_model.predict(X)
        mse = mean_squared_error(y, fitted_values)
        r2 = r2_score(y, fitted_values)
        results['fine_tuning'].append({
            'District': district,
            'Target Column': target_column,
            'Best Parameters': best_params,
            'MSE': mse,
            'R²': r2
        })

        # Store forecasted results
        for year, prediction in zip(future_years, future_predictions):
            results['forecasting'].append({
                'District': district,
                'Year': year,
                'Target Column': target_column,
                'Forecasted Value': prediction
            })

    return results

# Queue one job per district and let joblib spread them over all available cores
district_jobs = [delayed(process_district)(name) for name in districts]
all_results = Parallel(n_jobs=-1)(district_jobs)

# Flatten the per-district dictionaries into two flat record lists
fine_tuning_results = []
forecasted_results = []
for district_result in all_results:
    fine_tuning_results.extend(district_result['fine_tuning'])
    forecasted_results.extend(district_result['forecasting'])

# Write the tuning metrics to disk
# NOTE: same path as the earlier full-dataset cell, so that cell's file is overwritten
metrics_csv_path = '/content/fine_tuning_results.csv'
metrics_df = pd.DataFrame(fine_tuning_results)
metrics_df.to_csv(metrics_csv_path, index=False)
print(f"Fine-tuning results saved to {metrics_csv_path}")

# Write the forecasts to disk (also the same path as the earlier cell)
forecasted_csv_path = '/content/forecasted_future_values.csv'
forecasted_df = pd.DataFrame(forecasted_results)
forecasted_df.to_csv(forecasted_csv_path, index=False)
print(f"Forecasted values saved to {forecasted_csv_path}")


In [None]:
# Per-district mean of the tuning scores, one record per district in order of first appearance
aggregate_metrics = [
    {
        'District': name,
        'Avg MSE': metrics_df.loc[metrics_df['District'] == name, 'MSE'].mean(),
        'Avg R²': metrics_df.loc[metrics_df['District'] == name, 'R²'].mean(),
    }
    for name in metrics_df['District'].unique()
]

# Per-district, per-year mean of the forecasts across all target columns
forecast_summaries = []
for name in forecasted_df['District'].unique():
    district_rows = forecasted_df.loc[forecasted_df['District'] == name]
    forecast_summaries.extend(
        {
            'District': name,
            'Year': year,
            'Avg Forecasted Value': district_rows.loc[
                district_rows['Year'] == year, 'Forecasted Value'
            ].mean(),
        }
        for year in district_rows['Year'].unique()
    )

# Tabulate both summaries
aggregate_metrics_df = pd.DataFrame(aggregate_metrics)
forecast_summary_df = pd.DataFrame(forecast_summaries)

# Export the aggregated metrics
aggregate_metrics_csv_path = '/content/aggregate_fine_tuning_metrics.csv'
aggregate_metrics_df.to_csv(aggregate_metrics_csv_path, index=False)
print(f"Aggregate fine-tuning metrics saved to {aggregate_metrics_csv_path}")

# Export the forecast summaries
forecast_summary_csv_path = '/content/forecast_summary.csv'
forecast_summary_df.to_csv(forecast_summary_csv_path, index=False)
print(f"Forecast summaries saved to {forecast_summary_csv_path}")

# Show both tables in the cell output
print("\nAggregate Fine-Tuning Metrics:")
print(aggregate_metrics_df)

print("\nForecast Summary:")
print(forecast_summary_df)


In [None]:
# Summarise the tuning scores of each district: print them and collect them as records
aggregate_metrics = []

for district in districts:
    district_results = [result for result in fine_tuning_results if result['District'] == district]

    # Districts without any tuned model are left out of the summary
    if not district_results:
        continue

    count = len(district_results)
    avg_r2 = np.mean([res['R²'] for res in district_results])
    avg_mse = np.mean([res['MSE'] for res in district_results])

    aggregate_metrics.append({
        'District': district,
        'Average R²': avg_r2,
        'Average MSE': avg_mse,
        'Models Tuned': count
    })

    # One line per value, followed by a separator rule
    print(
        f"District: {district}",
        f"  Average R²: {avg_r2:.4f}",
        f"  Average MSE: {avg_mse:.4f}",
        f"  Models Tuned: {count}",
        "-" * 40,
        sep="\n",
    )

# Keep the summary as a DataFrame for further use if needed
aggregate_metrics_df = pd.DataFrame(aggregate_metrics)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for all figures
sns.set(style="whitegrid")

# One figure per (district, target column): observed history against the forecast
for district in districts:
    district_data = data[data['Dist Name'] == district]
    forecasted_district_data = forecasted_df[forecasted_df['District'] == district]

    for target_column in target_columns:
        # Nothing to draw when the dataset lacks this column
        if target_column not in district_data.columns:
            continue

        # Observed series, without missing values
        actual_data = district_data[['Year', target_column]].dropna()

        # Forecast rows belonging to this target column
        is_target = forecasted_district_data['Target Column'] == target_column
        forecasted_data = forecasted_district_data[is_target]

        # Draw both series on a single axes
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(
            actual_data['Year'], actual_data[target_column],
            label="Actual Values", marker='o',
        )
        ax.plot(
            forecasted_data['Year'], forecasted_data['Forecasted Value'],
            label="Forecasted Values", linestyle='--', marker='x',
        )

        # Title, axis labels, legend and grid
        ax.set_title(f"{district} - {target_column} (Actual vs Forecasted)", fontsize=14)
        ax.set_xlabel("Year", fontsize=12)
        ax.set_ylabel(target_column, fontsize=12)
        ax.legend(loc="best", fontsize=10)
        ax.grid(True)
        fig.tight_layout()
        plt.show()
