In [1]:
# Crop Yield Forecasting Using Python (Colab Notebook)
# ----------------------------------------------------

## 1. Setup and Libraries
!pip install pandas numpy scikit-learn matplotlib seaborn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Data Import and Preprocessing
# Input files, resolved relative to the notebook's working directory.
RAINFALL_PATH = 'rainfall.xlsx'
CROP_YIELD_PATH = 'crop_yield.csv'

# Load the datasets. A missing file is re-raised as the same exception type,
# with a message that says which files are expected and how to provide them.
try:
    rainfall_data = pd.read_excel(RAINFALL_PATH)
    crop_yield_data = pd.read_csv(CROP_YIELD_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"{err}. Place '{RAINFALL_PATH}' and '{CROP_YIELD_PATH}' in the working "
        "directory (in Colab: upload them via the Files panel) and re-run this cell."
    ) from err

# Merge datasets on district and year. pd.merge defaults to an inner join, so
# rows without a match in both files are dropped.
data = pd.merge(rainfall_data, crop_yield_data, on=['District', 'Year'])
if data.empty:
    raise ValueError(
        "Merging rainfall and crop-yield data on ['District', 'Year'] produced no rows; "
        "check that both files use the same district names and year values."
    )

# Handle missing values: fill each numeric column with its own mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError if the frame holds any non-numeric column
# (presumably 'District' is text - verify against the input files).
# NOTE(review): this also mean-fills the 'Yield' target, and the means are
# computed over all rows, including those that later land in the test split.
data = data.fillna(data.mean(numeric_only=True))

# Normalize the data: rescale Rainfall and Temperature with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set min/max values influence the scaling.
scaler = MinMaxScaler()
data[['Rainfall', 'Temperature']] = scaler.fit_transform(data[['Rainfall', 'Temperature']])

## 3. Feature Engineering
# Append 'Rainfall_Index', an equally weighted (0.5 / 0.5) blend of the
# Rainfall and Temperature columns, as an example of a derived feature.
data_with_index = data.assign(
    Rainfall_Index=lambda frame: frame['Rainfall'] * 0.5 + frame['Temperature'] * 0.5
)

# One-hot encode any categorical columns; drop_first=True omits the first
# level of each encoded column.
data = pd.get_dummies(data_with_index, drop_first=True)

## 4. Model Development
# One seed shared by the split and the stochastic learners, so the reported
# metrics are the same on every Restart & Run All.
RANDOM_STATE = 42

# Split data into training and testing sets: 'Yield' is the target, every
# other column is a feature, and 20% of the rows are held out for testing.
X = data.drop(['Yield'], axis=1)
y = data['Yield']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train multiple models. The two tree ensembles are randomized, so they get an
# explicit random_state; LinearRegression takes no seed.
models = {
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'Linear Regression': LinearRegression()
}

# results maps model name -> {'MSE': ..., 'R2': ...}, both measured on the
# held-out test split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    }

# Ensemble Model (Voting): combines the three regressors above. Per the
# scikit-learn documentation, VotingRegressor fits clones of the estimators it
# is given, so the already-fitted instances in `models` are not reused.
ensemble = VotingRegressor(estimators=list(models.items()))
ensemble.fit(X_train, y_train)
ensemble_pred = ensemble.predict(X_test)

## 5. Model Evaluation
# Score the ensemble once and reuse the numbers for the report and the chart.
ensemble_mse = mean_squared_error(y_test, ensemble_pred)
ensemble_r2 = r2_score(y_test, ensemble_pred)

# Text report: one line per individual model, followed by the ensemble.
for name, result in results.items():
    print(f"{name} - MSE: {result['MSE']}, R2: {result['R2']}")

print(f"Ensemble - MSE: {ensemble_mse}, R2: {ensemble_r2}")

# Visualization: bar chart of test-set R2, individual models first and the
# ensemble last.
model_labels = [*results, 'Ensemble']
r2_values = [scores['R2'] for scores in results.values()] + [ensemble_r2]

plt.figure(figsize=(10, 6))
sns.barplot(x=model_labels, y=r2_values)
plt.title('Model R2 Scores')
plt.show()

## 6. Save Results
# Write the processed frame to CSV in the working directory; index=False
# leaves the row index out of the file.
OUTPUT_CSV = 'processed_data.csv'
data.to_csv(OUTPUT_CSV, index=False)




FileNotFoundError: [Errno 2] No such file or directory: 'rainfall.xlsx'