Cell 1: Load Libraries

In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Cell 2: Load Datasets and Inspect Structure

In [56]:
crop_df = pd.read_csv("datasets/crop_production.csv")
temp_df = pd.read_csv("datasets/GlobalLandTemperaturesByCountry.csv")
agri_df = pd.read_csv("datasets/agriculture_dataset.csv")

print("üì¶ crop_df columns:", crop_df.columns.tolist())
print("üå°Ô∏è temp_df columns:", temp_df.columns.tolist())
print("üöú agri_df columns:", agri_df.columns.tolist())

üì¶ crop_df columns: ['index', 'LOCATION', 'INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'TIME', 'Value', 'Flag Codes']
üå°Ô∏è temp_df columns: ['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'Country']
üöú agri_df columns: ['Farm_ID', 'Crop_Type', 'Farm_Area(acres)', 'Irrigation_Type', 'Fertilizer_Used(tons)', 'Pesticide_Used(kg)', 'Yield(tons)', 'Soil_Type', 'Season', 'Water_Usage(cubic meters)']


In [None]:
Cell 3: Clean and Standardize Data

In [57]:
crop_df = crop_df.rename(columns={
    'LOCATION': 'Country', 'SUBJECT': 'Crop',
    'TIME': 'Year', 'Value': 'MetricValue'
})
crop_df['Year'] = pd.to_numeric(crop_df['Year'], errors='coerce')
crop_clean = crop_df[['Country', 'Year', 'Crop', 'MetricValue']].dropna()

temp_df['Year'] = pd.to_datetime(temp_df['dt'], errors='coerce').dt.year
temp_clean = temp_df[['Country', 'Year', 'AverageTemperature']].dropna()

print("üåø crop_clean sample:\n", crop_clean.head())
print("üå°Ô∏è temp_clean sample:\n", temp_clean.head())

üåø crop_clean sample:
   Country  Year  Crop  MetricValue
0     AUS  1990  RICE     8.314607
1     AUS  1991  RICE     8.394737
2     AUS  1992  RICE     8.094340
3     AUS  1993  RICE     8.336000
4     AUS  1994  RICE     8.537815
üå°Ô∏è temp_clean sample:
   Country  Year  AverageTemperature
0   √Öland  1743               4.384
5   √Öland  1744               1.530
6   √Öland  1744               6.702
7   √Öland  1744              11.609
8   √Öland  1744              15.342


Cell 4: Align Country Names and Merge

In [58]:
iso_map = {
    'AUS': 'Australia', 'CAN': 'Canada', 'USA': 'United States',
    'ARG': 'Argentina', 'MEX': 'Mexico', 'JPN': 'Japan',
    'KOR': 'South Korea', 'NZL': 'New Zealand', 'TUR': 'Turkey', 'DZA': 'Algeria'
}
crop_clean['Country'] = crop_clean['Country'].map(iso_map)

temp_filtered = temp_clean[
    temp_clean['Country'].isin(crop_clean['Country'].dropna()) &
    (temp_clean['Year'] >= 1980)
]

merged_df = pd.merge(crop_clean, temp_filtered, on=['Country', 'Year'], how='inner')

print("üîó Merged rows:", merged_df.shape[0])
print("üîó Sample merged data:\n", merged_df.head())

üîó Merged rows: 34200
üîó Sample merged data:
      Country  Year  Crop  MetricValue  AverageTemperature
0  Australia  1990  RICE     8.314607              28.750
1  Australia  1990  RICE     8.314607              27.017
2  Australia  1990  RICE     8.314607              26.935
3  Australia  1990  RICE     8.314607              22.275
4  Australia  1990  RICE     8.314607              19.250


In [None]:
Cell 5: Build AI Recommendation Logic

In [59]:
def ai_crop_recommendation(country, crop, temp, metric, size):
    crop = crop.lower()
    if temp > 28 and metric < 5:
        return f"In {country}, the climate is hot and yield is low‚Äîconsider switching to sorghum or millet."
    elif metric > 7 and crop in ['maize', 'rice']:
        return f"{crop.title()} is viable in {country} on your {size} acre farm."
    else:
        return f"In {country}, consider mixed cropping or improving soil on your {size} acre farm."

In [None]:
Cell 6: Interactive User Input

In [60]:
user_country = input("üåç Enter your country: ")
user_crop = input("üåæ Enter your crop (e.g., Maize): ")
user_size = float(input("üìê Enter your farm size in acres: "))

subset = merged_df[
    (merged_df['Country'].str.lower() == user_country.lower()) &
    (merged_df['Crop'].str.lower() == user_crop.lower())
]

if subset.empty:
    print("\n‚ö†Ô∏è No matching data found for your crop and location.")
else:
    avg_temp = subset['AverageTemperature'].mean()
    avg_metric = subset['MetricValue'].mean()
    recommendation = ai_crop_recommendation(user_country, user_crop, avg_temp, avg_metric, user_size)
    print("\nüß† SmartGuide Recommendation:\n", recommendation)


üåç Enter your country:  South Africa
üåæ Enter your crop (e.g., Maize):  Maize
üìê Enter your farm size in acres:  20



‚ö†Ô∏è No matching data found for your crop and location.


In [None]:
Cell 7: Train Yield Prediction Model

In [61]:
features = ['Farm_Area(acres)', 'Fertilizer_Used(tons)', 'Water_Usage(cubic meters)',
            'Pesticide_Used(kg)', 'Soil_Type', 'Season', 'Irrigation_Type']
target = 'Yield(tons)'

agri_clean = agri_df.dropna(subset=features + [target])
X = agri_clean[features]
y = agri_clean[target]

categorical = ['Soil_Type', 'Season', 'Irrigation_Type']
numerical = list(set(features) - set(categorical))

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical),
        ('cat', OneHotEncoder(drop='first'), categorical)
    ]
)

model_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('regression', LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_pipeline.fit(X_train, y_train)
score = model_pipeline.score(X_test, y_test)

print(f"üìà Yield prediction model trained.\nR¬≤ score: {score:.2f}")

üìà Yield prediction model trained.
R¬≤ score: 0.12


In [None]:
Final cell: Proj