In [66]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [67]:

# Read the three input tables: the two TN tables (district-wise soil nutrients
# and seasonal climate) and the labelled crop-recommendation reference set.
soil = pd.read_csv("../data/average_soil_nutrients_tn_districtwise.csv")
climate = pd.read_csv("../data/tn_climate_10years_seasonal.csv")
kaggle = pd.read_csv("../data/Crop_recommendation.csv")

# Echo the raw headers of both TN tables so the column selections made in
# later cells can be checked against what the files actually contain.
print("SOIL COLUMNS:", soil.columns.tolist(), sep="\n")
print("\nCLIMATE COLUMNS:", climate.columns.tolist(), sep="\n")



SOIL COLUMNS:
['S_No', 'District', 'EC_dS_m', 'pH', 'N', 'P', 'K', 'OC_percent', 'S_ppm', 'B_ppm', 'Zn_ppm', 'Cu_ppm', 'Fe_ppm', 'Mn_ppm']

CLIMATE COLUMNS:
['District', 'Year', 'Season', 'Temperature', 'Temperature_max', 'Temperature_min', 'Humidity', 'Rainfall']


In [68]:
# Rename for consistency
# Map each source header onto the column name the rest of the notebook uses.
# Both files already use those names, so every entry below is currently an
# identity mapping; these dicts are the single place to adapt if a source file
# changes its headers.
#
# errors="raise" makes a missing source column fail here with a KeyError.
# With the default (errors="ignore") a misspelled or absent header would be
# skipped silently and only surface later as a KeyError during selection.
soil = soil.rename(
    columns={
        "N": "N",
        "P": "P",
        "K": "K",
        "pH": "pH",
    },
    errors="raise",
)

climate = climate.rename(
    columns={
        "Temperature": "Temperature",
        "Rainfall": "Rainfall",
        "Humidity": "Humidity",
    },
    errors="raise",
)

# Show the headers after renaming for a visual check.
print("SOIL COLUMNS:")
print(soil.columns.tolist())

print("\nCLIMATE COLUMNS:")
print(climate.columns.tolist())


SOIL COLUMNS:
['S_No', 'District', 'EC_dS_m', 'pH', 'N', 'P', 'K', 'OC_percent', 'S_ppm', 'B_ppm', 'Zn_ppm', 'Cu_ppm', 'Fe_ppm', 'Mn_ppm']

CLIMATE COLUMNS:
['District', 'Year', 'Season', 'Temperature', 'Temperature_max', 'Temperature_min', 'Humidity', 'Rainfall']


In [69]:
# Reduce each TN table to the join key ("District") plus the columns that the
# merge and similarity steps read; everything else is dropped.
soil = soil.loc[:, ["District", "N", "P", "K", "pH"]]
climate = climate.loc[
    :, ["District", "Season", "Temperature", "Humidity", "Rainfall"]
]

# Report (rows, columns) of the trimmed tables next to the reference dataset.
for caption, frame in (
    ("Soil shape:", soil),
    ("Climate shape:", climate),
    ("Kaggle shape:", kaggle),
):
    print(caption, frame.shape)

Soil shape: (29, 5)
Climate shape: (522, 5)
Kaggle shape: (2200, 8)


In [70]:

def _majority_crop_labels(similarity_matrix, crop_labels, k):
    """Label each query row with the most common crop among its k best matches.

    Parameters
    ----------
    similarity_matrix : 2-D numpy array
        One row per query sample, one column per reference sample; a larger
        value means "more similar".
    crop_labels : pandas.Series
        Reference labels, positionally aligned with the columns of
        ``similarity_matrix`` (looked up with ``.iloc``).
    k : int
        Number of highest-scoring reference samples that vote. Must be >= 1.

    Returns
    -------
    list
        One label per row of ``similarity_matrix``. When several labels tie
        for most frequent, the first entry of ``Series.mode()`` is used.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A slice of [-0:] would select *every* reference row rather than
        # none, so reject it instead of voting over the whole dataset.
        raise ValueError(f"k must be a positive integer, got {k}")

    return [
        crop_labels.iloc[row_scores.argsort()[-k:]].mode()[0]
        for row_scores in similarity_matrix
    ]


# Merge soil + climate
# The merge is an inner join on "District": a district whose name appears in
# only one of the two tables is dropped. Report such districts so the row loss
# is visible instead of silent.
unmatched_districts = set(soil["District"]) ^ set(climate["District"])
if unmatched_districts:
    print(
        "WARNING: districts present in only one table are dropped by the merge:",
        sorted(unmatched_districts, key=str),
    )

tn = soil.merge(climate, on="District")

features = ["N", "P", "K", "Temperature", "Humidity", "pH", "Rainfall"]

# Scale data
# The scaler is fitted on the reference (Kaggle) rows only and then applied to
# the TN rows, so both sets are expressed relative to the reference mean/std.
# NOTE(review): this assumes both sources report each feature in the same
# units -- confirm against the data documentation.
scaler = StandardScaler()
kaggle_scaled = scaler.fit_transform(kaggle[features])
tn_scaled = scaler.transform(tn[features])

# Similarity matrix: rows are TN samples, columns are Kaggle samples.
similarity = cosine_similarity(tn_scaled, kaggle_scaled)

# Generate labels
# K is the number of most-similar reference rows that vote on each TN row's
# crop (not to be confused with the "K" feature column).
K = 30
labels = _majority_crop_labels(similarity, kaggle["label"], K)

tn["crop"] = labels

tn.to_csv("../data/tn_synthetic_crop_data.csv", index=False)
print("Synthetic TN dataset created")


Synthetic TN dataset created
