In [None]:
# Step 1: Install and import necessary libraries
!pip install sentence-transformers
!pip install chromadb
!pip install transformers
!pip install openai

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import openai
from google.colab import drive



In [None]:
# Pin the legacy OpenAI SDK (0.28): the response-generation code in this
# notebook calls `openai.ChatCompletion.create`, which openai>=1.0 does not provide.
# NOTE(review): if `openai` was already imported in this session, the runtime
# presumably has to be restarted for the downgrade to take effect — confirm.
!pip install openai==0.28



In [None]:
# Make Google Drive visible to this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Read the crop recommendation CSV from Drive into a DataFrame
file_path = '/content/drive/MyDrive/Crop_Recommendation.csv'
crop_data = pd.read_csv(filepath_or_buffer=file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Preview the first five rows to sanity-check the loaded columns
crop_data.head(n=5)

Unnamed: 0,Nitrogen,Phosphorus,Potassium,Temperature,Humidity,pH_Value,Rainfall,Crop
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,Rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,Rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,Rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,Rice


In [None]:
# Turn one row of measurements into a single descriptive sentence
def generate_text(row):
    """Describe one dataset row as a natural-language sentence.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the keys
            'Nitrogen', 'Phosphorus', 'Potassium', 'pH_Value', 'Temperature',
            'Humidity', 'Rainfall' and 'Crop'.

    Returns:
        str: A sentence listing the soil and climate values followed by the
        recommended crop.
    """
    soil_part = (
        f"nitrogen {row['Nitrogen']} mg/kg, "
        f"phosphorus {row['Phosphorus']} mg/kg, "
        f"potassium {row['Potassium']} mg/kg, "
        f"pH {row['pH_Value']}"
    )
    climate_part = (
        f"temperature {row['Temperature']}°C, "
        f"humidity {row['Humidity']}%, "
        f"and rainfall {row['Rainfall']} mm"
    )
    return f"With {soil_part}, {climate_part}, the recommended crop is {row['Crop']}."

In [None]:
# Build the sentence for every row (row-wise apply) and keep it as a new column
crop_data['description'] = crop_data.apply(generate_text, axis='columns')

In [None]:
# Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every description in one batched call instead of one `encode` call per
# row; the model then processes the sentences in batches rather than singly.
description_vectors = model.encode(crop_data['description'].tolist())

# Store each vector as a plain Python list, one list per row, aligned on the
# DataFrame's own index.
crop_data['embeddings'] = pd.Series(description_vectors.tolist(), index=crop_data.index)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Step 3: Setting up ChromaDB
client = chromadb.Client(Settings())
collection_name = "crop_recommendations"

# Fetch the collection if it already exists, otherwise create it. Testing
# `collection_name in client.list_collections()` is unreliable because what
# `list_collections()` returns depends on the chromadb version, so the check
# can miss an existing collection and `create_collection` then fails on re-run.
collection = client.get_or_create_collection(collection_name)

# Add unique IDs to the DataFrame (if not already added)
if 'id' not in crop_data.columns:
    crop_data['id'] = crop_data.index.astype(str)

# Add embeddings and descriptions to ChromaDB only when the collection is
# empty, and report what actually happened so re-running the cell does not
# claim items were added again.
if collection.count() == 0:
    embeddings = list(crop_data['embeddings'])
    metadatas = [{'description': desc} for desc in crop_data['description']]
    ids = list(crop_data['id'])
    collection.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
    print(f"Added {len(crop_data)} items to ChromaDB.")
else:
    print(f"Collection already contains {collection.count()} items; nothing added.")

Added 2200 items to ChromaDB.


In [None]:
# Start with an empty query -> results cache
cache = dict()

In [None]:
# Cached semantic search over the crop collection
def search_with_cache(query):
    """Return the top-5 collection matches for `query`, reusing cached results.

    Args:
        query: Query text; also used as the key into the module-level `cache`.

    Returns:
        The object returned by `collection.query` for this query. On a repeat
        call with the same query the stored object is returned as-is.
    """
    try:
        # Cache hit: skip both the embedding and the vector-store lookup.
        return cache[query]
    except KeyError:
        pass

    vector = model.encode(query).tolist()
    hits = collection.query(query_embeddings=[vector], n_results=5)
    cache[query] = hits
    return hits


In [None]:
# Example queries used to exercise the cached retrieval step
queries = [
    "Best crops for high nitrogen and moderate temperature.",
    "Recommendations for high-yield crops with high rainfall.",
    "Suitable crops for low pH and high potassium.",
]

# For each query, print it followed by the description of every retrieved match
for query in queries:
    results = search_with_cache(query)
    print(f"Query: {query}")
    # 'metadatas' holds one list per submitted query; only one was submitted.
    top_matches = results['metadatas'][0]
    for metadata in top_matches:
        print(metadata['description'])


Query: Best crops for high nitrogen and moderate temperature.
With nitrogen 68 mg/kg, phosphorus 41 mg/kg, potassium 16 mg/kg, pH 6.158830619, temperature 21.77689322°C, humidity 57.80840636%, and rainfall 102.0861694 mm, the recommended crop is Maize.
With nitrogen 83 mg/kg, phosphorus 45 mg/kg, potassium 21 mg/kg, pH 5.716222912, temperature 18.83344471°C, humidity 58.75082029%, and rainfall 79.7532896 mm, the recommended crop is Maize.
With nitrogen 88 mg/kg, phosphorus 38 mg/kg, potassium 15 mg/kg, pH 6.455116637, temperature 25.08239719°C, humidity 65.92195844%, and rainfall 62.49190812 mm, the recommended crop is Maize.
With nitrogen 78 mg/kg, phosphorus 48 mg/kg, potassium 22 mg/kg, pH 5.588650585, temperature 23.08974909°C, humidity 63.10459626%, and rainfall 70.43473609 mm, the recommended crop is Maize.
With nitrogen 79 mg/kg, phosphorus 59 mg/kg, potassium 17 mg/kg, pH 6.644205485, temperature 20.37999665°C, humidity 63.73849998%, and rainfall 108.5054416 mm, the recommended

In [None]:
# Step 4: Define the search function
def search(query, n_results=5):
    """Run an uncached semantic search against the crop collection.

    Args:
        query: Query text to embed with the module-level `model`.
        n_results: Number of matches to request from the collection
            (defaults to 5, the value previously hard-coded).

    Returns:
        The object returned by `collection.query`.
    """
    # Encode as a one-element batch and convert to plain nested lists, so the
    # collection receives the same kind of payload that `search_with_cache`
    # sends rather than a NumPy array.
    query_embedding = model.encode([query]).tolist()
    return collection.query(query_embeddings=query_embedding, n_results=n_results)



In [None]:
# Step 5: Load the cross-encoder used for re-ranking retrieved descriptions
reranker_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# The model and its tokenizer are loaded from the same checkpoint name
reranker = AutoModelForSequenceClassification.from_pretrained(reranker_model_name)
tokenizer = AutoTokenizer.from_pretrained(reranker_model_name)

In [None]:
def re_rank(query, documents):
    """Order `documents` by the re-ranker's relevance score for `query`.

    Args:
        query: Query text paired with every document.
        documents: Iterable of document strings to score.

    Returns:
        list: The documents sorted from highest to lowest score. An empty
        input yields an empty list without calling the tokenizer or model.
    """
    documents = list(documents)

    # Nothing to score: return early instead of sending an empty batch to the
    # tokenizer.
    if not documents:
        return []

    # Combine query and documents into pairs
    pairs = [(query, doc) for doc in documents]

    # Tokenize the pairs
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt')

    # Score without tracking gradients (inference only)
    with torch.no_grad():
        outputs = reranker(**inputs)

    # Drop only the trailing axis so a single document still produces a
    # one-element list rather than a bare float.
    # NOTE(review): assumes logits has one score per pair, i.e. shape
    # (n_documents, 1) — confirm for the loaded checkpoint.
    scores = outputs.logits.squeeze(-1).tolist()

    # Sort documents based on scores in descending order
    ranked_indices = np.argsort(scores)[::-1]
    return [documents[i] for i in ranked_indices]


In [None]:
# Location on Drive of the text file holding the OpenAI API key
file_path = '/content/drive/MyDrive/openai_new_key.txt'

# Load the key and drop surrounding whitespace (e.g. a trailing newline)
with open(file_path, mode='r') as key_file:
    raw_key = key_file.read()
api_key = raw_key.strip()

# Hand the key to the OpenAI client library
openai.api_key = api_key


def generate_response(query, ranked_results, chat_model="gpt-4", max_tokens=150):
    """Ask the chat model to answer `query` using the ranked descriptions.

    Args:
        query: The user's question; embedded in the prompt's first line.
        ranked_results: Iterable of description strings, appended to the prompt
            in the given order, each followed by a blank line.
        chat_model: Name of the chat model to call (defaults to "gpt-4", the
            value previously hard-coded).
        max_tokens: Upper bound on generated tokens (defaults to 150, the
            value previously hard-coded).

    Returns:
        The message content of the first choice in the API response.
    """
    # Build the prompt in one pass instead of repeated string concatenation.
    prompt = f"Based on the following data, {query}:\n\n" + "".join(
        result + "\n\n" for result in ranked_results
    )

    # Legacy (openai 0.28-style) chat completion call.
    response = openai.ChatCompletion.create(
        model=chat_model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )

    return response['choices'][0]['message']['content']

# Full pipeline per query: retrieve, re-rank, generate, then print the answer
for query in queries:
    results = search_with_cache(query)
    # Only one query was submitted, so its matches are the first entry.
    candidate_descriptions = [metadata['description'] for metadata in results['metadatas'][0]]
    ranked_results = re_rank(query, candidate_descriptions)
    response = generate_response(query, ranked_results)
    print(f"Query: {query}")
    print(f"Generated Response: {response}")


Query: Best crops for high nitrogen and moderate temperature.
Generated Response: Based on the data provided, it appears that under conditions of high nitrogen and moderate temperature, Maize is the best crop to grow. This conclusion is derived from the fact that in every data set, even though the amounts of nitrogen, phosphorus, potassium, pH, temperature, humidity, and rainfall vary, maize is always the recommended crop.
Query: Recommendations for high-yield crops with high rainfall.
Generated Response: Based on the provided data, it's evident that high-yield watermelon crops thrive well in conditions with high rainfall. The optimal conditions for this crop, within the range of the data provided, include:

1. Nitrogen Levels: A range of 82 to 113 mg/kg.
2. Phosphorus Levels: A range of 16 to 30 mg/kg.
3. Potassium Levels: A range of 45 to 54 mg/kg.
4. pH Levels: Slightly acidic with a range of approximately 6.2 to 6.7.
5. Temperature: Approximately 24 - 27°C.
6. Humidity: High humidi