In [1]:
!pip install -U datagovindia fuzzywuzzy gradio


Collecting datagovindia
  Using cached datagovindia-1.0.2-py3-none-any.whl.metadata (15 kB)
Collecting gradio
  Downloading gradio-5.49.1-py3-none-any.whl.metadata (16 kB)
Collecting gradio-client==1.13.3 (from gradio)
  Downloading gradio_client-1.13.3-py3-none-any.whl.metadata (7.1 kB)
Collecting websockets<16.0,>=13.0 (from gradio-client==1.13.3->gradio)
  Downloading websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached datagovindia-1.0.2-py3-none-any.whl (15 kB)
Downloading gradio-5.49.1-py3-none-any.whl (63.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gradio_client-1.13.3-py3-none-any.whl (325 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinu

In [19]:
# Imports
import pandas as pd
from fuzzywuzzy import process
import re
import gradio as gr
from urllib.parse import urljoin
from getpass import getpass

import os

# Get API key: prefer the DATA_GOV_API_KEY environment variable so the
# notebook can run unattended (Restart & Run All); otherwise fall back to a
# non-echoing prompt. Surrounding whitespace from copy/paste is stripped.
DATA_GOV_API_KEY = (
    os.environ.get("DATA_GOV_API_KEY") or getpass("Enter your data.gov.in API key: ")
).strip()

# Function to show dataset sample
def show_api_info(api_index):
    """Describe the columns of a resource by looking at its first rows.

    The resource is requested through the module-level ``dg`` client, loaded
    into a DataFrame and trimmed to at most five rows. Any exception raised
    along the way is reported in the returned text instead of propagating.

    NOTE(review): no visible cell creates ``dg``; until one does, this
    function always returns the "Could not retrieve" message.

    Parameters
    ----------
    api_index
        Resource identifier, passed to ``dg.get_resource_data`` as
        ``resource_id``.

    Returns
    -------
    str
        Either a summary listing the sample's column names (or a note that
        the sample is empty), or an error description.
    """
    try:
        print(f"Attempting to fetch a small sample for resource ID: {api_index}")
        # The whole resource is requested; only the first five rows are kept.
        records = dg.get_resource_data(resource_id=api_index)
        preview = pd.DataFrame(records).head(5)
        print(f"Successfully fetched {len(preview)} records as a sample.")
        if preview.empty:
            column_summary = 'No data/columns found in sample.'
        else:
            column_summary = list(preview.columns)
        return f"Fetched a sample of data for resource ID {api_index}. Columns: {column_summary}"
    except Exception as e:
        return f"Could not retrieve API info for resource ID {api_index}. Error: {e}"


Enter your data.gov.in API key: ··········


In [None]:
import pandas as pd

# Crop-production resource, restricted to Maharashtra via the state_name filter.
# The key collected by the getpass prompt is interpolated here so that no
# credential is stored in the notebook source.
# NOTE(review): no limit/offset parameter is sent -- confirm the endpoint
# returns the complete dataset rather than a default-sized page.
url = (
    "https://api.data.gov.in/resource/35be999b-0208-4354-b557-f6ca9a5355de"
    f"?api-key={DATA_GOV_API_KEY}&format=csv&filters%5Bstate_name%5D=Maharashtra"
)

# Load into DataFrame and normalise the headers to lower snake_case
df = pd.read_csv(url).rename(columns=lambda c: c.lower().replace(" ", "_"))
df.head()


Unnamed: 0,state_name,district_name,crop_year,season,crop,area,production
0,Maharashtra,AHMEDNAGAR,1997,Autumn,Maize,1,1113
1,Maharashtra,AHMEDNAGAR,1997,Kharif,Arhar/Tur,17600,6300
2,Maharashtra,AHMEDNAGAR,1997,Kharif,Bajra,274100,152800
3,Maharashtra,AHMEDNAGAR,1997,Kharif,Gram,40800,18600
4,Maharashtra,AHMEDNAGAR,1997,Kharif,Jowar,900,1100


In [None]:
import pandas as pd

# Rainfall data URL. The key collected by the getpass prompt is interpolated
# here so that no credential is stored in the notebook source.
# NOTE(review): no limit/offset parameter is sent -- confirm the endpoint
# returns the complete dataset rather than a default-sized page.
rainfall_url = (
    "https://api.data.gov.in/resource/6c05cd1b-ed59-40c2-bc31-e314f39c6971"
    f"?api-key={DATA_GOV_API_KEY}&format=csv"
)
# Load and normalise the headers to lower snake_case
rainfall_df = pd.read_csv(rainfall_url).rename(columns=lambda c: c.lower().replace(" ", "_"))
rainfall_df.head()


Unnamed: 0,state,district,date,year,month,avg_rainfall,agency_name
0,Assam,Marigaon,2018-11-26,2018,11,0.0,NRSC VIC MODEL
1,Assam,Marigaon,2018-12-06,2018,12,0.0,NRSC VIC MODEL
2,Assam,Marigaon,2018-12-08,2018,12,0.0,NRSC VIC MODEL
3,Assam,Marigaon,2018-12-15,2018,12,0.0,NRSC VIC MODEL
4,Assam,Marigaon,2018-12-16,2018,12,0.0,NRSC VIC MODEL


In [56]:

def answer_query(query):
    """Answer a crop-ranking or rainfall question from the loaded DataFrames.

    Reads the module-level frames ``df`` (columns ``state_name``,
    ``crop_year``, ``crop``, ``production``) and ``rainfall_df`` (columns
    ``state``, ``avg_rainfall``).

    Supported questions (matching is case-insensitive):

    * contains "top" and "crop" -- e.g. "Top 3 crops in Maharashtra in 1997".
      Crops are ranked by summed production for the state and year. N
      defaults to 3; the year defaults to the latest year that has records
      for the matched state.
    * contains "rainfall" -- e.g. "Average rainfall in Assam". Returns the
      mean of ``avg_rainfall`` over every record of the matched state.

    Parameters
    ----------
    query : str
        Free-text question.

    Returns
    -------
    str
        The formatted answer, or a short explanation when the state is not
        found, no records match, or the question is not understood.
    """
    not_understood = "Sorry, I could not understand the query. Try the sample templates."
    if not query or not query.strip():
        return not_understood
    query_lower = query.lower()

    def _find_state(names):
        # Missing values are skipped (they have no .lower()); when several
        # names occur in the query the longest one wins, so a short name that
        # is a substring of a longer one cannot shadow it.
        hits = [
            s for s in names.dropna().unique()
            if str(s).strip() and str(s).lower() in query_lower
        ]
        return max(hits, key=lambda s: len(str(s))) if hits else None

    # Crop queries
    if "top" in query_lower and "crop" in query_lower:
        top_match = re.search(r'top (\d+)', query_lower)
        # "top 0" would produce an empty ranking, so at least one row is shown.
        top_n = max(int(top_match.group(1)), 1) if top_match else 3

        state = _find_state(df['state_name'])
        if state is None:
            return "State not recognized in crop dataset."
        state_rows = df[df['state_name'] == state]

        # Look for the year outside the "top N" span so that a four-digit N
        # (e.g. "top 2000 crops in ... 1997") is not mistaken for the year.
        year_text = query_lower
        if top_match:
            year_text = query_lower[:top_match.start()] + " " + query_lower[top_match.end():]
        year_match = re.search(r'\b(\d{4})\b', year_text)
        if year_match:
            year = int(year_match.group(1))
        else:
            year = state_rows['crop_year'].max()
            if pd.isna(year):
                return f"No crop records found for {state}."

        filtered = state_rows[state_rows['crop_year'] == year]
        if filtered.empty:
            # Without this guard the answer would embed pandas' empty-Series repr.
            return f"No crop records found for {state} in {year}."
        top_crops = (
            filtered.groupby('crop')['production']
            .sum()
            .sort_values(ascending=False)
            .head(top_n)
        )
        return f"Top {top_n} crops in {state} in {year}:\n" + top_crops.to_string()

    # Rainfall queries
    elif "rainfall" in query_lower:
        state = _find_state(rainfall_df['state'])
        if state is None:
            return "State not recognized in rainfall dataset."
        avg_rain = rainfall_df.loc[rainfall_df['state'] == state, 'avg_rainfall'].mean()
        if pd.isna(avg_rain):
            return f"No rainfall measurements found for {state}."
        # NOTE(review): this is the mean of the individual avg_rainfall
        # records, not a per-year total -- confirm that "annual" and "mm"
        # are the right wording for this dataset.
        return f"Average annual rainfall in {state} is {avg_rain:.2f} mm."

    else:
        return not_understood


In [48]:
# Function to search for datasets
def search_datasets(query, results=5):
    """Search for datasets through the module-level ``dg`` client.

    NOTE(review): no visible cell creates ``dg`` -- confirm a client is
    constructed before this is called.

    Parameters
    ----------
    query
        Search text, forwarded unchanged to ``dg.search_api``.
    results : int, optional
        Forwarded as the ``results`` keyword (default 5).

    Returns
    -------
    Whatever ``dg.search_api`` returns.
    """
    matches = dg.search_api(query, results=results)
    return matches

# Function to fetch data from a dataset
def fetch_data(resource_id):
    """Fetch a resource's data through the module-level ``dg`` client.

    NOTE(review): no visible cell creates ``dg`` -- confirm a client is
    constructed before this is called.

    Parameters
    ----------
    resource_id
        Identifier passed positionally to ``dg.get_resource_data``.

    Returns
    -------
    Whatever ``dg.get_resource_data`` returns.
    """
    payload = dg.get_resource_data(resource_id)
    return payload

# Function to clean and process dataset
def clean_data(df):
    """Placeholder cleaning stage: hands back the object it was given.

    No cleaning is performed yet; the argument is returned as-is (the same
    object, not a copy).

    Parameters
    ----------
    df
        Data to clean.

    Returns
    -------
    The ``df`` argument, unchanged.
    """
    # Implement necessary data cleaning steps
    cleaned = df
    return cleaned

# Function to analyze data
def analyze_data(df, analysis_type):
    """Placeholder for the analysis stage; no analysis is implemented yet.

    The previous body returned a name that was never assigned, so every call
    failed with a NameError. The missing implementation is now reported
    explicitly.

    Parameters
    ----------
    df
        Data to analyse (currently unused).
    analysis_type
        Requested kind of analysis (currently only echoed in the error).

    Raises
    ------
    NotImplementedError
        Always, until analysis logic is written.
    """
    # Implement analysis logic based on analysis_type
    raise NotImplementedError(
        f"analyze_data has no implementation for analysis_type={analysis_type!r} yet."
    )

# Function to generate a response
def generate_response(query):
    """Placeholder for the search-then-answer pipeline; not implemented yet.

    The previous body ran a dataset search and then returned a name that was
    never assigned, so every call ended in a NameError. The missing
    implementation is now reported explicitly, without the unused search.

    NOTE(review): a later cell defines another ``generate_response``; once
    that cell runs it replaces this definition.

    Parameters
    ----------
    query
        The user's question (currently unused).

    Raises
    ------
    NotImplementedError
        Always, until the pipeline is written.
    """
    # TODO: search for datasets matching the query, process them and build a response
    raise NotImplementedError("generate_response (dataset-search variant) is not implemented yet.")


In [49]:
def generate_response(query):
    """Answer one of the templated comparison questions.

    Reads the module-level frames loaded earlier in the notebook:
    ``rainfall_df`` (columns ``state``, ``year``, ``avg_rainfall``) and
    ``df`` (columns ``state_name``, ``district_name``, ``crop``,
    ``production``). State and crop names are compared in title case.

    Recognised templates (case-insensitive):

    1. "compare the average annual rainfall in X and Y for the last N ..."
       -- mean of ``avg_rainfall`` per state over the N most recent years
       present in ``rainfall_df``.
    2. "... district in X with the highest production of C ... district ...
       lowest production ... in Y" -- the single record with the largest
       production of C in X and the single record with the smallest in Y.
    3. "analyze the production trend" and 4. "policy advisor" -- fixed
       "not yet implemented" replies.

    NOTE(review): the crop frame is loaded with a Maharashtra-only filter, so
    template 2 reports no records for other states unless that is widened.

    Parameters
    ----------
    query : str
        Free-text question.

    Returns
    -------
    str
        The answer, or the "could not understand" message when the question
        does not fit a template.
    """
    not_understood = "Sorry, I could not understand the query. Try the sample templates."
    query_lower = (query or "").lower()

    # 1. Compare average annual rainfall
    if "compare the average annual rainfall" in query_lower:
        # Lazy groups let multi-word state names ("tamil nadu") match.
        m = re.search(
            r'compare the average annual rainfall in (.+?) and (.+?) for the last (\d+)',
            query_lower,
        )
        if not m:
            return not_understood
        n_years = int(m.group(3))
        if n_years < 1:
            # A slice of [-0:] would silently select every year.
            return "Please ask for the last 1 or more years."

        years_sorted = sorted(rainfall_df['year'].dropna().unique())
        in_window = rainfall_df['year'].isin(years_sorted[-n_years:])
        state_titles = rainfall_df['state'].str.title()

        lines = []
        for state in (m.group(1).strip().title(), m.group(2).strip().title()):
            avg = rainfall_df.loc[(state_titles == state) & in_window, 'avg_rainfall'].mean()
            if pd.isna(avg):
                lines.append(f"No rainfall data found for {state} in the last {n_years} years.")
            else:
                lines.append(f"Average rainfall in {state} for last {n_years} years: {avg:.2f} mm")
        return "\n".join(lines)

    # 2. Highest and lowest production districts
    elif "district in" in query_lower and "highest production of" in query_lower:
        # The crop name ends at the next " in"/" and"/comma. The second state
        # is whatever follows the final " in ", minus trailing punctuation.
        # (A greedy ".*" placed before a bare "(\w+)" would leave only the
        # last character for the group.)
        m = re.search(
            r'district in ([\w ]+?) with the highest production of (.+?)'
            r'(?=\s+(?:in|and)\b|\s*,)'
            r'.*district.*lowest production.*\bin\s+([\w ]+?)\W*$',
            query_lower,
            re.DOTALL,
        )
        if not m:
            return not_understood
        state_high = m.group(1).strip().title()
        crop = m.group(2).strip().title()
        state_low = m.group(3).strip().title()

        state_titles = df['state_name'].str.title()
        is_crop = df['crop'].str.title() == crop

        lines = []
        for label, state, ascending in (("Highest", state_high, False), ("Lowest", state_low, True)):
            pool = df[(state_titles == state) & is_crop].dropna(subset=['production'])
            if pool.empty:
                # Avoids the IndexError that .iloc[0] raises on an empty selection.
                lines.append(f"No production records of {crop} found in {state}.")
                continue
            row = pool.sort_values('production', ascending=ascending).iloc[0]
            lines.append(
                f"{label} production of {crop} in {state}: {row['district_name']} ({row['production']})"
            )
        return "\n".join(lines)

    # 3. Production trend correlation
    elif "analyze the production trend" in query_lower:
        return "Trend analysis not yet implemented. You can expand here using crop_df and rain_df."

    # 4. Policy recommendation
    elif "policy advisor" in query_lower:
        return "Policy recommendation not yet implemented. You can expand here using historical crop and rainfall data."

    else:
        return not_understood


In [59]:
import gradio as gr

# Expose answer_query through a single-textbox web UI: one text input, one
# text output, with a title and an example-bearing description.
interface_settings = {
    "fn": answer_query,
    "inputs": "text",
    "outputs": "text",
    "title": "Project Samarth: Crop & Climate Q&A Prototype",
    "description": "Type a query like 'Top 3 crops in Maharashtra in 1997' or 'Average rainfall in Assam'.",
}
iface = gr.Interface(**interface_settings)

# Start serving the interface.
iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7a6a0325de2e07b1fb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


