In [1]:
!pip install -q google-generativeai

In [3]:
import google.generativeai as genai
import pandas as pd
from tqdm import tqdm

In [2]:
## Prompt construction

# You are
# Overview
# PM command
# Data is
# Instruction
# Output format
# {
#      'relate':[],
#      'non-relate':[]
# }

In [4]:
overview = """
The project involves classifying forest types using Sentinel-2 data through data science techniques.
The task is to classify forests into three types: DDF (Deciduous Dipterocarp Forest), MDF (Mixed Deciduous Forest), and DEF (Dry Evergreen Forest). The data provided is in tabular format as a .csv file, containing band1 - band12 from Sentinel-2 imagery.
Here is an example of the data:
data = {
    'id': [2002, 3212, 13312],
    'b1': [293, 197, 929],
    'b11': [1927, 1598, 1975],
    'b12': [1038, 697, 1031],
    'b2': [278, 201, 982],
    'b3': [475, 347, 1020],
    'b4': [453, 228, 856],
    'b5': [987, 682, 1220],
    'b6': [1773, 1982, 2051],
    'b7': [2184, 2449, 2421],
    'b8': [1900, 2254, 2392],
    'b8_a': [2343, 2685, 2671],
    'b9': [3039, 2690, 2683],
    'nforest_type': ['MDF', 'DDF', 'MDF']
}
The current issue is There are various features such as NDVI, EVI, and others. You need to instruct your team to verify whether these nearly 300 indices should be included in the feature engineering process."""

Pmcommand = """
**Goals:**
- Classify forest types (DDF, MDF, DEF) using Sentinel-2 data and data science techniques.
- Identify relevant features (including external indices) for forest type classification.
**Pain Points:**
- Limited understanding of the relevance of various spectral indices for forest type classification.
- Large number of potential features to consider, creating potential for overfitting and computational complexity.
**Warnings:**
- Overfitting due to inclusion of redundant or irrelevant features.
- Increased computational time and resource consumption due to excessive feature engineering.
"""

Instruction = """
- Classify column feature using your domain expert, data analytics skill and provided description to these category (relate, non_relate)
- class description
    - relate: The column direct or indirect relate to the task description
    - non_relate: The column not relate to the task at all
- The output classes can be imbalanced.
- The output should not have these (backticks, explanation, opinion, quotes)
Answer format {'non_relate': [..., ...], 'relate': [..., ...]}
"""

In [5]:
def Agentprompt(role,overview,Pmcommand,Instruction,data):
    gemini_prompt = """<role></role>

Overview:<overview></overview>

Please consider this below text as a job task description<PMcommand></PMcommand>
Instruction:<Instruction></Instruction>

Here is an example:
index_feature = [
    ['eir_270L', 'Internet speed rate','Interest rate'],
    ['c_sex', 'sex of human','Sex of customer'],
    ['NDVI', 'Normalized difference vegetation index', 'The normalized difference vegetation index (NDVI) is a widely-used metric for quantifying the health and density of vegetation using sensor data.'],
]

# Output format
{
    'relate': ['NDVI'],
    'norelate': ['eir_270L','c_sex']
}

index_feature =
<data></data>

# Output format

"""
    gemini_prompt = gemini_prompt.replace("<role></role>",role)
    gemini_prompt = gemini_prompt.replace("<overview></overview>",overview)
    gemini_prompt = gemini_prompt.replace("<PMcommand></PMcommand>",Pmcommand)
    gemini_prompt = gemini_prompt.replace("<Instruction></Instruction>",Instruction)
    gemini_prompt = gemini_prompt.replace("<data></data>",str(data))
    return gemini_prompt

In [6]:
genai.configure(api_key="AIzaSyAjRu475rBGujKK1gPPbx6BAbp8mg9rM2I")
model = genai.GenerativeModel('gemini-1.0-pro-latest')
config_file = genai.GenerationConfig(temperature=0,top_p=1,top_k=1)

def Bang(question,**config_file):
        response = model.generate_content(
            question,
            **config_file
        )
        return response.text
print(Bang('Hi'))

Hi there! How can I help you today?


<hr>

In [7]:
import json
def parse_json(result):
    result = result.replace("\'", "\"")
    if "`" not in result:
        return json.loads(result)

    result = result.replace("```json", "```")
    match_result = re.search("`([^`]+)`", result)
    if match_result == None:
        return {}
    return json.loads(match_result.group(1))

In [8]:
df = pd.read_csv("/content/index_with_description.csv")
df = df.drop(columns=['Nr.','Unnamed: 0','Specific Formula','Calculated', 'Info.'])
# df = df[0:7]
# result_list = df[['Abbrev.','Name','description']].values.tolist()
result_list = df[['Abbrev.','Name']].values.tolist()
# result_list

In [9]:
len(result_list)

249

In [16]:
for i in range(0,249,24):
    print(i)

0
24
48
72
96
120
144
168
192
216
240


In [10]:
good_feature = []

In [None]:
0,24,6   x
25,48,6  x
49,72,6  x
73,96,6  x
97,120,6 x
121,144,6 x
145,168,6 x
169,192,6 x
193,216,6 x
217,240,6 x
241,249,6

In [103]:
def agent(role,overview,Pmcommand,Instruction,result_list):
    rnonr = []
    for i in tqdm(range(241,249,6)):
        batch = result_list[i:i+5]
        gemini_prompt = Agentprompt(role,overview,Pmcommand,Instruction,batch)
        # print(gemini_prompt)
        ans = Bang(gemini_prompt)
        # print(ans)
        rnonr.append(ans)
    return rnonr

# Agent #1

In [104]:
Antony = agent(
        'You are a geographer with over 10 years of experience in the field of Geo science.',
        overview,
        Pmcommand,
        Instruction,
        result_list = result_list)
# sleep(5)
Broly = agent(
        'You are a geoscientist who holds three doctoral degrees in geography, forestry, and engineering. You have 10 years of experience working with satellite imagery and forestry',
        overview,
        Pmcommand,
        Instruction,
        result_list = result_list)
# sleep(5)
Cayan = agent(
        'You are a research scientist with expertise in ecological modeling and remote sensing. With a Ph.D. in Environmental Science and 14 years of experience, you have focused on developing advanced algorithms for forest type classification using Sentinel-2 data.',
        overview,
        Pmcommand,
        Instruction,
        result_list = result_list)
# sleep(5)
# Etod = fn(
#         'You are a research scientist with expertise in ecological modeling and remote sensing. With a Ph.D. in Environmental Science and 14 years of experience, you have focused on developing advanced algorithms for forest type classification using Sentinel-2 data.',
#         overview,
#         Pmcommand,
#         Instruction,
#         result_list = result_list)

100%|██████████| 2/2 [00:07<00:00,  3.97s/it]
100%|██████████| 2/2 [00:05<00:00,  2.70s/it]
100%|██████████| 2/2 [00:05<00:00,  2.54s/it]


In [105]:
for i in Antony:
    x = parse_json(i)
    good_feature = good_feature + x['relate']

In [66]:
Antony

In [106]:
for j in Broly:
    y = parse_json(j)
    good_feature = good_feature + y['relate']

In [55]:
Broly

["{\n    'relate': ['GRNDVI', 'IVI', 'IPVI'],\n    'non_relate': ['H', 'I']\n}",
 "{\n    'relate': ['IR700', 'LCI', 'LWCI', 'LogR'],\n    'non_relate': ['nan', 'Laterite']\n}",
 "{\n    'relate': [],\n    'norelate': ['MCARI/MTVI2', 'MCARI/OSAVI', 'mCRIG', 'mCRIRE', 'MVI']\n}",
 "{\n    'relate': [],\n    'non_relate': ['MNSI','MSBI', 'MYVI', 'mND680', 'mARI']\n}"]

In [107]:
for k in Cayan:
    z = parse_json(k)
    good_feature = good_feature + z['relate']

In [57]:
Cayan

["{\n    'relate': ['GRNDVI','IVI','IPVI'],\n    'non_relate': ['H','I']\n}",
 "{\n    'relate': ['IR700', 'LCI', 'LWCI', 'LogR'],\n    'non_relate': ['nan']\n}",
 "{\n    'relate': ['MCARI/MTVI2', 'MCARI/OSAVI', 'mCRIG', 'mCRIRE', 'MVI'],\n    'non_relate': []\n}",
 "{\n    'relate': ['MNSI', 'MSBI', 'MYVI', 'mND680', 'mARI'],\n    'non_relate': []\n}"]

In [108]:
len(good_feature)

501

In [40]:
from collections import Counter


In [109]:
# Count the occurrences of each unique item
counted_items = Counter(good_feature)

# Display the result
# print(counted_items)
counted_items_dict = dict(counted_items)
print(counted_items_dict)

{'ATSAVI': 3, 'AFRI1600': 3, 'AFRI2100': 3, 'ARI': 3, 'NDVI': 9, 'ARVI': 3, 'ARVI2': 3, 'BWDRVI': 3, 'BRI': 3, 'CCCI': 3, 'CARI2': 3, 'Chlgreen': 3, 'CIgreen': 3, 'CIrededge': 3, 'Chlred-edge': 3, 'CI': 3, 'CTVI': 3, 'CRI550': 2, 'CRI700': 2, 'Datt1': 1, 'GDVI': 2, 'EVI': 4, 'EVI2': 6, 'EPI': 3, 'GEMI': 3, 'GVMI': 3, 'GARI': 3, 'GLI': 3, 'GNDVI': 6, 'GOSAVI': 3, 'GSAVI': 3, 'D678/500': 1, 'D800/550': 1, 'D800/680': 1, 'D833/658': 1, 'GRNDVI': 3, 'IVI': 3, 'IPVI': 3, 'LCI': 3, 'LWCI': 3, 'MCARI/MTVI2': 2, 'MCARI/OSAVI': 2, 'mCRIG': 2, 'mCRIRE': 2, 'MVI': 2, 'IR700': 2, 'LogR': 2, 'MNSI': 1, 'MSBI': 1, 'MYVI': 1, 'mND680': 1, 'mARI': 1, 'MCARI1': 4, 'MCARI2': 4, 'mNDVI': 4, 'mSR': 4, 'MSR670': 4, 'MSAVI': 4, 'MSAVIhyper': 4, 'MTVI1': 4, 'MTVI2': 4, 'NLI': 4, 'Norm NIR': 4, 'Norm R': 4, 'PPR': 4, 'PVR': 4, 'ND774/677': 4, 'ND782/666': 3, 'ND790/670': 3, 'ND800/2170': 3, 'PSNDc2': 3, 'PSNDc1': 3, 'PSNDb1': 3, 'PSNDa1': 3, 'ND800/680': 3, 'NDII': 3, 'NDII2': 3, 'ND827/668': 3, 'ND833/1649':

In [None]:
{'ATSAVI': 3,
'AFRI1600': 3,
'AFRI2100': 3,
'ARI': 3,
'NDVI': 9,
'ARVI': 3,
'ARVI2': 3,
'BWDRVI': 3,
'BRI': 3,
'CCCI': 3,
'CARI2': 3,
'Chlgreen': 3,
'CIgreen': 3,
'CIrededge': 3,
'Chlred-edge': 3,
'CI': 3,
'CTVI': 3,
'CRI550': 2,
'CRI700': 2,
'Datt1': 1,
'GDVI': 2,
'EVI': 4,
'EVI2': 6,
'EPI': 3,
'GEMI': 3,
'GVMI': 3,
'GARI': 3,
'GLI': 3,
'GNDVI': 6,
'GOSAVI': 3,
'GSAVI': 3,
'D678/500': 1,
'D800/550': 1,
'D800/680': 1,
'D833/658': 1,
'GRNDVI': 3,
'IVI': 3,
'IPVI': 3,
'LCI': 3,
'LWCI': 3,
'MCARI/MTVI2': 2,
'MCARI/OSAVI': 2,
'mCRIG': 2,
'mCRIRE': 2,
'MVI': 2,
'IR700': 2,
'LogR': 2,
'MNSI': 1,
'MSBI': 1,
'MYVI': 1,
'mND680': 1,
'mARI': 1,
'MCARI1': 4,
'MCARI2': 4,
'mNDVI': 4,
'mSR': 4,
'MSR670': 4,
'MSAVI': 4,
'MSAVIhyper': 4,
'MTVI1': 4,
'MTVI2': 4,
'NLI': 4,
'Norm NIR': 4,
'Norm R': 4,
'PPR': 4,
'PVR': 4,
'ND774/677': 4,
'ND782/666': 3,
'ND790/670': 3,
'ND800/2170': 3,
'PSNDc2': 3,
'PSNDc1': 3,
'PSNDb1': 3,
'PSNDa1': 3,
'ND800/680': 3,
'NDII': 3,
'NDII2': 3,
'ND827/668': 3,
'ND833/1649': 3,
'ND833/658': 3,
'SIWSI': 3,
'ND895/675': 3,
'BNDVI': 3,
'MNDVI': 3,
'NBR': 3,
'RI': 3,
'NDSI': 3,
'NDVI690-710': 3,
'NDVIc': 3,
'PNDVI': 3,
'PVI': 3,
'RARSa1': 3,
'RARSa2': 3,
'RARSa3': 3,
'RARSc3': 3,
'RARSc4': 3,
'RDVI': 5,
'RDVI2': 3,
'Rededge1': 3,
'RBNDVI': 3,
'REIP1': 3,
'REIP2': 3,
'REIP3': 3,
'REP': 3,
'Rre': 2,
'SAVImir': 2,
'IF': 2,
'MSI2': 2,
'TM5/TM7': 3,
'SR440/740': 3,
'BGI': 3,
'SR520/670': 3,
'SR550/670': 3,
'SR550/800': 3,
'GI': 3,
'SR560/658': 3,
'SR672/550': 3,
'SR672/708': 3,
'SR675/555': 3,
'SR675/700': 3,
'SR675/705': 3,
'SR700': 3,
'SR700/670': 3,
'SR735/710': 3,
'SR774/677': 3,
'SR800/2170': 3,
'PSSRc2': 3,
'PSSRc1': 3,
'PSSRb1': 3,
'RVI': 3,
'PSSRa1': 3,
'SR800/680': 3,
'PBI': 2,
'SR833/1649': 2,
'SR833/658': 2,
'Datt2': 2,
'SR860/550': 2,
'RDI': 3,
'SRMIR/Red': 3,
'SRNir/700-715': 2,
'GRVI': 3,
'SRNIR/MIR': 3,
'RRI1': 3,
'SRRed/NIR': 3,
'SRSWIRI/NIR': 2,
'SR801/550': 2,
'SRNIR/700-715': 1,
'SB885': 3,
'SAVI': 3,
'SARVI': 3,
'SARVI2': 3,
'SAVI3': 3,
'SAVI2': 3,
'SLAVI': 3,
'SQRT(IR/R)': 2,
'SIPI1': 2,
'SBI': 3,
'GVIMSS': 3,
'NSIMSS': 3,
'SBIMSS': 3,
'GVI': 3,
'YVIMSS': 3,
'TCARI/OSAVI': 3,
'TCARI': 3,
'TNDVI': 3,
'TSAVI': 3,
'TVI': 3,
'TCI': 3,
'VARI700': 3,
'WDVI': 3,
'WDRVI': 3,
'VI700': 2,
'VARIgreen': 2}

In [111]:
selected_index = [i for i,j in counted_items_dict.items() if j>=2]
selected_index

['ATSAVI',
 'AFRI1600',
 'AFRI2100',
 'ARI',
 'NDVI',
 'ARVI',
 'ARVI2',
 'BWDRVI',
 'BRI',
 'CCCI',
 'CARI2',
 'Chlgreen',
 'CIgreen',
 'CIrededge',
 'Chlred-edge',
 'CI',
 'CTVI',
 'CRI550',
 'CRI700',
 'GDVI',
 'EVI',
 'EVI2',
 'EPI',
 'GEMI',
 'GVMI',
 'GARI',
 'GLI',
 'GNDVI',
 'GOSAVI',
 'GSAVI',
 'GRNDVI',
 'IVI',
 'IPVI',
 'LCI',
 'LWCI',
 'MCARI/MTVI2',
 'MCARI/OSAVI',
 'mCRIG',
 'mCRIRE',
 'MVI',
 'IR700',
 'LogR',
 'MCARI1',
 'MCARI2',
 'mNDVI',
 'mSR',
 'MSR670',
 'MSAVI',
 'MSAVIhyper',
 'MTVI1',
 'MTVI2',
 'NLI',
 'Norm NIR',
 'Norm R',
 'PPR',
 'PVR',
 'ND774/677',
 'ND782/666',
 'ND790/670',
 'ND800/2170',
 'PSNDc2',
 'PSNDc1',
 'PSNDb1',
 'PSNDa1',
 'ND800/680',
 'NDII',
 'NDII2',
 'ND827/668',
 'ND833/1649',
 'ND833/658',
 'SIWSI',
 'ND895/675',
 'BNDVI',
 'MNDVI',
 'NBR',
 'RI',
 'NDSI',
 'NDVI690-710',
 'NDVIc',
 'PNDVI',
 'PVI',
 'RARSa1',
 'RARSa2',
 'RARSa3',
 'RARSc3',
 'RARSc4',
 'RDVI',
 'RDVI2',
 'Rededge1',
 'RBNDVI',
 'REIP1',
 'REIP2',
 'REIP3',
 'REP',
 '

In [112]:
len(selected_index)

162