<a href="https://colab.research.google.com/github/kkidia/Crop-Type-classfication_Senengal_DL/blob/main/Label_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reading corner
SEN4STAT/ESA: https://www.esa-sen4stat.org/user-stories/senegal-prototype/

EOSTAT/FAO: https://data.apps.fao.org/catalog/dataset/5c377b2b-3c2e-4b70-afd7-0c80900b68bb/resource/50bc9ff5-95d2-40cd-af12-6aee2cfcc4ae

RNN: https://www.sciencedirect.com/science/article/pii/S0034425721003230#bb0310 and its GitHub repository: https://github.com/0zgur0/multi-stage-convSTAR-network/tree/master

https://www.sciencedirect.com/science/article/pii/S0034425724001214

https://www.sciencedirect.com/science/article/abs/pii/S2352938522001203

Crop type mapping in Ghana and South Sudan (GitHub code): https://github.com/roserustowicz/crop-type-mapping

https://github.com/tayden/geotiff-crop-dataset




In [None]:
!pip install rasterio rasterstats fiona geopandas geemap -q

In [None]:
# Authentication
import ee
import rasterio
import matplotlib.pyplot as plt
# @title Authenticate to the Earth Engine servers
ee.Authenticate()
# Initialize the Earth Engine object with Google Cloud project ID
project_id = 'ee-kkidia3' # change here to the Cloud project ID you want to initialize with
ee.Initialize(project=project_id)

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import geemap
import geopandas as gpd
import numpy as np

In [None]:
#@title GEE Assets


def load_cleandata(asset_id):
    """Return the Earth Engine FeatureCollection stored at ``asset_id``.

    Args:
        asset_id: Earth Engine asset path passed to ``ee.FeatureCollection``.

    Returns:
        The ``ee.FeatureCollection`` handle for that asset.
    """
    feature_collection = ee.FeatureCollection(asset_id)
    return feature_collection

# Load the datasets: one FeatureCollection per survey year (2018, 2019, 2020, 2023).

# Alternative asset location under the ee-kkidia3 project, kept for reference (currently disabled):
# clean2018 = load_cleandata('projects/ee-kkidia3/assets/clean_data_2018-2023_no-bands/clean_raw_data_2018')
# clean2019 = load_cleandata('projects/ee-kkidia3/assets/clean_data_2018-2023_no-bands/clean_raw_data_2019')
# clean2020 = load_cleandata('projects/ee-kkidia3/assets/clean_data_2018-2023_no-bands/clean_raw_data_2020')
# clean2023 = load_cleandata('projects/ee-kkidia3/assets/clean_data_2018-2023_no-bands/clean_raw_data_2023')


clean2018 = load_cleandata('projects/ee-janet/assets/crop_monitoring_class/clean_raw_data_2018')
clean2019 = load_cleandata('projects/ee-janet/assets/crop_monitoring_class/clean_raw_data_2019')
clean2020 = load_cleandata('projects/ee-janet/assets/crop_monitoring_class/clean_raw_data_2020')
clean2023 = load_cleandata('projects/ee-janet/assets/crop_monitoring_class/clean_raw_data_2023')

# All four years merged into a single FeatureCollection.
clean_data = clean2018.merge(clean2019).merge(clean2020).merge(clean2023)

# Dictionary mapping for easier reference: variable-style name -> yearly FeatureCollection
asset_clean = {
    'clean2018': clean2018,
    'clean2019': clean2019,
    'clean2020': clean2020,
    'clean2023': clean2023
}


In [None]:
# Pull selected feature properties of a GEE FeatureCollection into a pandas DataFrame
def fc_to_df(fc):
    """Download the ID/Class/Name/Sub_class/Year properties of ``fc`` as a DataFrame.

    The values are gathered server-side with a ``toList`` reducer over the five
    property columns and fetched with a single ``getInfo()`` call.

    Args:
        fc: An ``ee.FeatureCollection`` whose features carry the five properties.

    Returns:
        A ``pandas.DataFrame`` with columns ID, Class, Name, Sub_class, Year.
    """
    property_names = ['ID', 'Class', 'Name', 'Sub_class', 'Year']
    list_reducer = ee.Reducer.toList(len(property_names))
    reduced = fc.reduceColumns(list_reducer, property_names)
    records = reduced.get('list').getInfo()
    return pd.DataFrame(records, columns=property_names)

# Merge the yearly FeatureCollections so each encoder is fitted on every year's categories
all_data = clean2018.merge(clean2019).merge(clean2020).merge(clean2023)

# Bring the merged properties client-side as a DataFrame
polygon_df = fc_to_df(all_data)

# Integer-encode 'Class', 'Name' and 'Sub_class'. For each column keep both the
# fitted encoder and a plain {category: integer} dictionary.
label_encoders = {}
mapping_dicts = {}
for col in ['Class', 'Name', 'Sub_class']:
    le = LabelEncoder()
    encoded = le.fit_transform(polygon_df[col])
    polygon_df[col + '_label'] = encoded
    label_encoders[col] = le
    # Position in le.classes_ is the integer code assigned to that category
    mapping_dicts[col] = dict(zip(le.classes_, range(len(le.classes_))))

# polygon_df now carries the *_label columns next to the original text columns
polygon_df.head()

Unnamed: 0,ID,Class,Name,Sub_class,Year,Class_label,Name_label,Sub_class_label
0,00000000000000003483,Crop,Maize,Cereals,2018,0,26,1
1,00000000000000003484,Crop,Maize,Cereals,2018,0,26,1
2,0000000000000000348f,Crop,Maize,Cereals,2018,0,26,1
3,00000000000000003490,Crop,Maize,Cereals,2018,0,26,1
4,00000000000000003492,Crop,Maize,Cereals,2018,0,26,1


In [None]:
# List the column names (the original properties plus the added *_label columns)
polygon_df.columns

In [None]:
#@title export all columns
# Define the assets for each year as (year, asset path) pairs; the export loop passes
# each pair to process_year.
# NOTE(review): these labeled_clean_data_* paths live under projects/ee-janet, while the
# export_labeled_data calls further down write to projects/ee-kkidia3/assets/updated_label —
# confirm which assets this cell is meant to read.
assets = [
    (2018, 'projects/ee-janet/assets/crop_monitoring_class/labeled_clean_data_2018'),
    (2019, 'projects/ee-janet/assets/crop_monitoring_class/labeled_clean_data_2019'),
    (2020, 'projects/ee-janet/assets/crop_monitoring_class/labeled_clean_data_2020'),
    (2023, 'projects/ee-janet/assets/crop_monitoring_class/labeled_clean_data_2023')
]

# Function to calculate vegetation indices
def calculate_indices(image):
    """Compute NDVI, EVI and SAVI for one Sentinel-2 surface-reflectance image.

    The additive constants in EVI (``+ 1``) and SAVI (``+ 0.5``) are defined for
    reflectance in the 0-1 range. Sentinel-2 SR bands are stored as integers
    scaled by 10000, so B2/B4/B8 are rescaled before those two formulas are
    evaluated. Without the rescale the constants are negligible next to the
    band values and SAVI collapses to roughly 1.5 * NDVI.

    Args:
        image: Image exposing bands 'B2' (blue), 'B4' (red) and 'B8' (NIR).

    Returns:
        A three-band float image with bands 'NDVI', 'EVI', 'SAVI'.
    """
    # Scale factor that converts stored Sentinel-2 SR values to 0-1 reflectance
    reflectance_scale = 10000
    reflectance = image.select(['B2', 'B4', 'B8']).divide(reflectance_scale)

    # NDVI is a pure ratio, so it does not depend on the scaling
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')

    # EVI on 0-1 reflectance
    evi = reflectance.expression(
        '2.5 * ((NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1))',
        {
            'NIR': reflectance.select('B8'),
            'RED': reflectance.select('B4'),
            'BLUE': reflectance.select('B2')
        }
    ).rename('EVI')

    # SAVI on 0-1 reflectance (soil adjustment factor L = 0.5)
    savi = reflectance.expression(
        '(NIR - RED) / (NIR + RED + 0.5) * (1 + 0.5)',
        {
            'NIR': reflectance.select('B8'),
            'RED': reflectance.select('B4')
        }
    ).rename('SAVI')

    # Combine indices into one image
    return ndvi.addBands([evi, savi]).toFloat()

# Function to rasterize multiple labels
def rasterize_labels(aoi, properties, scale=10):
    """Burn several feature properties into one multi-band float image.

    Each property becomes one band, named after the property, produced by
    ``reduceToImage`` with a ``first`` reducer.

    Args:
        aoi: FeatureCollection whose features carry the properties.
        properties: Iterable of property names to rasterize.
        scale: Accepted for the caller's convenience; the body does not use it.

    Returns:
        An ``ee.Image`` with one float band per entry in ``properties``.
    """
    def _burn(property_name):
        painted = aoi.reduceToImage(
            properties=[property_name],
            reducer=ee.Reducer.first()
        )
        return painted.rename(property_name).toFloat()

    stacked = ee.Image([_burn(name) for name in properties])
    return stacked.toFloat()

# Process data for each year and AOI
def process_year(year, asset_path):
    """Start a Drive export of vegetation indices plus rasterized labels for one year.

    Builds a median composite of NDVI/EVI/SAVI from Sentinel-2 SR images that
    intersect the AOI, were acquired from 1 July of ``year`` and have less than
    20% cloudy pixels. It then appends the rasterized label bands, clips to the
    AOI and starts an ``Export.image.toDrive`` task.

    NOTE(review): ``filterDate`` treats its end date as exclusive, so images
    from 31 December are not part of the composite — confirm this is intended.

    Args:
        year: Calendar year used for the date filter and the export names.
        asset_path: Earth Engine asset path of the labeled polygons (the AOI).

    Returns:
        None. Prints a confirmation once the task has been started.
    """
    # Labeled polygons double as the area of interest
    aoi = ee.FeatureCollection(asset_path)

    # Sentinel-2 scenes over the AOI for the season, lightly clouded only
    s2_images = (
        ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
        .filterBounds(aoi)
        .filterDate(f'{year}-07-01', f'{year}-12-31')
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
    )

    # Per-image indices reduced to a single median composite
    median_composite = s2_images.map(calculate_indices).median()
    indices_image = median_composite.select(['NDVI', 'EVI', 'SAVI'])

    # Numeric label properties to burn in as extra bands
    label_properties = ['Year', 'Class_label', 'Name_label', 'Sub_class_label']
    labels_raster = rasterize_labels(aoi, label_properties)

    # Indices first, labels after, restricted to the polygons
    clipped_image = indices_image.addBands(labels_raster).clip(aoi)

    export_options = dict(
        image=clipped_image,
        description=f'Vegetation_Indices_and_Labels_{year}',
        folder='GEE_Exports',
        fileNamePrefix=f'Veg_Indices_Labels_{year}',
        scale=10,
        region=aoi.geometry(),
        maxPixels=1e13
    )
    ee.batch.Export.image.toDrive(**export_options).start()
    print(f"Export task started for year {year}")

# Run the process for each asset: one export task is started per (year, asset_path) pair.
for year, asset_path in assets:
    process_year(year, asset_path)

# The tasks are only started here; this cell does not wait for them to finish.
print("All export tasks initiated. Check your Google Drive for the results.")


Export task started for year 2018
Export task started for year 2019
Export task started for year 2020
Export task started for year 2023
All export tasks initiated. Check your Google Drive for the results.


In [None]:
# Function to apply label encoding in GEE
def apply_label_encoding(fc, mapping_dicts):
    """Add Class_label / Name_label / Sub_class_label properties to every feature.

    Args:
        fc: FeatureCollection whose features have 'Class', 'Name' and
            'Sub_class' properties.
        mapping_dicts: Dict keyed by those three column names, each value a
            ``{category: integer}`` dictionary.

    Returns:
        The mapped FeatureCollection. A category missing from its mapping is
        encoded as -1.
    """
    encoded_columns = ['Class', 'Name', 'Sub_class']

    # Server-side copies of the client-side lookup tables
    server_lookups = {}
    for column in encoded_columns:
        server_lookups[column] = ee.Dictionary(mapping_dicts[column])

    def encode_feature(feature):
        encoded = feature
        for column, lookup in server_lookups.items():
            code = lookup.get(encoded.get(column), -1)
            encoded = encoded.set(column + '_label', code)
        return encoded

    return fc.map(encode_feature)


In [None]:

# Function to convert the "Year" property from string to numeric
def convert_year_to_numeric(fc):
    """Return ``fc`` with each feature's 'Year' replaced by its parsed number.

    Args:
        fc: FeatureCollection whose 'Year' property is parsed with
            ``ee.Number.parse``.

    Returns:
        The mapped FeatureCollection.
    """
    return fc.map(
        lambda feature: feature.set('Year', ee.Number.parse(feature.get('Year')))
    )



In [None]:
# Function to export the data with numeric year and label-encoded fields
def export_labeled_data(fc, year, asset_path, mapping_dicts):
    """Start an asset export of ``fc`` with numeric Year and *_label properties.

    Args:
        fc: Source FeatureCollection for one year.
        year: Used only in the task description.
        asset_path: Destination asset ID.
        mapping_dicts: Per-column ``{category: integer}`` dictionaries handed
            to ``apply_label_encoding``.

    Returns:
        None. The export task is started and not awaited.
    """
    # Year is converted first, then the three label columns are added
    prepared_fc = apply_label_encoding(convert_year_to_numeric(fc), mapping_dicts)

    export_job = ee.batch.Export.table.toAsset(
        collection=prepared_fc,
        description=f'export_{year}_labeled_data',
        assetId=asset_path
    )
    export_job.start()

# Export the data year by year. Each call starts one table-to-asset task and returns
# without waiting for it.
# NOTE(review): the destination folder (ee-kkidia3/assets/updated_label) differs from the
# folders other cells read labeled_clean_data_* from — confirm the assets are copied or
# the paths aligned.
export_labeled_data(clean2018, 2018, 'projects/ee-kkidia3/assets/updated_label/labeled_clean_data_2018', mapping_dicts)
export_labeled_data(clean2019, 2019, 'projects/ee-kkidia3/assets/updated_label/labeled_clean_data_2019', mapping_dicts)
export_labeled_data(clean2020, 2020, 'projects/ee-kkidia3/assets/updated_label/labeled_clean_data_2020', mapping_dicts)
export_labeled_data(clean2023, 2023, 'projects/ee-kkidia3/assets/updated_label/labeled_clean_data_2023', mapping_dicts)


# Check the status of all tasks
def check_task_status():
    """Print id, state, description and progress for every task in ``ee.batch.Task.list()``.

    Description and progress are read from each task's ``config`` mapping; a
    key that is absent prints as ``None``.
    """
    for listed_task in ee.batch.Task.list():
        print(f"Task ID: {listed_task.id}")
        print(f"State: {listed_task.state}")
        task_config = listed_task.config
        print(f"Description: {task_config.get('description')}")
        print(f"Progress: {task_config.get('progress')}")
        print('-----------------------------')

# Call this function to see task status (re-run the call to refresh the listing)
check_task_status()



In [None]:
#@title Visualizing the exported geotiffs
# Load the GeoTIFF
file_path = '/content/drive/MyDrive/Crop Monitoring/crop_types_data/raster_data/Veg_Indices_Labels_2019.tif'  # Replace with your GeoTIFF file path
with rasterio.open(file_path) as src:
    bands = [src.read(i) for i in range(1, src.count + 1)]  # Read all bands
    labels = src.descriptions  # Band descriptions; an entry is None when a band has none

# Plot all bands, one figure per band
for i, band in enumerate(bands, start=1):
    # Fall back per band: the descriptions tuple itself is truthy even when
    # its entries are None, so the entry has to be tested, not the tuple.
    band_name = (labels[i - 1] if labels else None) or f"Band {i}"
    plt.figure()
    plt.title(f'Band {i}: {band_name}')
    plt.imshow(band, cmap='viridis')  # Choose a colormap suitable for your data
    plt.colorbar(label='Value')
    plt.show()


In [None]:
# Open the GeoTIFF and pair every pixel centre with its band values
with rasterio.open(file_path) as src:
    transform = src.transform
    bands = src.read()  # array indexed as (band, row, col)

# Get array dimensions
rows, cols = bands.shape[1], bands.shape[2]

# Generate all row, col combinations (row-major, matching the reshape below)
row_indices, col_indices = np.meshgrid(np.arange(rows), np.arange(cols), indexing='ij')

# Convert row, col to geographic coordinates with ONE vectorised call instead
# of one rasterio call per pixel. The result is converted to plain Python
# floats so the records print the same way regardless of rasterio version.
xs, ys = rasterio.transform.xy(
    transform, row_indices.ravel(), col_indices.ravel(), offset='center'
)
xs = np.asarray(xs, dtype=float).tolist()
ys = np.asarray(ys, dtype=float).tolist()
coords = list(zip(xs, ys))

# Flatten pixel values
pixel_values = bands.reshape(bands.shape[0], -1).T  # (n_pixels, n_bands)

# Combine coordinates and pixel values
data = [{"x": x, "y": y, "values": vals.tolist()} for (x, y), vals in zip(coords, pixel_values)]

# Example: Print first 5 coordinates with values
for d in data[:5]:
    print(d)


{'x': -16.577914495019503, 'y': 14.624887235815251, 'values': [nan, nan, nan, nan, nan, nan, nan]}
{'x': -16.577824663491093, 'y': 14.624887235815251, 'values': [nan, nan, nan, nan, nan, nan, nan]}
{'x': -16.57773483196268, 'y': 14.624887235815251, 'values': [nan, nan, nan, nan, nan, nan, nan]}
{'x': -16.577645000434266, 'y': 14.624887235815251, 'values': [nan, nan, nan, nan, nan, nan, nan]}
{'x': -16.577555168905857, 'y': 14.624887235815251, 'values': [nan, nan, nan, nan, nan, nan, nan]}


In [None]:
# Open the GeoTIFF, keep only pixels with at least one non-NaN band, and save them as CSV
with rasterio.open(file_path) as src:
    transform = src.transform
    bands = src.read()  # array indexed as (band, row, col)

# Get array dimensions
rows, cols = bands.shape[1], bands.shape[2]

# Generate all row, col combinations (row-major, matching the reshape below)
row_indices, col_indices = np.meshgrid(np.arange(rows), np.arange(cols), indexing='ij')

# Flatten pixel values
pixel_values = bands.reshape(bands.shape[0], -1).T  # (n_pixels, n_bands)

# Filter valid data (exclude pixels where all values are NaN) BEFORE computing
# coordinates, so the per-pixel work is only done for pixels that are kept.
valid_mask = ~np.isnan(pixel_values).all(axis=1)
valid_pixels = pixel_values[valid_mask]

if valid_mask.any():
    # One vectorised call for all valid pixel centres
    xs, ys = rasterio.transform.xy(
        transform,
        row_indices.ravel()[valid_mask],
        col_indices.ravel()[valid_mask],
        offset='center'
    )
    xs = np.asarray(xs, dtype=float).tolist()
    ys = np.asarray(ys, dtype=float).tolist()
else:
    xs, ys = [], []

# Same record layout as before: one dict per valid pixel
valid_data = [
    {"x": x, "y": y, "values": vals.tolist()}
    for x, y, vals in zip(xs, ys, valid_pixels)
]

# Separate values into columns (band order of the exported GeoTIFF)
columns = ['NDVI', 'EVI', 'SAVI', 'Year', 'Class_label', 'Name_label', 'Sub_class_label']
records = [
    {"x": d["x"], "y": d["y"], **dict(zip(columns, d["values"]))}
    for d in valid_data
]

# Convert to a DataFrame
df = pd.DataFrame(records)

# Save to a CSV for further analysis
df_path = '/content/drive/MyDrive/Crop Monitoring/crop_types_data/raster_data/2019_valid_geo_values.csv'
df.to_csv(df_path, index=False)
# Report the path actually written
print(f"Valid data saved to {df_path}")

# Display the first few rows

In [None]:
# Preview the per-pixel table (x, y and one column per band)
df.head()

Unnamed: 0,x,y,NDVI,EVI,SAVI,Year,Class_label,Name_label,Sub_class_label
0,-16.424572,14.624887,0.277535,0.409466,0.416268,2019.0,0.0,53.0,1.0
1,-16.424482,14.624887,0.277535,0.409466,0.416268,2019.0,0.0,53.0,1.0
2,-16.425381,14.624797,0.199847,0.290833,0.299747,2019.0,0.0,21.0,3.0
3,-16.425291,14.624797,0.218817,0.315819,0.3282,2019.0,0.0,21.0,3.0
4,-16.424662,14.624797,0.232973,0.324993,0.349433,2019.0,0.0,53.0,1.0


In [None]:
#@title Checking the number of images available between August and October
# Assets for each year as (year, asset path) pairs.
# This rebinds the module-level `assets` list with the same four entries used by the export cell.
assets = [
    (2018, 'projects/ee-janet/assets/crop_monitoring_class/labeled_clean_data_2018'),
    (2019, 'projects/ee-janet/assets/crop_monitoring_class/labeled_clean_data_2019'),
    (2020, 'projects/ee-janet/assets/crop_monitoring_class/labeled_clean_data_2020'),
    (2023, 'projects/ee-janet/assets/crop_monitoring_class/labeled_clean_data_2023')
]

# Function to count images and extract acquisition dates for a specific asset and year
def check_images_and_dates_by_month(year, asset_path):
    """List Sentinel-2 SR acquisition dates for August, September and October of ``year``.

    Only images that intersect the asset's geometry and have less than 20%
    cloudy pixels are considered.

    Args:
        year: Calendar year to inspect.
        asset_path: Earth Engine asset path whose geometry is the region of interest.

    Returns:
        Dict with keys 'Year', 'August', 'September', 'October'. Each month
        maps to a list of 'YYYY-MM-DD' strings (UTC), one per image.
    """
    from datetime import datetime, timezone

    # Load the region of interest (ROI) from the asset
    roi = ee.FeatureCollection(asset_path).geometry()

    # Only August-October is reported, so only that window is requested.
    # filterDate's end is exclusive, hence 1 November to keep 31 October.
    collection = (
        ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
        .filterBounds(roi)
        .filterDate(f'{year}-08-01', f'{year}-11-01')
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
    )

    # One round trip for all timestamps (epoch milliseconds). Formatting is
    # done locally instead of issuing one getInfo() per image.
    timestamps_ms = collection.aggregate_array('system:time_start').getInfo()

    month_names = {8: 'August', 9: 'September', 10: 'October'}
    grouped_dates = {name: [] for name in month_names.values()}
    for millis in timestamps_ms:
        acquired = datetime.fromtimestamp(millis / 1000, tz=timezone.utc)
        month_name = month_names.get(acquired.month)
        if month_name is not None:
            grouped_dates[month_name].append(acquired.strftime('%Y-%m-%d'))

    return {
        'Year': year,
        'August': grouped_dates['August'],
        'September': grouped_dates['September'],
        'October': grouped_dates['October']
    }

# Process data for each year and asset: one result dict per (year, asset_path) pair
results = []
for year, asset_path in assets:
    result = check_images_and_dates_by_month(year, asset_path)
    results.append(result)

# Convert results to a DataFrame: one row per year; each month column holds a list of date strings
images_df = pd.DataFrame(results)

# Save to CSV for reference
output_path = '/content/drive/MyDrive/Crop Monitoring/crop_types_data/raster_data/image_availability_aug_sep_oct.csv'
images_df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

# Display the first few rows
images_df.head()

Results saved to /content/drive/MyDrive/Crop Monitoring/crop_types_data/raster_data/image_availability_aug_sep_oct.csv


Unnamed: 0,Year,August,September,October
0,2018,[],[],[]
1,2019,"[2019-08-01, 2019-08-01, 2019-08-11, 2019-08-1...","[2019-09-10, 2019-09-10, 2019-09-15, 2019-09-1...","[2019-10-10, 2019-10-10, 2019-10-15, 2019-10-2..."
2,2020,"[2020-08-02, 2020-08-02, 2020-08-02, 2020-08-0...","[2020-09-03, 2020-09-03, 2020-09-03, 2020-09-0...","[2020-10-01, 2020-10-01, 2020-10-01, 2020-10-0..."
3,2023,"[2023-08-02, 2023-08-02, 2023-08-02, 2023-08-0...","[2023-09-01, 2023-09-01, 2023-09-03, 2023-09-0...","[2023-10-01, 2023-10-01, 2023-10-01, 2023-10-0..."


In [None]:
# Rebuild `df` straight from the valid_data records. This REPLACES the per-band table
# created earlier: the columns are now x, y and a single list-valued 'values' column.
df = pd.DataFrame(valid_data)
df

Unnamed: 0,x,y,values
0,-16.424572,14.624887,"[0.2775346040725708, 0.4094661474227905, 0.416..."
1,-16.424482,14.624887,"[0.2775346040725708, 0.4094661474227905, 0.416..."
2,-16.425381,14.624797,"[0.19984690845012665, 0.29083263874053955, 0.2..."
3,-16.425291,14.624797,"[0.218816876411438, 0.3158193826675415, 0.3282..."
4,-16.424662,14.624797,"[0.2329733669757843, 0.32499346137046814, 0.34..."
...,...,...,...
122772,-16.523926,14.363657,"[0.3264831304550171, 0.6037906408309937, 0.489..."
122773,-16.523836,14.363657,"[0.3098805546760559, 0.520560622215271, 0.4647..."
122774,-16.523746,14.363657,"[0.25549739599227905, 0.4235258996486664, 0.38..."
122775,-16.523656,14.363657,"[0.26278743147850037, 0.4193413257598877, 0.39..."




In [None]:
#@title New Dataset with Numeric Labels
# Function to load clean data from GEE assets
# (same name and body as the load_cleandata defined in the "GEE Assets" cell)
def load_cleandata(asset_id):
    """Return the Earth Engine FeatureCollection stored at ``asset_id``.

    Args:
        asset_id: Earth Engine asset path passed to ``ee.FeatureCollection``.

    Returns:
        The ``ee.FeatureCollection`` handle for that asset.
    """
    feature_collection = ee.FeatureCollection(asset_id)
    return feature_collection

# Load the datasets. This rebinds clean2018..clean2023 (and clean_data below) from the raw
# assets to the labeled assets, so every later cell works on the labeled collections.
# NOTE(review): the folder here is ee-kkidia3/assets/clean_data_label, whereas the
# export_labeled_data calls write to ee-kkidia3/assets/updated_label — confirm which is current.
clean2018 = load_cleandata('projects/ee-kkidia3/assets/clean_data_label/labeled_clean_data_2018')
clean2019 = load_cleandata('projects/ee-kkidia3/assets/clean_data_label/labeled_clean_data_2019')
clean2020 = load_cleandata('projects/ee-kkidia3/assets/clean_data_label/labeled_clean_data_2020')
clean2023 = load_cleandata('projects/ee-kkidia3/assets/clean_data_label/labeled_clean_data_2023')

# Merge all datasets into one FeatureCollection
clean_data = clean2018.merge(clean2019).merge(clean2020).merge(clean2023)

In [None]:
#@title Class CSV

# Function to assign numeric labels server-side in Earth Engine
def assign_numeric_labels(fc):
    """Add a 'Class_numeric' property to every feature of ``fc``.

    The number is the position of the feature's 'Class' value in the list of
    distinct 'Class' values of ``fc``.

    Args:
        fc: FeatureCollection whose features have a 'Class' property.

    Returns:
        The mapped FeatureCollection.
    """
    # Distinct 'Class' values; a value's position in this list is its label
    distinct_classes = fc.distinct('Class').aggregate_array('Class')

    def add_numeric_labels(feature):
        numeric_props = {
            'Class_numeric': distinct_classes.indexOf(feature.get('Class'))
        }
        return feature.set(numeric_props)

    return fc.map(add_numeric_labels)

# Assign numeric labels to the merged data (adds the Class_numeric property)
labeled_clean_data = assign_numeric_labels(clean_data)

# Group by 'Class' and count the number of features in each class
def calculate_class_counts(fc):
    """Summarise ``fc`` as one geometry-less feature per distinct 'Class'.

    Args:
        fc: FeatureCollection whose features have 'Class' and 'Class_numeric'
            properties.

    Returns:
        A FeatureCollection whose features carry 'Class', 'Class_numeric'
        (taken from the first feature of that class) and 'Count'.
    """
    def class_summary(class_value):
        members = fc.filter(ee.Filter.eq('Class', class_value))
        summary_props = {
            'Class': class_value,
            'Class_numeric': members.first().get('Class_numeric'),
            'Count': members.size()
        }
        return ee.Feature(None, summary_props)

    distinct_classes = fc.distinct('Class').aggregate_array('Class')
    return ee.FeatureCollection(distinct_classes.map(class_summary))

# Calculate class counts and numeric labels (one summary feature per class)
class_counts_fc = calculate_class_counts(labeled_clean_data)

# Export the class counts to Google Drive as a single CSV file
task = ee.batch.Export.table.toDrive(
    collection=class_counts_fc,
    description='class_counts_export',
    folder='Crop_Coding',  # Folder in Google Drive where the file will be saved
    fileNamePrefix='class_counts',
    fileFormat='CSV'
)

# Start the export task; the cell does not wait for it to finish
task.start()

In [None]:
#@title Sub_Class CSV
# Function to assign numeric labels server-side in Earth Engine
def assign_numeric_labels(fc):
    """Add Class_numeric, Name_numeric and Sub_class_numeric to every feature.

    Each number is the position of the feature's value in the list of distinct
    values of that property across ``fc``.

    Args:
        fc: FeatureCollection whose features have 'Class', 'Name' and
            'Sub_class' properties.

    Returns:
        The mapped FeatureCollection.
    """
    # (source property, target property, distinct values of the source)
    lookups = (
        ('Class', 'Class_numeric', fc.distinct('Class').aggregate_array('Class')),
        ('Name', 'Name_numeric', fc.distinct('Name').aggregate_array('Name')),
        ('Sub_class', 'Sub_class_numeric', fc.distinct('Sub_class').aggregate_array('Sub_class')),
    )

    def add_numeric_labels(feature):
        return feature.set({
            target: values.indexOf(feature.get(source))
            for source, target, values in lookups
        })

    return fc.map(add_numeric_labels)

# Assign numeric labels to the merged data (adds Class_numeric, Name_numeric, Sub_class_numeric)
labeled_clean_data = assign_numeric_labels(clean_data)

# Function to group by 'Sub_class', count the features, and export as one CSV
def export_grouped_by_subclass(fc):
    """Start a Drive CSV export with one row per distinct 'Sub_class'.

    Each row is the representative feature kept by ``distinct('Sub_class')``,
    with a 'Count' property holding the number of features in ``fc`` that share
    its 'Sub_class'.

    Args:
        fc: FeatureCollection carrying 'Class', 'Class_numeric', 'Sub_class'
            and 'Sub_class_numeric' properties.

    Returns:
        None. The export task is started and not awaited.
    """
    def _with_count(feature):
        same_subclass = ee.Filter.eq('Sub_class', feature.get('Sub_class'))
        return feature.set('Count', fc.filter(same_subclass).size())

    # Properties kept in the CSV output
    output_columns = [
        'Class', 'Class_numeric',
        'Sub_class', 'Sub_class_numeric',
        'Count'
    ]
    grouped_fc = fc.distinct('Sub_class').map(_with_count).select(output_columns)

    export_job = ee.batch.Export.table.toDrive(
        collection=grouped_fc,
        description='grouped_subclass_export',
        folder='Crop_Coding',
        fileNamePrefix='grouped_subclass_data',
        fileFormat='CSV'
    )
    export_job.start()

# Call the function to export grouped data by subclass in one CSV (starts the Drive export task)
export_grouped_by_subclass(labeled_clean_data)

In [None]:
#@title Name CSV
# Function to assign numeric labels server-side in Earth Engine
# (same name and behaviour as the definition in the "Sub_Class CSV" cell)
def assign_numeric_labels(fc):
    """Add Class_numeric, Name_numeric and Sub_class_numeric to every feature.

    Each number is the position of the feature's value in the list of distinct
    values of that property across ``fc``.

    Args:
        fc: FeatureCollection whose features have 'Class', 'Name' and
            'Sub_class' properties.

    Returns:
        The mapped FeatureCollection.
    """
    # (source property, target property, distinct values of the source)
    lookups = (
        ('Class', 'Class_numeric', fc.distinct('Class').aggregate_array('Class')),
        ('Name', 'Name_numeric', fc.distinct('Name').aggregate_array('Name')),
        ('Sub_class', 'Sub_class_numeric', fc.distinct('Sub_class').aggregate_array('Sub_class')),
    )

    def add_numeric_labels(feature):
        return feature.set({
            target: values.indexOf(feature.get(source))
            for source, target, values in lookups
        })

    return fc.map(add_numeric_labels)

# Assign numeric labels to the merged data (adds Class_numeric, Name_numeric, Sub_class_numeric)
labeled_clean_data = assign_numeric_labels(clean_data)

# Function to group by 'Name', calculate counts, and export to CSV
def export_grouped_by_name(fc):
    """Start a Drive CSV export with one row per distinct 'Name'.

    The distinct names are fetched client-side with ``getInfo()``. For each
    name a geometry-less feature is built with the name, the 'Name_numeric',
    'Class' and 'Sub_class' of the first matching feature, and the number of
    matching features as 'Count'.

    Args:
        fc: FeatureCollection carrying 'Name', 'Name_numeric', 'Class' and
            'Sub_class' properties.

    Returns:
        None. The export task is started and not awaited.
    """
    def _summarise(name):
        matching = fc.filter(ee.Filter.eq('Name', name))
        representative = matching.first()
        return ee.Feature(None, {
            'Name': name,
            'Name_numeric': representative.get('Name_numeric'),
            'Class': representative.get('Class'),
            'Sub_class': representative.get('Sub_class'),
            'Count': matching.size()
        })

    # Client-side list of the distinct names
    unique_names = fc.distinct('Name').aggregate_array('Name').getInfo()
    grouped_fc = ee.FeatureCollection([_summarise(name) for name in unique_names])

    export_job = ee.batch.Export.table.toDrive(
        collection=grouped_fc,
        description='grouped_by_name_data_export',
        folder='Crop_Coding',  # Name of the folder in your Google Drive
        fileNamePrefix='grouped_by_name_data',
        fileFormat='CSV'
    )
    export_job.start()

# Call the function to export grouped data by name (starts the Drive export task)
export_grouped_by_name(labeled_clean_data)

In [None]:
# @title Read polygons properties

# Define a function to fetch features in batches
def get_feature_batch(collection, batch_size=5000):
    """Download the property dictionaries of every feature, ``batch_size`` at a time.

    Args:
        collection: FeatureCollection to read.
        batch_size: Maximum number of features requested per ``getInfo()`` call.

    Returns:
        A list with one property dictionary per feature, in collection order.
    """
    def _as_dictionary(element):
        return ee.Feature(element).toDictionary()

    # Total number of features, fetched once up front
    total_count = collection.size().getInfo()

    all_features = []
    for offset in range(0, total_count, batch_size):
        page = collection.toList(batch_size, offset).map(_as_dictionary)
        all_features += page.getInfo()
    return all_features

# Use the function to get all features from the collection
feature_properties = get_feature_batch(ee.FeatureCollection(clean_data))

# Convert to DataFrame: one row per feature, one column per property.
# NOTE(review): the saved output of this cell is "NameError: name 'pd' is not defined",
# i.e. it was last run in a kernel where the pandas import cell had not been executed.
df = pd.DataFrame(feature_properties)

# Show the first few rows
df.head()




NameError: name 'pd' is not defined

In [None]:
# Open the current `df` in a Colab InteractiveSheet
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=df)

In [None]:
# Extract Labels
import pandas as pd

# Function to fetch features in batches
# (same name and behaviour as the definition in the "Read polygons properties" cell)
def get_feature_batch(collection, batch_size=5000):
    """Download the property dictionaries of every feature, ``batch_size`` at a time.

    Args:
        collection: FeatureCollection to read.
        batch_size: Maximum number of features requested per ``getInfo()`` call.

    Returns:
        A list with one property dictionary per feature, in collection order.
    """
    def _as_dictionary(element):
        return ee.Feature(element).toDictionary()

    # Total number of features, fetched once up front
    total_count = collection.size().getInfo()

    all_features = []
    for offset in range(0, total_count, batch_size):
        page = collection.toList(batch_size, offset).map(_as_dictionary)
        all_features += page.getInfo()
    return all_features

# Use the function to get all features from the collection
feature_properties = get_feature_batch(ee.FeatureCollection(clean_data))

# Convert to DataFrame: one row per feature, one column per property
df = pd.DataFrame(feature_properties)

# Group the DataFrame by "Name", "Class" and "Sub_class" and count the rows in each group
grouped_df = df.groupby(['Name', 'Class', 'Sub_class']).size().reset_index(name='Count')

# Export the grouped data to a CSV file (relative path: written to the current working directory)
grouped_df.to_csv('grouped_polygons.csv', index=False)

# To display the grouped table using Google Colab sheets
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=grouped_df)

# Show the first few rows
grouped_df.head()


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Violin plot of the per-group 'Count' values for each 'Class'.
# The original referenced `_df_8`, an IPython output-cache name that only exists
# when a particular cell was executed eighth in the session. `grouped_df` is the
# named DataFrame in this notebook that has both 'Count' and 'Class' columns.
# NOTE(review): confirm `_df_8` was indeed the grouped table.
figsize = (12, 1.2 * len(grouped_df['Class'].unique()))
plt.figure(figsize=figsize)
sns.violinplot(grouped_df, x='Count', y='Class', inner='stick', palette='Dark2')
sns.despine(top=True, right=True, bottom=True, left=True)