In [None]:
!pip install xgboost --quiet

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb  # Import XGBoost

import ast  #Import the ast library
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier


In [None]:
# Load data. Update DATA_DIR if the competition files live somewhere else;
# both CSV paths are derived from it so there is a single place to edit.
DATA_DIR = '/kaggle/input/ml-4-eo-s-2025-crop-classification-challenge'

train_data = pd.read_csv(f'{DATA_DIR}/train.csv', sep=',')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv', sep=',')

In [None]:
# Preview the first rows of the training frame as loaded from train.csv.
train_data.head()

In [None]:
# Preview the first rows of the test frame as loaded from test.csv.
test_data.head()

In [None]:


# Work on copies so the frames loaded from disk (train_data / test_data) stay untouched.
train_df, test_df = train_data.copy(), test_data.copy()

# Function to safely evaluate string representations of lists
def safe_eval(x):
    """Parse a Python-literal string (e.g. ``"[0.1, 0.2]"``) into its value.

    Parameters
    ----------
    x : object
        Cell value to parse. Typically a string; any other type is returned
        unchanged because ``ast.literal_eval`` rejects it.

    Returns
    -------
    object
        The evaluated literal when ``x`` is a valid literal expression,
        otherwise ``x`` itself.
    """
    try:
        return ast.literal_eval(x)  # Convert string list to actual list
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for malformed / non-literal
        # input. Anything else (e.g. KeyboardInterrupt) must propagate so a
        # long-running apply() can still be interrupted.
        return x

# Function to process the DataFrame
def process_dataframe(df):
    """Run ``safe_eval`` over every cell of ``df``, column by column.

    The frame is modified in place and the same object is returned.
    """
    for column_name in df.columns:
        parsed_column = df[column_name].apply(safe_eval)
        df[column_name] = parsed_column
    return df

# Parse stringified literals in both frames (each frame is modified in place and returned)
train_df_processed = process_dataframe(train_df)
test_df_processed = process_dataframe(test_df)

# A column is list-valued if ANY training row holds a list. Looking only at the
# first row would miss a column whose first value is missing/unparsable and
# would raise IndexError on an empty frame.
list_columns = [
    col
    for col in train_df_processed.columns
    if train_df_processed[col].apply(lambda value: isinstance(value, list)).any()
]
print(f"Columns containing lists: {list_columns}")

# Flatten list columns (expand into multiple columns)
def expand_list_columns(df, list_columns):
    """Expand list-valued columns into one scalar column per list position.

    For each ``col`` in ``list_columns`` the columns ``col_1 .. col_k`` are
    added, where ``k`` is the longest list found in ``df[col]``; the original
    column is then dropped. Rows whose value is not a list, or whose list is
    shorter than ``k``, get a missing value in the corresponding positions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to expand. It is modified in place.
    list_columns : list of str
        Names of the columns holding lists.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after expansion.

    Notes
    -----
    ``k`` is computed from ``df`` itself, so two frames expanded separately end
    up with the same columns only if their longest lists have equal length.
    """
    for col in list_columns:
        # default=0 keeps an empty frame from raising "max() arg is an empty sequence"
        max_length = max(
            df[col].apply(lambda x: len(x) if isinstance(x, list) else 0),
            default=0,
        )
        for i in range(max_length):
            df[f"{col}_{i+1}"] = df[col].apply(
                lambda x: x[i] if isinstance(x, list) and len(x) > i else None
            )
        df.drop(columns=[col], inplace=True)  # Drop original list column
    return df

# Expand the list-valued columns of both frames into scalar columns
train_df_final = expand_list_columns(train_df_processed, list_columns)
test_df_final = expand_list_columns(test_df_processed, list_columns)

# Show a preview and the column dtypes of each expanded frame
for heading, frame in (
    ("\nProcessed training data (after expansion):", train_df_final),
    ("\nProcessed test data (after expansion):", test_df_final),
):
    print(heading)
    print(frame.head())
    print(frame.dtypes)


In [None]:
# Total count of missing cells in each expanded frame
train_missing_total = train_df_final.isna().sum().sum()
test_missing_total = test_df_final.isna().sum().sum()

print("Missing values in training data:", train_missing_total, sep="\n")
print("\nMissing values in test data:", test_missing_total, sep="\n")


In [None]:
# Per-column missing counts for the test frame, keeping only columns that have any
test_missing_per_column = test_df_final.isnull().sum()
print("Columns with missing values in test data:")
print(test_missing_per_column[test_missing_per_column > 0])


In [None]:
# Impute missing EVI values in the test frame with per-column means.
# The means are taken over the EVI columns only, so a non-numeric column
# elsewhere in the frame cannot make DataFrame.mean() fail, and over finite
# values only, so an infinite entry cannot turn the fill value into inf/NaN.
evi_columns = ['EVI_1', 'EVI_2', 'EVI_3', 'EVI_4']
evi_fill_values = test_df_final[evi_columns].replace([np.inf, -np.inf], np.nan).mean()
test_df_final[evi_columns] = test_df_final[evi_columns].fillna(evi_fill_values)


In [None]:
# Re-run the per-column missing check on the test frame
print("Columns with missing values in test data:")
print(test_df_final.isnull().sum().loc[lambda counts: counts > 0])

In [None]:
# Count cells equal to +inf or -inf anywhere in the test frame
print("\nChecking for infinity values in test data:")
positive_inf_count = (test_df_final == np.inf).sum().sum()
negative_inf_count = (test_df_final == -np.inf).sum().sum()
print(positive_inf_count + negative_inf_count)


In [None]:
# NOTE: disabled on purpose - infinite values are replaced in a later cell, on the `test_df` copy.
# test_df_final.replace([np.inf, -np.inf], 1e6, inplace=True)


In [None]:
# Rebind the working names to copies of the expanded frames
test_df, train_df = test_df_final.copy(), train_df_final.copy()

In [None]:
import pandas as pd

# Integer code assigned to each class name found in `crop_label`
label_mapping = {
    'Maize': 1,
    'Sorghum': 2,
    'Tree': 3,
    'Built': 4,
    'Bare_Soil': 5,
    'Water': 6
}

train_df['encoded_label'] = train_df['crop_label'].map(label_mapping)

# Series.map yields NaN for any value missing from the mapping. Fail here, with
# the offending values, instead of carrying NaN targets into model training.
unmapped_labels = train_df.loc[train_df['encoded_label'].isna(), 'crop_label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"crop_label values missing from label_mapping: {list(unmapped_labels)}")

# Show a preview of the encoded training frame rather than printing the whole frame
print("Encoded Training Data:")
print(train_df.head())



In [None]:
# Total number of missing cells in the training frame (includes the new encoded_label column).
train_df.isna().sum().sum()

In [None]:
# Total number of missing cells in the test frame.
test_df.isna().sum().sum()

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:


# Swap every +inf / -inf in the test frame for the large finite value 1e6
test_df = test_df.replace([np.inf, -np.inf], 1e6)

# Sanity check: number of infinite cells left (expected to print 0)
remaining_inf_count = test_df.isin([np.inf, -np.inf]).sum().sum()
print(remaining_inf_count)


In [None]:
# Total number of missing cells in the test frame after the infinity replacement.
test_df.isna().sum().sum()

In [None]:
# Same check as the previous cell (duplicate).
test_df.isna().sum().sum()

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Show the dtype of every test column
print(test_df.dtypes)

# List the columns whose dtype is not numeric
non_numeric_columns = test_df.select_dtypes(exclude='number').columns
print("Non-numeric columns:", non_numeric_columns)


In [None]:
# First values of the EVI_1 column in the test frame.
print(test_df['EVI_1'].head())


In [None]:
# Preview the training frame.
train_df.head()

In [None]:
# (rows, columns) of the test frame.
test_df.shape

In [None]:
# Preview the test frame.
test_df.head()

In [None]:
# Same shape check as two cells above (duplicate).
test_df.shape


In [None]:
# Same preview as two cells above (duplicate).
test_df.head()

In [None]:


# ---------------------------------------------------------------------------
# Model training: tune Random Forest and XGBoost with grid search, combine the
# tuned models in a hard-voting ensemble, score it on the hold-out split and
# write the submission file.
# ---------------------------------------------------------------------------

# Separate features (X) and target (y) from the training data.
# `field_id` is dropped from the test features below, so it is excluded here as
# well (when present) to keep both feature sets identical.
X = train_df.drop(columns=['crop_label', 'encoded_label', 'field_id'], errors='ignore')

# Current XGBoost releases reject class labels that do not form 0..n_classes-1,
# and `encoded_label` starts at 1. Train on 0-based codes and map predictions
# back to the original codes before writing the submission.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(train_df['encoded_label']),
    index=train_df.index,
    name='encoded_label',
)

# Split data into train and hold-out sets for model validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Standardize features using StandardScaler (fit on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prepare the competition test data: select exactly the training feature
# columns, in the same order. A missing feature raises KeyError here instead of
# surfacing as a shape/feature-name error inside the scaler.
X_test_final = test_df[X.columns]
X_test_final_scaled = scaler.transform(X_test_final)

# Random Forest hyperparameters.
# 'auto' is not listed for max_features: it was removed from
# RandomForestClassifier in scikit-learn 1.3 (for classifiers it was an alias
# of 'sqrt'), so every candidate using it would fail to fit.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# XGBoost hyperparameters
xgb_param_grid = {
    "n_estimators": [150, 200, 250, 300],
    "max_depth": [5, 6, 7, 8],
    "learning_rate": [0.05, 0.1, 0.2],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "gamma": [0, 0.1, 0.2],
    # NOTE(review): scale_pos_weight is a binary-classification setting and
    # presumably has no effect on this multi-class problem - confirm.
    "scale_pos_weight": [1]
}

# Initialize models
rf_model = RandomForestClassifier(random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42)

# Grid search for Random Forest (5-fold CV, weighted F1)
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=5, scoring='f1_weighted', verbose=2, n_jobs=-1)
rf_grid_search.fit(X_train_scaled, y_train)

# Grid search for XGBoost (5-fold CV, weighted F1)
xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring='f1_weighted', verbose=2, n_jobs=-1)
xgb_grid_search.fit(X_train_scaled, y_train)

# Best models after tuning
best_rf_model = rf_grid_search.best_estimator_
best_xgb_model = xgb_grid_search.best_estimator_

# Hard-voting ensemble of the two tuned models, trained on the training split
voting_clf = VotingClassifier(estimators=[('rf', best_rf_model), ('xgb', best_xgb_model)], voting='hard')
voting_clf.fit(X_train_scaled, y_train)

# Score the ensemble on the hold-out split (the split was previously created but never used)
holdout_predictions = voting_clf.predict(X_test_scaled)
holdout_f1 = f1_score(y_test, holdout_predictions, average='weighted')
print(f"Hold-out weighted F1 (voting ensemble): {holdout_f1:.4f}")
print(classification_report(
    label_encoder.inverse_transform(y_test),
    label_encoder.inverse_transform(holdout_predictions),
))

# Predict the competition test data and convert back to the original label codes
voting_predictions = label_encoder.inverse_transform(voting_clf.predict(X_test_final_scaled))

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'field_id': test_df['field_id'],  # Keeps each prediction tied to its row id
    'encoded_label': voting_predictions  # Final predicted labels, in the original coding
})

# Save the submission file
submission_df.to_csv('submission1.csv', index=False)

# Report the best parameters found for both models
print(f"Best Random Forest Hyperparameters: {rf_grid_search.best_params_}")
print(f"Best XGBoost Hyperparameters: {xgb_grid_search.best_params_}")
