In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from bayes_opt import BayesianOptimization

%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the working dataset from a CSV file.
# A JSON-based loader for a different file is left commented out below.
# df = pd.read_json('./data/dataSet_Culture_06102023-POINT.json')

df = pd.read_csv('./felix_clening.csv')

In [3]:
# NOTE(review): the recorded run of this cell ended in KeyError: 'Analysis Date',
# so the frame loaded above had no such column. This cell presumably targets the raw
# JSON export (dict-valued 'polygon' entries) -- confirm which input is intended.

# Split each 'Analysis Date' string on '-' once; pieces 0 / 1 / 2 feed year / month / day.
date_parts = df['Analysis Date'].apply(lambda stamp: stamp.split('-'))
df['year'] = date_parts.apply(lambda parts: parts[0])

# Pull the 'x' and 'y' entries out of every 'polygon' value.
df['polygon_x'] = df['polygon'].apply(lambda point: point['x'])
df['polygon_y'] = df['polygon'].apply(lambda point: point['y'])

df['month'] = date_parts.apply(lambda parts: parts[1])
df['day'] = date_parts.apply(lambda parts: parts[2])

# Flag rows that are NDVI readings with an average value of at least 0.15.
is_ndvi = df['indextype'] == 'NDVI'
reaches_threshold = df['averagevalue'] >= 0.15
df['vegetation'] = is_ndvi & reaches_threshold

df = df.drop(columns=['polygon', 'soil_id'])

# Build a text key "<x>_<y>_<year>" so rows sharing a location and a year share a group.
x_text = df['polygon_x'].astype(str)
y_text = df['polygon_y'].astype(str)
year_text = df['year'].astype(str)
df['combined'] = x_text + '_' + y_text + '_' + year_text

# Number the groups starting from 1 and store the number as 'id'.
df['id'] = df.groupby('combined').ngroup().add(1)

# The key and the columns it was built from are only needed to derive 'id'.
df = df.drop(['combined', 'polygon_x', 'polygon_y', 'year'], axis=1)

KeyError: 'Analysis Date'

In [None]:
# Keep only the NDVI rows, discard columns that are no longer needed, and
# switch to the shorter column names used by the rest of the notebook.
df = (
    df.loc[df['indextype'] == 'NDVI']
    .drop(columns=['indextype', 'year contour', 'month', 'day', 'vegetation', 'type_culture_name'])
    .rename(columns={
        'culture_name': 'class',
        'averagevalue': 'red',
        'Analysis Date': 'date',
    })
)

# Parse the date strings into datetime values.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Encode the categorical columns as integer codes.
#
# One LabelEncoder is fitted per column and kept in `label_encoders`. Refitting a
# single shared encoder on several columns leaves it holding only the classes of the
# last column fitted, so the integer codes of the other columns (including the
# target 'class') could no longer be mapped back to their original names.
label_encoders = {
    column: LabelEncoder()
    for column in ('class', 'district_name', 'soil_name')
}
for column, encoder in label_encoders.items():
    df[column] = encoder.fit_transform(df[column])

# `label_encoder` stays defined and refers to the encoder of the target column, so
# label_encoder.inverse_transform(...) turns predicted codes back into class names.
label_encoder = label_encoders['class']

# Display the distinct encoded class codes.
df['class'].unique()

In [None]:
# Convert date to multiple columns (year, month, day)
df['year'] = pd.to_datetime(df['date']).dt.year
df['month'] = pd.to_datetime(df['date']).dt.month
df['day'] = pd.to_datetime(df['date']).dt.day
df.drop('date', axis=1, inplace=True)

# Splitting data    
X = df.drop(['class', 'id', 'year', 'day'], axis=1) 
y = df['class']

In [None]:
# Pivot table
pivot_df = df.pivot_table(index='id', columns='month', values='red', aggfunc='mean')

# Fill NaN values (assuming you want to fill with zeros, adjust if needed)
# pivot_df = pivot_df.fillna(0)

# Rename columns as needed
pivot_df.columns = [f'red_{col}_month' for col in pivot_df.columns]

# Reset the index so 'id' becomes a column
pivot_df = pivot_df.reset_index()

# Assuming each 'id' has a unique 'class', get the 'class' value for each 'id' and add to the pivot dataframe
pivot_df['class'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['class'])
pivot_df['elevation_contour'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['elevation_contour'])
pivot_df['district_name'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['district_name'])
pivot_df['soil_name'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['soil_name'])

# Reordering columns
pivot_df = pivot_df[['red_4_month', 'red_5_month', 'red_6_month', 'red_8_month', 'red_9_month', 'id', 'elevation_contour', 'district_name', 'soil_name', 'class']]

In [4]:
# Splitting the data
X = df.drop(['class'], axis=1)  # Features excluding 'id' and 'class'
y = df['class']  # Target variable

In [9]:
# Merge every rare class into a single bucket labelled 0.
class_counts = y.value_counts()

# Identify the classes with 99 or fewer samples
small_sample_classes = class_counts[class_counts <= 99].index

# Create a mask marking the samples that belong to these classes
small_sample_mask = y.isin(small_sample_classes)

# Build a new Series instead of assigning into `y` in place. `y` was selected from
# df['class'], so `y[mask] = 0` is a chained assignment: depending on the pandas
# version it either silently rewrites `df` as well or only changes a temporary copy.
# NOTE(review): 0 is used as the bucket label -- make sure 0 is not already the code
# of a real class (integer codes produced by LabelEncoder start at 0).
y = y.mask(small_sample_mask, 0)


In [5]:
# Display the number of samples in each class of the target.
y.value_counts()

class
3    140
5    116
2     97
4     92
1     74
6     59
Name: count, dtype: int64

In [11]:
# Remove every class that has 100 or fewer samples, filtering X and y with the
# same row mask.
class_counts = y.value_counts()
rare_classes = class_counts[class_counts.le(100)].index
keep_rows = ~y.isin(rare_classes)
X = X.loc[keep_rows]
y = y.loc[keep_rows]

In [None]:
# Display the class sizes again, after the preceding filtering step.
y.value_counts()

In [None]:
# Fill missing feature values with KNN imputation (n_neighbors=5).
# NOTE(review): the imputer is fitted on all of X, before the train/test split --
# confirm this is acceptable, since test rows then influence the imputed values.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a plain array, so the frame is rebuilt with X's column labels
# AND its row index. Without index=X.index the rows would be renumbered 0..n-1 while
# `y` keeps its original (filtered) index, so any index-aligned operation between
# X and y would silently pair up the wrong rows.
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Wrap y in a pandas Series so value_counts() is available, then print the
# number of samples per class before undersampling.
y = pd.Series(y)
print('Class distribution before undersampling:\n', y.value_counts())


In [None]:
# Balance the classes by random undersampling.
# random_state is fixed (same seed as the Bayesian optimizer in this notebook) so the
# same rows are selected on every run; without it the result changes per execution.
# NOTE(review): the train/test split further down uses X / y, not
# X_undersampled / y_undersampled -- confirm whether this output is meant to be used.
rus = RandomUnderSampler(sampling_strategy='auto', random_state=1)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

In [None]:
# Wrap the resampled target in a pandas Series so value_counts() is available,
# then print the number of samples per class after undersampling.
y_undersampled = pd.Series(y_undersampled)
print('Class distribution after undersampling:\n', y_undersampled.value_counts())


In [None]:
# y = pd.Series(y)

# # Print class distribution before SMOTE
# print('Class distribution before SMOTE:\n', y.value_counts())

# # Apply SMOTE
# smote = SMOTE(sampling_strategy='auto', k_neighbors=5)
# X_resampled, y_resampled = smote.fit_resample(X, y)

# # Convert y_resampled to a pandas Series (for value_counts)
# y_resampled = pd.Series(y_resampled)

# # Print class distribution after SMOTE
# print('Class distribution after SMOTE:\n', y_resampled.value_counts())

In [10]:
# Hold out 20% of the samples as a test set; stratify=y keeps the class proportions
# the same in the train and test parts.
# random_state is fixed so the split (and every score computed from it) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)

In [11]:
# RandomForestClassifier and BayesianOptimization are already imported in the first
# cell; only the names that are new to this cell are imported here.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

# Seed used for every forest built in this cell. With unseeded forests the same
# hyperparameters score differently on every call, so the optimizer would be
# chasing noise and the final model would not be reproducible.
RF_RANDOM_STATE = 1


def _build_random_forest(n_estimators, max_depth, min_samples_split, max_features):
    """Create a RandomForestClassifier from raw optimizer values.

    The optimizer proposes floats for every parameter, so the integer-valued
    hyperparameters are truncated with int() and max_features (a fraction of the
    features) is capped at 0.999. The objective and the final model both go through
    this helper, so they always convert the parameters in the same way.

    Parameters:
        n_estimators: number of trees (truncated to int).
        max_depth: maximum tree depth (truncated to int).
        min_samples_split: minimum samples needed to split a node (truncated to int).
        max_features: fraction of features considered per split.

    Returns:
        An unfitted RandomForestClassifier.
    """
    return RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),  # Fraction, must be <= 1.0
        random_state=RF_RANDOM_STATE,
    )


# Objective function using log loss
def objective_log_loss(n_estimators, max_depth, min_samples_split, max_features):
    """Score one hyperparameter combination for the Bayesian optimizer.

    Returns the mean negative log loss over 3-fold cross-validation on
    X_train / y_train. The value is negated log loss because the optimizer
    maximizes its target, so higher (closer to 0) is better.
    """
    model = _build_random_forest(n_estimators, max_depth, min_samples_split, max_features)
    # Use negative log loss as the scoring parameter
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_log_loss').mean()


# Hyperparameters bounds for Bayesian optimization
param_bounds = {
    'n_estimators': (10, 250),
    'max_depth': (1, 50),
    'min_samples_split': (2, 25),
    'max_features': (0.1, 0.999),
}

# Bayesian optimization: 5 random starting points followed by 15 guided iterations
optimizer_log_loss = BayesianOptimization(f=objective_log_loss, pbounds=param_bounds, random_state=1)
optimizer_log_loss.maximize(init_points=5, n_iter=15)

# Best parameters found
best_params_log_loss = optimizer_log_loss.max['params']

# Retrain the model with the best parameters, built exactly as during the search
optimized_rf_log_loss = _build_random_forest(
    n_estimators=best_params_log_loss['n_estimators'],
    max_depth=best_params_log_loss['max_depth'],
    min_samples_split=best_params_log_loss['min_samples_split'],
    max_features=best_params_log_loss['max_features'],
)
optimized_rf_log_loss.fit(X_train, y_train)

# Predict probabilities for the test set
y_pred_probs = optimized_rf_log_loss.predict_proba(X_test)

# Calculate and print the log loss for the test set.
# labels= tells log_loss which class each probability column belongs to, so the
# score is still computed correctly if some class does not occur in y_test.
test_log_loss = log_loss(y_test, y_pred_probs, labels=optimized_rf_log_loss.classes_)
print(f'Test Log Loss: {test_log_loss}')


|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-2.205   [0m | [0m21.43    [0m | [0m0.7476   [0m | [0m2.003    [0m | [0m82.56    [0m |
| [95m2        [0m | [95m-1.383   [0m | [95m8.191    [0m | [95m0.183    [0m | [95m6.284    [0m | [95m92.93    [0m |
| [0m3        [0m | [0m-1.429   [0m | [0m20.44    [0m | [0m0.5844   [0m | [0m11.64    [0m | [0m174.5    [0m |
| [0m4        [0m | [0m-1.748   [0m | [0m11.02    [0m | [0m0.8894   [0m | [0m2.63     [0m | [0m170.9    [0m |
| [0m5        [0m | [0m-1.866   [0m | [0m21.45    [0m | [0m0.6023   [0m | [0m5.229    [0m | [0m57.54    [0m |
| [0m6        [0m | [0m-1.495   [0m | [0m20.2     [0m | [0m0.8951   [0m | [0m12.22    [0m | [0m175.3    [0m |
| [95m7        [0m | [95m-1.382   [0m | [95m25.09    [0m | [95m0.1      [0m | [95m10.07    [0m | [95m16