In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_json('./data/dataSet_Culture_06102023-POINT.json')

In [3]:
# Extract the year from 'Analysis Date'
df['year'] = df['Analysis Date'].apply(lambda x: x.split('-')[0])

df['polygon_x'] = df['polygon'].apply(lambda x: x['x'])
df['polygon_y'] = df['polygon'].apply(lambda x: x['y'])
df['month'] = df['Analysis Date'].apply(lambda x: x.split('-')[1])
df['day'] = df['Analysis Date'].apply(lambda x: x.split('-')[2])

df['vegetation'] = (df['indextype'] == 'NDVI') & (df['averagevalue'] >= 0.15)
df = df.drop(['polygon', 'soil_id'] , axis = 1)

# Modify the 'combined' column to include year
df['combined'] = df['polygon_x'].astype(str) + '_' + df['polygon_y'].astype(str) + '_' + df['year'].astype(str)

# Assign unique ID based on the grouped column
df['id'] = df.groupby('combined').ngroup() + 1

# Drop the combined column and other temporary columns
df = df.drop(columns=['combined', 'polygon_x', 'polygon_y', 'year'])

In [4]:
df = df[df['indextype'] == 'NDVI']
df = df.drop(columns=['indextype', 'year contour', 'month', 'day', 'vegetation', 'type_culture_name'])
df = df.rename(columns={'culture_name': 'class'})
df = df.rename(columns={'averagevalue': 'red'})
df = df.rename(columns={'Analysis Date': 'date'})
df['date'] = pd.to_datetime(df['date'])

In [5]:
label_encoder = LabelEncoder() 
  
# Encode labels in column 'class'. 
df['class']= label_encoder.fit_transform(df['class']) 
df['district_name']= label_encoder.fit_transform(df['district_name']) 
df['soil_name']= label_encoder.fit_transform(df['soil_name']) 
df['class'].unique() 

array([ 2, 16, 21,  4, 11, 13,  7, 20, 18,  3,  0,  5, 17,  8, 12, 10, 15,
       14,  6,  1, 19,  9])

In [6]:
# Convert date to multiple columns (year, month, day)
df['year'] = pd.to_datetime(df['date']).dt.year
df['month'] = pd.to_datetime(df['date']).dt.month
df['day'] = pd.to_datetime(df['date']).dt.day
df.drop('date', axis=1, inplace=True)

# Splitting data    
X = df.drop(['class', 'id', 'year', 'day'], axis=1) 
y = df['class']

In [7]:
# Pivot table
pivot_df = df.pivot_table(index='id', columns='month', values='red', aggfunc='mean')

# Fill NaN values (assuming you want to fill with zeros, adjust if needed)
# pivot_df = pivot_df.fillna(0)

# Rename columns as needed
pivot_df.columns = [f'red_{col}_month' for col in pivot_df.columns]

# Reset the index so 'id' becomes a column
pivot_df = pivot_df.reset_index()

# Assuming each 'id' has a unique 'class', get the 'class' value for each 'id' and add to the pivot dataframe
pivot_df['class'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['class'])
pivot_df['elevation_contour'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['elevation_contour'])
pivot_df['district_name'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['district_name'])
pivot_df['soil_name'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['soil_name'])

# Reordering columns
pivot_df = pivot_df[['red_4_month', 'red_5_month', 'red_6_month', 'red_8_month', 'red_9_month', 'id', 'elevation_contour', 'district_name', 'soil_name', 'class']]

In [8]:
# Splitting the data
X = pivot_df.drop(['id', 'class'], axis=1)  # Features excluding 'id' and 'class'
y = pivot_df['class']  # Target variable

In [9]:
class_counts = y.value_counts()

# Identify the classes with 99 or fewer samples
small_sample_classes = class_counts[class_counts <= 99].index

# Create a mask for these classes
small_sample_mask = y.isin(small_sample_classes)

# Set the values for these classes to 0
y[small_sample_mask] = 0


In [10]:
y.value_counts()

class
11    231
21    193
4     183
13    163
7     120
0     105
Name: count, dtype: int64

In [11]:
class_counts = y.value_counts()
single_sample_classes = class_counts[class_counts <= 100].index
filter_mask = ~y.isin(single_sample_classes)
X = X[filter_mask]
y = y[filter_mask]

In [12]:
y.value_counts()

class
11    231
21    193
4     183
13    163
7     120
0     105
Name: count, dtype: int64

In [13]:
imputer = KNNImputer(n_neighbors=5)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

In [14]:
y = pd.Series(y)
print('Class distribution before undersampling:\n', y.value_counts())


Class distribution before undersampling:
 class
11    231
21    193
4     183
13    163
7     120
0     105
Name: count, dtype: int64


In [15]:
rus = RandomUnderSampler(sampling_strategy='auto')
X_undersampled, y_undersampled = rus.fit_resample(X, y)

In [16]:
y_undersampled = pd.Series(y_undersampled)
print('Class distribution after undersampling:\n', y_undersampled.value_counts())


Class distribution after undersampling:
 class
0     105
4     105
7     105
11    105
13    105
21    105
Name: count, dtype: int64


In [17]:
# y = pd.Series(y)

# # Print class distribution before SMOTE
# print('Class distribution before SMOTE:\n', y.value_counts())

# # Apply SMOTE
# smote = SMOTE(sampling_strategy='auto', k_neighbors=5)
# X_resampled, y_resampled = smote.fit_resample(X, y)

# # Convert y_resampled to a pandas Series (for value_counts)
# y_resampled = pd.Series(y_resampled)

# # Print class distribution after SMOTE
# print('Class distribution after SMOTE:\n', y_resampled.value_counts())

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_undersampled, y_undersampled, test_size=0.2, stratify=y_undersampled)

In [19]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]
criterion = ['gini', 'entropy']
class_weight = ['balanced', 'balanced_subsample', None]


In [20]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are already defined
unique_classes = set(y_train)
binary_classifications = {}
evaluation_metrics = {}
trained_classifiers = {}

param_dist = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion,
               'class_weight':class_weight 
}

for u_class in unique_classes:
    # Convert the labels for one-vs-all classification
    y_train_binary = [1 if label == u_class else 0 for label in y_train]
    y_test_binary = [1 if label == u_class else 0 for label in y_test]

    # Train a Gradient Boosting classifier using Randomized Search CV
    gbm = RandomForestClassifier()
    random_search = RandomizedSearchCV(
        gbm, param_distributions=param_dist, n_iter=200, scoring='accuracy', 
        cv=5, verbose=0,  n_jobs=-1
    )
    random_search.fit(X_train, y_train_binary)
    
    # Storing the trained classifier
    trained_classifiers[u_class] = random_search.best_estimator_
    
    # Evaluate on the test set
    y_pred = random_search.predict(X_test)
    test_accuracy = accuracy_score(y_test_binary, y_pred)
    evaluation_metrics[u_class] = {
        "Best Parameters": random_search.best_params_,
        "Best CV Score": random_search.best_score_,
        "Test Accuracy": test_accuracy
    }

    # Print results
    print(f"Class {u_class} - Best Parameters:", random_search.best_params_)
    print(f"Class {u_class} - Best CV Score:", random_search.best_score_)
    print(f"Class {u_class} - Test Set Score:", test_accuracy)


Class 0 - Best Parameters: {'n_estimators': 56, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'gini', 'class_weight': None, 'bootstrap': False}
Class 0 - Best CV Score: 0.865089108910891
Class 0 - Test Set Score: 0.873015873015873
Class 4 - Best Parameters: {'n_estimators': 80, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'gini', 'class_weight': None, 'bootstrap': True}
Class 4 - Best CV Score: 0.8353465346534653
Class 4 - Test Set Score: 0.8333333333333334
Class 7 - Best Parameters: {'n_estimators': 56, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'gini', 'class_weight': None, 'bootstrap': True}
Class 7 - Best CV Score: 0.8512277227722771
Class 7 - Test Set Score: 0.8333333333333334
Class 11 - Best Parameters: {'n_estimators': 72, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 2, 'criter

In [21]:
trained_classifiers

{0: RandomForestClassifier(bootstrap=False, max_depth=4, min_samples_leaf=2,
                        min_samples_split=5, n_estimators=56),
 4: RandomForestClassifier(max_depth=4, min_samples_leaf=2, min_samples_split=5,
                        n_estimators=80),
 7: RandomForestClassifier(max_depth=4, min_samples_leaf=2, n_estimators=56),
 11: RandomForestClassifier(criterion='entropy', max_depth=2, min_samples_leaf=2,
                        min_samples_split=5, n_estimators=72),
 13: RandomForestClassifier(bootstrap=False, max_depth=4, min_samples_leaf=2,
                        min_samples_split=5, n_estimators=17),
 21: RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=2,
                        min_samples_leaf=2, n_estimators=41)}

In [22]:
probabilities_batch = {u_class: [] for u_class in trained_classifiers.keys()}

for u_class, gbm in trained_classifiers.items():
    probs = gbm.predict_proba(X_test)[:, 1]
    probabilities_batch[u_class] = probs

final_class_predictions = []
for i in range(len(X_test)):
    final_class = max(probabilities_batch, key=lambda x: probabilities_batch[x][i])
    final_class_predictions.append(final_class)

In [23]:
accuracy_score(final_class_predictions, y_test)

0.4126984126984127

In [24]:
probabilities_batch = {u_class: [] for u_class in trained_classifiers.keys()}

for u_class, gbm in trained_classifiers.items():
    probs = gbm.predict_proba(X_train)[:, 1]
    probabilities_batch[u_class] = probs

final_class_predictions = []
for i in range(len(X_train)):
    final_class = max(probabilities_batch, key=lambda x: probabilities_batch[x][i])
    final_class_predictions.append(final_class)

In [25]:
accuracy_score(final_class_predictions, y_train)

0.6428571428571429