In [1]:
import numpy as np
import pandas as pd
import pickle5 as pickle
from collections import Counter

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error, log_loss
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
pd.options.display.max_columns = 999

# ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
df = pd.read_json('./data/dataSet_Culture_06102023-POINT.json')

In [3]:
# Extract the year from 'Analysis Date'
df['year'] = df['Analysis Date'].apply(lambda x: x.split('-')[0])

df['polygon_x'] = df['polygon'].apply(lambda x: x['x'])
df['polygon_y'] = df['polygon'].apply(lambda x: x['y'])
df['month'] = df['Analysis Date'].apply(lambda x: x.split('-')[1])
df['day'] = df['Analysis Date'].apply(lambda x: x.split('-')[2])

df['vegetation'] = (df['indextype'] == 'NDVI') & (df['averagevalue'] >= 0.15)
df = df.drop(['polygon', 'soil_id'] , axis = 1)

# Modify the 'combined' column to include year
df['combined'] = df['polygon_x'].astype(str) + '_' + df['polygon_y'].astype(str) + '_' + df['year'].astype(str)

# Assign unique ID based on the grouped column
df['id'] = df.groupby('combined').ngroup() + 1

# Drop the combined column and other temporary columns
df = df.drop(columns=['combined', 'polygon_x', 'polygon_y', 'year'])

In [4]:
df = df[df['indextype'] == 'NDVI']
df = df.drop(columns=['indextype', 'year contour', 'month', 'day', 'vegetation', 'type_culture_name'])
df = df.rename(columns={'culture_name': 'class'})
df = df.rename(columns={'averagevalue': 'red'})
df = df.rename(columns={'Analysis Date': 'date'})
df['date'] = pd.to_datetime(df['date'])

In [5]:
label_encoder = LabelEncoder() 
  
# Encode labels in column 'class'. 
df['class']= label_encoder.fit_transform(df['class']) 
df['district_name']= label_encoder.fit_transform(df['district_name']) 
df['soil_name']= label_encoder.fit_transform(df['soil_name']) 
df['class'].unique() 

array([ 2, 16, 21,  4, 11, 13,  7, 20, 18,  3,  0,  5, 17,  8, 12, 10, 15,
       14,  6,  1, 19,  9])

In [6]:
# Convert date to multiple columns (year, month, day)
df['year'] = pd.to_datetime(df['date']).dt.year
df['month'] = pd.to_datetime(df['date']).dt.month
df['day'] = pd.to_datetime(df['date']).dt.day
df.drop('date', axis=1, inplace=True)

# Splitting data    
X = df.drop(['class', 'id', 'year', 'day'], axis=1) 
y = df['class']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Pivot table
pivot_df = df.pivot_table(index='id', columns='month', values='red', aggfunc='mean')

# Fill NaN values (assuming you want to fill with zeros, adjust if needed)

# Rename columns as needed
pivot_df.columns = [f'red_{col}_month' for col in pivot_df.columns]

# Reset the index so 'id' becomes a column
pivot_df = pivot_df.reset_index()

# Assuming each 'id' has a unique 'class', get the 'class' value for each 'id' and add to the pivot dataframe
pivot_df['class'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['class'])
pivot_df['elevation_contour'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['elevation_contour'])
pivot_df['district_name'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['district_name'])
pivot_df['soil_name'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['soil_name'])

# Reordering columns
pivot_df = pivot_df[['red_4_month', 'red_5_month', 'red_6_month', 'red_8_month', 'red_9_month', 'id', 'elevation_contour', 'district_name', 'soil_name', 'class']]


In [8]:
# Splitting the data
X = pivot_df.drop(['id', 'class'], axis=1)  # Features excluding 'id' and 'class'
y = pivot_df['class']  # Target variable

In [9]:
# First, split the data into training and temp (which will be further split into validation and test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42) 

# Now, split the temp data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Setting up the dataset for LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)


In [20]:
# Parameters
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 23,
    'max_depth': 6,  # control tree depth
    'feature_fraction': 0.8,  # use only 80% of features for each tree
    'bagging_fraction': 0.8,  # use only 80% of data for each tree
    'bagging_freq': 5,  # perform bagging every 5 rounds
    'min_data_in_leaf': 20,  # minimum samples in a leaf
    'lambda_l1': 0.2,  # L1 regularization
    'lambda_l2': 0.1  # L2 regularization
}

In [21]:
# Training with early stopping
num_round = 1000
bst = lgb.train(params, 
                train_data, 
                num_round, 
                valid_sets=[val_data], 
                callbacks=[lgb.early_stopping(stopping_rounds=100)])  # stops if validation doesn't improve for 20 rounds

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026150 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1200
[LightGBM] [Info] Number of data points in the train set: 796, number of used features: 8
[LightGBM] [Info] Start training from score -5.293305
[LightGBM] [Info] Start training from score -5.580987
[LightGBM] [Info] Start training from score -3.278402
[LightGBM] [Info] Start training from score -6.679599
[LightGBM] [Info] Start training from score -1.702865
[LightGBM] [Info] Start training from score -5.070161
[LightGBM] [Info] Start training from score -5.986452
[LightGBM] [Info] Start training from score -2.125722
[LightGBM] [Info] Start training from score -6.679599
[LightGBM] [Info] Start training from score -6.679599
[LightGBM] [Info] Start training from score -5.986452
[LightGBM] [Info] Start training from score -1.470113
[L

In [22]:
y_pred_test = bst.predict(X_test)
y_pred_train = bst.predict(X_train)

In [23]:
#argmax() method 
y_pred_test = [np.argmax(line) for line in y_pred_test]
y_pred_train = [np.argmax(line) for line in y_pred_train]
#printing the predictions

#using precision score for error metrics
print('TEST = ', precision_score(y_pred_test,y_test, average=None).mean())
print('TRAIN = ', precision_score(y_pred_train, y_train, average=None).mean())

TEST =  0.22050592166691238
TRAIN =  0.9607424407648714
