In [1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import preprocessing 

import lightgbm as lgb

# ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
from collections import Counter
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score
pd.options.display.max_columns = 999

In [3]:
df = pd.read_json('./data/dataSet_Culture_06102023-POINT.json')

In [4]:
# Extract the year from 'Analysis Date'
df['year'] = df['Analysis Date'].apply(lambda x: x.split('-')[0])

df['polygon_x'] = df['polygon'].apply(lambda x: x['x'])
df['polygon_y'] = df['polygon'].apply(lambda x: x['y'])
df['month'] = df['Analysis Date'].apply(lambda x: x.split('-')[1])
df['day'] = df['Analysis Date'].apply(lambda x: x.split('-')[2])

df['vegetation'] = (df['indextype'] == 'NDVI') & (df['averagevalue'] >= 0.15)
df = df.drop(['polygon', 'soil_id'] , axis = 1)

# Modify the 'combined' column to include year
df['combined'] = df['polygon_x'].astype(str) + '_' + df['polygon_y'].astype(str) + '_' + df['year'].astype(str)

# Assign unique ID based on the grouped column
df['id'] = df.groupby('combined').ngroup() + 1

# Drop the combined column and other temporary columns
df = df.drop(columns=['combined', 'polygon_x', 'polygon_y', 'year'])

In [5]:
df

Unnamed: 0,indextype,averagevalue,Analysis Date,year contour,elevation_contour,district_name,soil_name,culture_name,type_culture_name,month,day,vegetation,id
0,NDVI,0.217,2020-06-04,2020,1544,Kemin district,Горные светло-каштановые,Картофель,Яровая,06,04,True,986
1,NDVI,0.215,2020-08-03,2020,1544,Kemin district,Горные светло-каштановые,Картофель,Яровая,08,03,True,986
2,NDVI,0.150,2020-09-02,2020,1544,Kemin district,Горные светло-каштановые,Картофель,Яровая,09,02,True,986
3,NDVI,0.052,2021-04-10,2021,1544,Kemin district,Горные светло-каштановые,Картофель,Яровая,04,10,False,987
4,NDVI,0.056,2021-05-05,2021,1544,Kemin district,Горные светло-каштановые,Картофель,Яровая,05,05,False,987
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22977,VARI,0.055,2022-05-08,2022,672,Yssyk-Ata district,"Сероземно-луговые, местами солончаковатые",Кукуруза,Яровая,05,08,False,752
22978,VARI,0.072,2022-06-07,2022,672,Yssyk-Ata district,"Сероземно-луговые, местами солончаковатые",Кукуруза,Яровая,06,07,False,752
22979,VARI,-0.002,2022-07-02,2022,672,Yssyk-Ata district,"Сероземно-луговые, местами солончаковатые",Кукуруза,Яровая,07,02,False,752
22980,VARI,0.006,2022-08-01,2022,672,Yssyk-Ata district,"Сероземно-луговые, местами солончаковатые",Кукуруза,Яровая,08,01,False,752


In [6]:
df = df[df['indextype'] == 'NDVI']
df = df.drop(columns=['indextype', 'year contour', 'month', 'day', 'vegetation', 'type_culture_name'])
df = df.rename(columns={'culture_name': 'class'})
df = df.rename(columns={'averagevalue': 'red'})
df = df.rename(columns={'Analysis Date': 'date'})
df['date'] = pd.to_datetime(df['date'])

In [7]:
df = df[df['date'] <= '2021-01-01']

In [8]:
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'class'. 
df['class']= label_encoder.fit_transform(df['class']) 
df['district_name']= label_encoder.fit_transform(df['district_name']) 
df['soil_name']= label_encoder.fit_transform(df['soil_name']) 
df['class'].unique() 

array([ 2, 13,  4,  8, 10, 16,  7,  3,  5, 14,  0, 12, 11,  6,  1, 15,  9])

In [9]:
# Convert date to multiple columns (year, month, day)
df['year'] = pd.to_datetime(df['date']).dt.year
df['month'] = pd.to_datetime(df['date']).dt.month
df['day'] = pd.to_datetime(df['date']).dt.day
df.drop('date', axis=1, inplace=True)

# Splitting data    
X = df.drop('class', axis=1) 
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Training the model
clf = RandomForestClassifier(n_estimators=220, random_state=42)
clf.fit(X_train, y_train)

# Predictions and accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.0f}%")

Accuracy: 72%


In [11]:
# Pivot table
pivot_df = df.pivot_table(index='id', columns='month', values='red', aggfunc='mean')

# Fill NaN values (assuming you want to fill with zeros, adjust if needed)
# pivot_df = pivot_df.fillna(0)

# Rename columns as needed
pivot_df.columns = [f'red_{col}_month' for col in pivot_df.columns]

# Reset the index so 'id' becomes a column
pivot_df = pivot_df.reset_index()

# Assuming each 'id' has a unique 'class', get the 'class' value for each 'id' and add to the pivot dataframe
pivot_df['class'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['class'])
pivot_df['elevation_contour'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['elevation_contour'])
pivot_df['district_name'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['district_name'])
pivot_df['soil_name'] = pivot_df['id'].map(df.drop_duplicates(subset='id').set_index('id')['soil_name'])

# Reordering columns
pivot_df = pivot_df[['red_4_month', 'red_5_month', 'red_6_month', 'red_8_month', 'red_9_month', 'id', 'elevation_contour', 'district_name', 'soil_name', 'class']]


In [12]:
# Splitting the data
X = pivot_df.drop(['id', 'class'], axis=1)  # Features excluding 'id' and 'class'
y = pivot_df['class']  # Target variable

In [13]:
# class_counts = y.value_counts()
# single_sample_classes = class_counts[class_counts == 1].index
# filter_mask = ~y.isin(single_sample_classes)
# X = X[filter_mask]
# y = y[filter_mask]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
#Converting the dataset in proper LGB format
d_train=lgb.Dataset(X_train, label=y_train)

In [16]:
#setting up the parameters
params={}
params['learning_rate']=0.03
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='multiclass' #Multi-class target feature
params['metric']='multi_logloss' #metric for multi-class
params['num_class']=16

In [17]:
clf=lgb.train(params,d_train,100)   #обучение модели на 100 эпохах

[LightGBM] [Fatal] Label must be in [0, 16), but found 16 in label


LightGBMError: Label must be in [0, 16), but found 16 in label

In [None]:
y_pred

In [None]:
y_pred=clf.predict(X_test)
y_pred_train = clf.predict(X_train)

In [None]:
#argmax() method 
y_pred = [np.argmax(line) for line in y_pred]
#printing the predictions

#using precision score for error metrics
precision_score(y_pred,y_test,average=None).mean()
# 0.9545454545454546

In [None]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
print(classification_report(y_test, y_pred))