In [1]:
import pandas as pd
from pycaret.classification import *
from  sklearn.metrics import classification_report

In [2]:
df1 = pd.read_json('./newmodel/real/dataSet_Culture_10122023_2020_kirbi.json')
df2 = pd.read_json('./newmodel/real/dataSet_Culture_10122023_2021_kirbi.json')
df3 = pd.read_json('./newmodel/real/dataSet_Culture_10122023_2022_kirbi.json')
df4 = pd.read_json('./newmodel/real/dataSet_Culture_10122023_2023_kirbi.json')
df5 = pd.read_json('./newmodel/dataSet_Culture_21112023.json')

In [3]:
combined_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

In [4]:
combined_df['culture_name'].unique()

array(['Ячмень', 'Wheat', 'Морковь', 'Лук-Севок', 'Рапс', 'Люцерна',
       'Лён', 'Соя', 'Эспарцет', 'Potato', 'Вика', 'Донник',
       'Неизвестная Культура', 'Sunflower', 'Козлятник', 'Просо',
       'Тритикале', 'Сорго', 'Сидераты', 'Пшеница', 'Кукуруза', 'Свекла',
       'Томат', 'Картофель', 'Софлор', 'Лук', 'Клевер', 'Сил'],
      dtype=object)

In [5]:
crops = ['Ячмень', 'Wheat', 'Морковь', 'Лук-Севок', 'Рапс', 'Люцерна',

         'Лён', 'Соя', 'Эспарцет', 'Potato', 'Вика', 'Донник',

         'Неизвестная Культура', 'Sunflower', 'Козлятник', 'Просо',

         'Тритикале', 'Сорго', 'Сидераты', 'Пшеница', 'Кукуруза', 'Свекла',

         'Томат', 'Картофель', 'Софлор', 'Лук', 'Клевер', 'Сил']



# Translation dictionary for English names to Russian

translations = {

    'Wheat': 'Пшеница',

    'Potato': 'Картофель',

    'Sunflower': 'Подсолнечник'

}



# Translate the crop names to Russian

crops_russian = [translations.get(crop, crop) for crop in crops]



# Create a pandas Series from the list

crops_series = pd.Series(crops_russian)

crops_series

0                   Ячмень
1                  Пшеница
2                  Морковь
3                Лук-Севок
4                     Рапс
5                  Люцерна
6                      Лён
7                      Соя
8                 Эспарцет
9                Картофель
10                    Вика
11                  Донник
12    Неизвестная Культура
13            Подсолнечник
14               Козлятник
15                   Просо
16               Тритикале
17                   Сорго
18                Сидераты
19                 Пшеница
20                Кукуруза
21                  Свекла
22                   Томат
23               Картофель
24                  Софлор
25                     Лук
26                  Клевер
27                     Сил
dtype: object

In [6]:
combined_df['culture_name'] = combined_df['culture_name'].map(translations).fillna(combined_df['culture_name'])

combined_df['culture_name'].value_counts()

Пшеница                 122
Ячмень                   69
Кукуруза                 60
Свекла                   56
Люцерна                  44
Картофель                39
Эспарцет                 16
Соя                      10
Донник                    7
Неизвестная Культура      6
Сидераты                  3
Тритикале                 3
Подсолнечник              3
Вика                      3
Морковь                   3
Козлятник                 2
Лён                       2
Рапс                      2
Томат                     2
Лук                       2
Просо                     1
Сорго                     1
Лук-Севок                 1
Софлор                    1
Клевер                    1
Сил                       1
Name: culture_name, dtype: int64

In [7]:
combined_df.to_csv('kirbi_norm.csv', index=False)

In [8]:
# Counting the number of occurrences for each culture
culture_counts = combined_df['culture_name'].value_counts()

# Cultures with more than 5 members
cultures_more_than_5 = culture_counts[culture_counts > 5].index

# Filtering the dataset
filtered_data = combined_df[combined_df['culture_name'].isin(cultures_more_than_5)]

# Displaying the first few rows of the filtered dataset
filtered_data.head(), filtered_data['culture_name'].value_counts()


(   contour_id region  index_month_4  index_month_5  index_month_6  \
 0     16952.0   chui          0.383          0.492          0.261   
 1     16953.0   chui          0.522          0.684          0.537   
 2     16954.0   chui          0.592          0.610          0.429   
 6     16958.0   chui          0.050          0.225          0.147   
 8     16960.0   chui          0.112          0.190          0.289   
 
    index_month_7  index_month_8  elevation culture_name  
 0          0.065          0.071      634.0       Ячмень  
 1          0.162          0.091      633.0      Пшеница  
 2          0.140          0.096      625.0      Пшеница  
 6          0.087          0.057      637.0      Люцерна  
 8          0.514          0.546      621.0          Соя  ,
 Пшеница                 122
 Ячмень                   69
 Кукуруза                 60
 Свекла                   56
 Люцерна                  44
 Картофель                39
 Эспарцет                 16
 Соя                

In [9]:
unique_cultures = filtered_data['culture_name'].unique()

# Create a mapping from culture names to numeric values
culture_mapping = {culture: i+1 for i, culture in enumerate(unique_cultures)}

In [10]:
culture_mapping

{'Ячмень': 1,
 'Пшеница': 2,
 'Люцерна': 3,
 'Соя': 4,
 'Эспарцет': 5,
 'Картофель': 6,
 'Донник': 7,
 'Неизвестная Культура': 8,
 'Кукуруза': 9,
 'Свекла': 10}

In [11]:
filtered_data['culture_name'] = filtered_data['culture_name'].map(culture_mapping)


In [12]:
filtered_data_no_ik = filtered_data[filtered_data['region'] != 'ik']
df = filtered_data_no_ik.drop(columns=['contour_id', 'region'])

In [13]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)

# Perform KNN imputation
imputed_df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [14]:
df.isna().sum()

index_month_4     2
index_month_5    35
index_month_6     3
index_month_7    75
index_month_8     3
elevation        11
culture_name      0
dtype: int64

In [15]:
imputed_df['elevation'] = imputed_df['elevation'].astype(int)
imputed_df['culture_name'] = imputed_df['culture_name'].astype(int)

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def calculate_evaluation_scores(y_true, y_pred, problem_type):
    scores = {}
    
    if problem_type == 'regression':
        scores['MAE'] = mean_absolute_error(y_true, y_pred)
        scores['MSE'] = mean_squared_error(y_true, y_pred)
        scores['R-squared'] = r2_score(y_true, y_pred)
    elif problem_type == 'classification':
        scores['Accuracy'] = accuracy_score(y_true, y_pred)
        scores['Precision'] = precision_score(y_true, y_pred,average='weighted')
        scores['Recall'] = recall_score(y_true, y_pred,average='weighted')
        scores['F1-Score'] = f1_score(y_true, y_pred,average='weighted')
        # You can add more classification metrics as needed
    
    return scores


scores_classification = calculate_evaluation_scores(df['y_pshenica'], df['y_pshenica_pred'], problem_type='classification')
print("\nClassification Scores:")
print(scores_classification)

KeyError: 'y_pshenica'

In [18]:
# Counting the number of occurrences for each culture
culture_counts = imputed_df['culture_name'].value_counts()

# Cultures with more than 5 members
cultures_more_than_5 = culture_counts[culture_counts > 5].index

# Filtering the dataset
filtered_data = imputed_df[imputed_df['culture_name'].isin(cultures_more_than_5)]

# Displaying the first few rows of the filtered dataset
filtered_data.head(), filtered_data['culture_name'].value_counts()

(   index_month_4  index_month_5  index_month_6  index_month_7  index_month_8  \
 0          0.383          0.492          0.261          0.065          0.071   
 1          0.522          0.684          0.537          0.162          0.091   
 2          0.592          0.610          0.429          0.140          0.096   
 3          0.050          0.225          0.147          0.087          0.057   
 4          0.112          0.190          0.289          0.514          0.546   
 
    elevation  culture_name  
 0        634             1  
 1        633             2  
 2        625             2  
 3        637             3  
 4        621             4  ,
 2     104
 1      60
 9      60
 10     56
 3      41
 6      33
 4      10
 Name: culture_name, dtype: int64)

In [19]:
filtered_data['culture_name'].value_counts()

2     104
1      60
9      60
10     56
3      41
6      33
4      10
Name: culture_name, dtype: int64

In [20]:
filtered_data['culture_name'].value_counts()

2     104
1      60
9      60
10     56
3      41
6      33
4      10
Name: culture_name, dtype: int64

In [81]:
filtered_data.to_csv("kirbi_chuy.csv", index=False)

In [84]:
from imblearn.over_sampling import SMOTE
# Initialize SMOTE
smote = SMOTE()

# Apply SMOTE
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the new class distribution
new_class_distribution = y_resampled.value_counts()

new_class_distribution

1     104
2     104
3     104
4     104
6     104
9     104
10    104
Name: culture_name, dtype: int64

In [91]:
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)

In [147]:
resampled_data.to_json('balanced_chuy_kirbi_cultureName.json')

In [97]:
train = resampled_data[ :650]
test = resampled_data[ 651:]

In [148]:
culture_mapping

{'Ячмень': 1,
 'Пшеница': 2,
 'Люцерна': 3,
 'Соя': 4,
 'Эспарцет': 5,
 'Картофель': 6,
 'Донник': 7,
 'Неизвестная Культура': 8,
 'Кукуруза': 9,
 'Свекла': 10}

In [98]:
from pycaret.classification import setup, compare_models

# Setting up PyCaret for classification
clf_setup = setup(data=train, 
                  target='culture_name', train_size=0.9)

Unnamed: 0,Description,Value
0,Session id,2310
1,Target,culture_name
2,Target type,Multiclass
3,Target mapping,"1: 0, 2: 1, 3: 2, 4: 3, 6: 4, 9: 5, 10: 6"
4,Original data shape,"(650, 7)"
5,Transformed data shape,"(650, 7)"
6,Transformed train set shape,"(585, 7)"
7,Transformed test set shape,"(65, 7)"
8,Numeric features,6
9,Preprocess,True


list

In [119]:
rf = create_model('gbc')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7288,0.9347,0.7288,0.7287,0.7231,0.6808,0.6828
1,0.6441,0.9307,0.6441,0.6475,0.6407,0.5816,0.5831
2,0.6441,0.9358,0.6441,0.6471,0.64,0.5806,0.5824
3,0.7119,0.9388,0.7119,0.7153,0.702,0.6616,0.6661
4,0.5932,0.9193,0.5932,0.5848,0.5868,0.5213,0.522
5,0.6724,0.9396,0.6724,0.6478,0.6544,0.6127,0.6157
6,0.6897,0.9413,0.6897,0.71,0.6832,0.6352,0.6411
7,0.6379,0.9146,0.6379,0.6349,0.6238,0.5714,0.5755
8,0.5862,0.8988,0.5862,0.5785,0.5786,0.5121,0.5134
9,0.6724,0.9252,0.6724,0.6894,0.6736,0.616,0.6182


In [120]:
rf_t = tune_model(rf)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7458,0.9102,0.7458,0.761,0.7276,0.6994,0.7025
1,0.7119,0.9491,0.7119,0.7103,0.7065,0.6608,0.6622
2,0.6441,0.9424,0.6441,0.6259,0.6197,0.5803,0.5841
3,0.7797,0.9404,0.7797,0.7942,0.7673,0.7412,0.7476
4,0.661,0.938,0.661,0.661,0.6539,0.6009,0.6032
5,0.7759,0.9288,0.7759,0.7786,0.7591,0.7339,0.7382
6,0.7586,0.9538,0.7586,0.7654,0.7516,0.7156,0.7186
7,0.7241,0.9401,0.7241,0.7088,0.7147,0.6747,0.6759
8,0.6379,0.9213,0.6379,0.6464,0.637,0.5741,0.5761
9,0.7931,0.9586,0.7931,0.8091,0.7944,0.7566,0.7587


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [122]:
f_rf = finalize_model(rf_t)

In [123]:
predict_model(f_rf)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,index_month_4,index_month_5,index_month_6,index_month_7,index_month_8,elevation,culture_name,prediction_label,prediction_score
175,0.038000,0.605000,0.397000,0.183000,-0.109000,726,2,2,0.9874
569,0.299183,0.354904,0.281527,0.233069,0.152822,621,6,6,0.9943
515,-0.052580,0.193648,0.022953,0.080164,0.615062,644,4,4,0.9973
194,-0.105000,-0.128000,0.166000,0.020000,-0.305000,754,9,9,0.9692
298,-0.034000,0.056000,0.205000,0.086000,0.060000,1094,2,2,0.9669
...,...,...,...,...,...,...,...,...,...
459,-0.027718,0.294340,0.182359,-0.114307,0.201666,630,3,3,0.9836
4,0.112000,0.190000,0.289000,0.514000,0.546000,621,4,4,0.9794
519,0.060665,0.145012,0.004987,0.204041,-0.253394,663,4,4,0.9969
139,0.082000,-0.435000,0.260000,0.161000,0.346000,595,3,3,0.9546


In [124]:
save_model(f_rf, 'gbc_classifier')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['index_month_4', 'index_month_5',
                                              'index_month_6', 'index_month_7',
                                              'index_month_8', 'elevation'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               c...
                                             criterion='friedman_mse', init=None,
                                             learning_rate=0.15, loss='log_loss',
                                             max_depth=5, max_features='sqrt',
                                             max

In [140]:
model = load_model('gbc_classifier')

Transformation Pipeline and Model Successfully Loaded


In [141]:
Xtest = test.copy().reset_index(drop=True)
ytest = Xtest.pop('culture_name')

In [142]:
ypred = model.predict(Xtest)

In [143]:
Xtest['culture_name'] = ytest
Xtest['pred'] = ypred

In [144]:
scores_classification = calculate_evaluation_scores(Xtest['culture_name'], Xtest['pred'], problem_type='classification')

In [145]:
scores_classification

{'Accuracy': 0.7012987012987013,
 'Precision': 0.8339517625231913,
 'Recall': 0.7012987012987013,
 'F1-Score': 0.7461850649350649}