In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_json('data/dataSet_Culture_21112023.json')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index_month_4  307 non-null    float64
 1   index_month_5  273 non-null    float64
 2   index_month_6  306 non-null    float64
 3   index_month_7  233 non-null    float64
 4   index_month_8  307 non-null    float64
 5   elevation      308 non-null    int64  
 6   culture_name   308 non-null    object 
dtypes: float64(5), int64(1), object(1)
memory usage: 17.0+ KB


In [4]:
df.describe()

Unnamed: 0,index_month_4,index_month_5,index_month_6,index_month_7,index_month_8,elevation
count,307.0,273.0,306.0,233.0,307.0,308.0
mean,0.006173,0.082842,0.070938,0.086176,0.070293,792.253247
std,0.086019,0.21418,0.211829,0.230185,0.208884,203.811149
min,-0.273,-0.435,-0.45,-0.42,-0.446,576.0
25%,-0.051,-0.059,-0.048,-0.068,-0.0695,654.0
50%,0.004,0.047,0.052,0.082,0.052,713.0
75%,0.06,0.192,0.19975,0.204,0.1605,915.0
max,0.286,0.761,0.865,0.773,0.757,1571.0


In [5]:
df.head()

Unnamed: 0,index_month_4,index_month_5,index_month_6,index_month_7,index_month_8,elevation,culture_name
0,-0.047,-0.298,-0.293,0.148,0.105,724,Ячмень
1,0.016,0.27,0.298,0.26,0.757,729,Люцерна
2,-0.034,-0.258,-0.017,0.131,-0.086,626,Пшеница
3,0.068,-0.186,-0.112,0.286,0.12,619,Люцерна
4,0.11,0.439,-0.3,-0.02,0.211,966,Пшеница


In [6]:
df.isnull().sum()

index_month_4     1
index_month_5    35
index_month_6     2
index_month_7    75
index_month_8     1
elevation         0
culture_name      0
dtype: int64

In [7]:
df['culture_name'].unique()

array(['Ячмень', 'Люцерна', 'Пшеница', 'Кукуруза', 'Свекла', 'Соя',
       'Томат', 'Картофель', 'Софлор', 'Лук', 'Клевер', 'Сил'],
      dtype=object)

In [8]:
codes, uniques = pd.factorize(df['culture_name'])
df['culture_name'] = codes


In [9]:
culture_id_map = {name: id for id, name in enumerate(uniques)}
print(culture_id_map)

{'Ячмень': 0, 'Люцерна': 1, 'Пшеница': 2, 'Кукуруза': 3, 'Свекла': 4, 'Соя': 5, 'Томат': 6, 'Картофель': 7, 'Софлор': 8, 'Лук': 9, 'Клевер': 10, 'Сил': 11}


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Handling missing values
# Imputing missing values with the median for features with low missing percentage
# imputer = SimpleImputer(strategy='median')
# df[['index_month_4', 'index_month_5', 'index_month_6', 'index_month_7', 'index_month_8']] = imputer.fit_transform(df[['index_month_4', 'index_month_5', 'index_month_6', 'index_month_7', 'index_month_8']])

# Dropping columns with a high percentage of missing values
# df = df.drop(columns=['index_month_5', 'index_month_7'])

# Encoding the target variable 'culture_name'
label_encoder = LabelEncoder()

# Splitting the dfset into training and testing sets
X = df.drop('culture_name', axis=1)
y = df['culture_name']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Displaying the first few rows of the processed dfset
X_train.head(), y_train.head()


(     index_month_4  index_month_5  index_month_6  index_month_7  \
 126          0.044         -0.118         -0.243          0.329   
 109         -0.041            NaN         -0.093          0.066   
 247          0.061         -0.092         -0.292            NaN   
 234         -0.016          0.034          0.037            NaN   
 202         -0.132            NaN            NaN            NaN   
 
      index_month_8  elevation  
 126          0.326        738  
 109          0.053        621  
 247         -0.420       1074  
 234         -0.053        951  
 202         -0.215        713  ,
 126    3
 109    0
 247    4
 234    3
 202    1
 Name: culture_name, dtype: int64)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initializing the models
# logistic_regression = LogisticRegression(max_iter=1000, random_state=42)
# random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)

# Training the models
# logistic_regression.fit(X_train, y_train)
# random_forest.fit(X_train, y_train)
gradient_boosting.fit(X_train, y_train)

# Making predictions with the models
# lr_predictions = logistic_regression.predict(X_test)
# rf_predictions = random_forest.predict(X_test)
gb_predictions = gradient_boosting.predict(X_test)

# Evaluating the models
# lr_accuracy = accuracy_score(y_test, lr_predictions)
# rf_accuracy = accuracy_score(y_test, rf_predictions)
gb_accuracy = accuracy_score(y_test, gb_predictions)

# Classification reports
# lr_report = classification_report(y_test, lr_predictions)
# rf_report = classification_report(y_test, rf_predictions)
gb_report = classification_report(y_test, gb_predictions)

# lr_accuracy, rf_accuracy, gb_accuracy, lr_report, rf_report, gb_report



ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [13]:
from xgboost import XGBClassifier

# Initializing the XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Training the XGBoost classifier
xgb_classifier.fit(X_train, y_train)

# Making predictions with the XGBoost classifier
xgb_predictions = xgb_classifier.predict(X_test)

# Evaluating the XGBoost classifier
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_report = classification_report(y_test, xgb_predictions)

xgb_accuracy, xgb_report



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.45161290322580644,
 '              precision    recall  f1-score   support\n\n           0       0.40      0.55      0.46        11\n           1       0.20      0.25      0.22         4\n           2       0.77      0.50      0.61        20\n           3       0.25      0.38      0.30         8\n           4       0.44      0.54      0.48        13\n           5       0.00      0.00      0.00         2\n           6       0.00      0.00      0.00         1\n           7       1.00      0.33      0.50         3\n\n    accuracy                           0.45        62\n   macro avg       0.38      0.32      0.32        62\nweighted avg       0.50      0.45      0.46        62\n')