In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

%matplotlib inline
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../data/felix_cleaned.csv')

In [3]:
# df = df[df['class']!=5]
# df = df[df['class']!=4]
# df = df[df['class']!=3]
df = df[df['class']!=1]
df = df[df['class']!=2]
df = df[df['class']!=6]


In [4]:
# Splitting the data
X = df.drop(['class'], axis=1)  # Features excluding 'id' and 'class'
y = df['class']  # Target variable

In [5]:
X

Unnamed: 0,ndvi_4_month,ndvi_5_month,ndvi_6_month,ndvi_7_month,ndvi_8_month,elevation_contour,soil_name_Аллювиальные луговые тугайных лесов и кустарников,soil_name_Горно-долинные каштановые и темно-каштановые,"soil_name_Горно-долинные светло-каштановые, местами в сочетании с лугово-светло-каштановыми слабозасоленными",soil_name_Горные светло-каштановые,...,"soil_name_Сероземы северные (малокарбонатные) светлые, местами слабо- и среднесолончаковатые",district_name_Chui district,district_name_Kemin district,district_name_Moskva district,district_name_Panfilov district,district_name_Sokuluk district,district_name_Yssyk-Ata district,district_name_Zhayil district,mean_ndvi,mean_z_score
1,0.00725,0.17225,0.17550,0.023250,-0.019500,656,False,False,False,False,...,False,False,False,False,True,False,False,False,0.071750,-0.003184
3,0.01625,0.19625,0.07575,0.034000,0.098500,685,False,False,False,False,...,False,False,False,False,True,False,False,False,0.084150,0.047033
4,0.01275,0.10075,0.14975,0.028500,-0.012000,692,False,False,False,False,...,False,False,False,False,True,False,False,False,0.055950,-0.142433
9,0.00250,0.18300,0.15000,0.049200,-0.008875,687,False,False,False,False,...,False,False,False,False,False,False,False,True,0.075165,0.014300
10,-0.01820,-0.01100,-0.01080,0.093375,0.295750,686,False,False,False,False,...,False,False,False,False,False,False,False,True,0.069825,-0.114996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,0.06950,0.04525,0.10100,0.104500,0.076750,770,False,False,False,False,...,False,True,False,False,False,False,False,False,0.079400,0.111944
115,0.07975,0.04750,0.03925,0.079500,0.083000,1030,False,False,False,False,...,False,True,False,False,False,False,False,False,0.065800,-0.048906
118,-0.02575,0.01800,0.01950,0.128750,0.220000,979,False,False,False,False,...,False,False,True,False,False,False,False,False,0.072100,-0.071330
119,-0.01475,0.01275,0.02875,0.091250,0.201250,982,False,False,False,False,...,False,False,True,False,False,False,False,False,0.063850,-0.141674


In [6]:
y.value_counts()

class
3    29
4    24
5    20
Name: count, dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

In [8]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]
criterion = ['gini', 'entropy']
class_weight = ['balanced', 'balanced_subsample', None]


In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are already defined
unique_classes = set(y_train)
binary_classifications = {}
evaluation_metrics = {}
trained_classifiers = {}

param_dist = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion,
               'class_weight':class_weight 
}

for u_class in unique_classes:
    # Convert the labels for one-vs-all classification
    y_train_binary = [1 if label == u_class else 0 for label in y_train]
    y_test_binary = [1 if label == u_class else 0 for label in y_test]

    # Train a Gradient Boosting classifier using Randomized Search CV
    gbm = RandomForestClassifier()
    random_search = RandomizedSearchCV(
        gbm, param_distributions=param_dist, n_iter=200, scoring='accuracy', 
        cv=5, verbose=0,  n_jobs=-1
    )
    random_search.fit(X_train, y_train_binary)
    
    # Storing the trained classifier
    trained_classifiers[u_class] = random_search.best_estimator_
    
    # Evaluate on the test set
    y_pred = random_search.predict(X_test)
    test_accuracy = accuracy_score(y_test_binary, y_pred)
    evaluation_metrics[u_class] = {
        "Best Parameters": random_search.best_params_,
        "Best CV Score": random_search.best_score_,
        "Test Accuracy": test_accuracy
    }

    # Print results
    print(f"Class {u_class} - Best Parameters:", random_search.best_params_)
    print(f"Class {u_class} - Best CV Score:", random_search.best_score_)
    print(f"Class {u_class} - Test Set Score:", test_accuracy)


Class 3 - Best Parameters: {'n_estimators': 48, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'entropy', 'class_weight': None, 'bootstrap': True}
Class 3 - Best CV Score: 0.7846153846153847
Class 3 - Test Set Score: 0.75
Class 4 - Best Parameters: {'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'entropy', 'class_weight': 'balanced_subsample', 'bootstrap': False}
Class 4 - Best CV Score: 0.876923076923077
Class 4 - Test Set Score: 0.625
Class 5 - Best Parameters: {'n_estimators': 64, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 4, 'criterion': 'gini', 'class_weight': 'balanced', 'bootstrap': True}
Class 5 - Best CV Score: 0.7846153846153847
Class 5 - Test Set Score: 0.75


In [10]:
trained_classifiers

{3: RandomForestClassifier(criterion='entropy', max_depth=4, min_samples_leaf=2,
                        min_samples_split=5, n_estimators=48),
 4: RandomForestClassifier(bootstrap=False, class_weight='balanced_subsample',
                        criterion='entropy', max_depth=4, min_samples_leaf=2,
                        n_estimators=10),
 5: RandomForestClassifier(class_weight='balanced', max_depth=4,
                        min_samples_split=5, n_estimators=64)}

In [11]:
probabilities_batch = {u_class: [] for u_class in trained_classifiers.keys()}

for u_class, gbm in trained_classifiers.items():
    probs = gbm.predict_proba(X_test)[:, 1]
    probabilities_batch[u_class] = probs

final_class_predictions = []
for i in range(len(X_test)):
    final_class = max(probabilities_batch, key=lambda x: probabilities_batch[x][i])
    final_class_predictions.append(final_class)

In [23]:
y_test.unique()

array([3, 4, 5])

In [12]:
accuracy_score(final_class_predictions, y_test)

0.5

In [13]:
probabilities_batch = {u_class: [] for u_class in trained_classifiers.keys()}

for u_class, gbm in trained_classifiers.items():
    probs = gbm.predict_proba(X_train)[:, 1]
    probabilities_batch[u_class] = probs

final_class_predictions = []
for i in range(len(X_train)):
    final_class = max(probabilities_batch, key=lambda x: probabilities_batch[x][i])
    final_class_predictions.append(final_class)

In [14]:
accuracy_score(final_class_predictions, y_train)

0.9538461538461539

In [15]:
probabilities_batch

{3: array([0.71032739, 0.88828724, 0.24646706, 0.37580523, 0.76221869,
        0.15922727, 0.16948168, 0.68862955, 0.78126324, 0.16711025,
        0.02866342, 0.7848493 , 0.13763438, 0.41968962, 0.66994227,
        0.40173531, 0.48239812, 0.1352709 , 0.8365768 , 0.23268431,
        0.17546118, 0.81502627, 0.25008701, 0.79893034, 0.40325859,
        0.7951279 , 0.77630229, 0.28550505, 0.4920891 , 0.17834925,
        0.16328277, 0.14779088, 0.74887975, 0.16817959, 0.29831973,
        0.63726938, 0.04183743, 0.42808717, 0.46358086, 0.69360336,
        0.57660137, 0.12804062, 0.71739694, 0.0456949 , 0.59425629,
        0.18346666, 0.76069292, 0.84808181, 0.77770456, 0.03416216,
        0.25413121, 0.21987721, 0.29750765, 0.18330853, 0.01762671,
        0.2579006 , 0.12694546, 0.43161197, 0.26009463, 0.60071034,
        0.08250045, 0.08402218, 0.62265694, 0.1520193 , 0.14512795]),
 4: array([0.        , 0.        , 0.92953437, 0.1088138 , 0.        ,
        0.        , 0.75732155, 0.      

In [16]:
# import joblib

# for class_label, model in trained_classifiers.items():
#     filename = f'model_{class_label}.pkl'
#     joblib.dump(model, filename)


In [17]:
# loaded_models = {}
# for class_label in trained_classifiers.keys():
#     filename = f'model_{class_label}.pkl'
#     loaded_models[class_label] = joblib.load(filename)

# # Example of making a prediction
# sample_data = X_test[:1] # Replace with actual data
# predictions = {class_label: model.predict(sample_data) for class_label, model in loaded_models.items()}
# 

In [18]:
# probabilities_batch = {u_class: [] for u_class in loaded_models.keys()}
# # 
# # Get probabilities for the positive class
# for u_class, model in loaded_models.items():
#     # Getting probabilities for all samples in A
#     probs = model.predict_proba(X_train[:1])  # Assuming index 1 is the positive class
#     probabilities_batch[u_class] = probs

In [19]:
# x_train[:1]

In [20]:
# import numpy as np
# import re

# # Your input string
# probabilities_batch_str = str(probabilities_batch)

# # Extracting data using regex
# matches = re.findall(r'(\d+): array\(\[\[(.+?)\]\]\)', probabilities_batch_str)

# # Converting to dictionary with numpy arrays
# probabilities_batch = {int(u_class): np.array([float(num) for num in probs.split(', ')]) for u_class, probs in matches}

# # Now, probabilities_batch is a dictionary with class labels as keys and numpy arrays as values

# # Extract the probabilities of the positive class for each model
# positive_class_probs = {u_class: probs[1] for u_class, probs in probabilities_batch.items()}

# # Finding the class with the maximum probability of the positive class
# final_class = max(positive_class_probs, key=positive_class_probs.get)

In [21]:
# final_class

In [22]:
X[:1].to_json('kanaloh.json')