In [2]:
import pickle
def loadfile(filename):
    with open(f'{filename}.pickle', 'rb') as fp:
        data = pickle.load(fp)
    return data

In [3]:
import numpy as np

X_train = loadfile('X_train')
X_test = loadfile('X_test')
X_val = loadfile('X_val')
y_train = loadfile('y_train')
y_test = loadfile('y_test')
y_val = loadfile('y_val')

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(X_val.shape)
print(y_val.shape)

(57863, 11)
(57863,)
(19288, 11)
(19288,)
(19288, 11)
(19288,)


In [5]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score

model = RandomForestClassifier()
model.fit(X_train, y_train)
predict = model.predict(X_val)
print(classification_report(y_val, predict))
balanced_accuracy_score(y_val, predict)
print(f'f_0.5: {fbeta_score(y_val, predict, beta=0.5)}')
print(f'f_2: {fbeta_score(y_val, predict, beta=2)}')

              precision    recall  f1-score   support

           0       0.74      0.18      0.29      1600
           1       0.93      0.99      0.96     17688

    accuracy                           0.93     19288
   macro avg       0.84      0.59      0.62     19288
weighted avg       0.91      0.93      0.91     19288

f_0.5: 0.942604501607717
f_2: 0.9809268863978584


In [7]:
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

# Assuming X_train and y_train are your training data and labels
# X_train.shape should be (n_samples, n_features)
# y_train.shape should be (n_samples,)

# Define the undersampling ratio
desired_ratio = {0: np.sum(y_train == 0), 1: int(4*np.sum(y_train == 0))}

# Random undersampling
undersampler = RandomUnderSampler(sampling_strategy=desired_ratio, random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

In [8]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score

model = RandomForestClassifier()
model.fit(X_train_resampled, y_train_resampled)
predict = model.predict(X_val)
print(classification_report(y_val, predict))
balanced_accuracy_score(y_val, predict)
print(f'f_0.5: {fbeta_score(y_val, predict, beta=0.5)}')
print(f'f_2: {fbeta_score(y_val, predict, beta=2)}')


              precision    recall  f1-score   support

           0       0.46      0.36      0.40      1600
           1       0.94      0.96      0.95     17688

    accuracy                           0.91     19288
   macro avg       0.70      0.66      0.68     19288
weighted avg       0.90      0.91      0.91     19288

f_0.5: 0.9466711200178134
f_2: 0.9577068198456946


In [17]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train_resampled, y_train_resampled)
predict = model.predict(X_test)
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.44      0.36      0.40      1510
           1       0.95      0.96      0.95     17778

    accuracy                           0.91     19288
   macro avg       0.69      0.66      0.68     19288
weighted avg       0.91      0.91      0.91     19288



In [44]:
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier()
param_grid = {
    'n_estimators': [60, 80, 100],
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [1, 2, 3],
}

grid_search = GridSearchCV(model, param_grid, cv=10, scoring= 'f1_micro')
grid_search.fit(X_train_resampled, y_train_resampled)
print(grid_search.best_params_)

{'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 100}


In [45]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=2) #param o tren kia
model.fit(X_train_resampled, y_train_resampled)
predict = model.predict(X_val)
print(classification_report(y_val, predict))

              precision    recall  f1-score   support

           0       0.76      0.12      0.21      1567
           1       0.93      1.00      0.96     17721

    accuracy                           0.93     19288
   macro avg       0.84      0.56      0.59     19288
weighted avg       0.91      0.93      0.90     19288



In [9]:
import pandas as pd

new_sample = [[2017, 3, 10, 4, 420.7, 1175, 1175, 69.82, 22, 5148, 5148]]
x = pd.DataFrame(new_sample)
    # x.columns = __columns

predict = model.predict(x)[0]
predict

1

In [8]:
import pickle
import joblib
pickle.dump(model, open('flask/model.pkl', 'wb'))
joblib.dump(model, open('flask/model.sav', 'wb'))


In [10]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
selector = SelectKBest(f_classif, k=11)
X_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
X_val_new = selector.transform(X_val)

In [12]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_selected, y_train_resampled)
predict = model.predict(X_val_new)
# model.score(X_selected, y_train_resampled)
print(classification_report(y_val, predict))
print(f'f_0.5: {fbeta_score(y_val, predict, beta=0.5)}')
print(f'f_2: {fbeta_score(y_val, predict, beta=2)}')

              precision    recall  f1-score   support

           0       0.46      0.37      0.41      1600
           1       0.94      0.96      0.95     17688

    accuracy                           0.91     19288
   macro avg       0.70      0.67      0.68     19288
weighted avg       0.90      0.91      0.91     19288

f_0.5: 0.9473684210526315
f_2: 0.9572957746478873


TEST TRÊN TẬP TEST

In [14]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
selector = SelectKBest(f_classif, k=11)
X_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_new = selector.transform(X_test)

In [16]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

model = RandomForestClassifier()
model.fit(X_selected, y_train_resampled)
predict = model.predict(X_test_new)
print(classification_report(y_test, predict))
print(f'f_0.5: {fbeta_score(y_test, predict, beta=0.5)}')
print(f'f_2: {fbeta_score(y_test, predict, beta=2)}')
print(f'ROC: {roc_auc_score(y_test, predict)}')

              precision    recall  f1-score   support

           0       0.44      0.35      0.39      1510
           1       0.95      0.96      0.95     17778

    accuracy                           0.91     19288
   macro avg       0.69      0.65      0.67     19288
weighted avg       0.91      0.91      0.91     19288

f_0.5: 0.9487941783329266
f_2: 0.9588135061208017
ROC: 0.6546101700218814
