In [None]:
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import optuna
import time
import re
import warnings

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vlade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vlade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vlade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
warnings.filterwarnings("ignore")

In [None]:
data = pd.read_csv("IMDB Dataset.csv", on_bad_lines='skip')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [None]:
def preprocess_text(text):
    # Удаление специальных символов и приведение к нижнему регистру
    text = re.sub(r'\W', ' ', text)
    #text = text.lower()

    # Токенизация
    tokens = nltk.word_tokenize(text)

    # Удаление стоп-слов
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Лемматизация
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

In [None]:
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
data['sentiment']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [None]:
preprocessed_texts = [preprocess_text(text) for text in data['review']]

### TF-IDF

In [None]:
# Векторизация с помощью TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_texts)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2, random_state=42)

In [None]:
# Обучение модели логистической регрессии
clf = LogisticRegression(max_iter=100)
clf.fit(X_train, y_train)



# Предсказание меток для тестовых данных
y_pred = clf.predict(X_test)

# Оценка точности модели
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[4369  592]
 [ 443 4596]]


In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
model = RandomForestClassifier(n_estimators=1000,
                              criterion='log_loss',
                              max_depth=20,
                              min_samples_split=10,
                              min_samples_leaf=1,
                              random_state = 0)


model.fit(X_train, y_train)

y_pred = model.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f'Accuracy(train) = {accuracy}')

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy(test) = {accuracy}')

Accuracy(train) = 0.91985
Accuracy(test) = 0.8511


In [None]:
def objective(trial):
    start_time = time.time()
    # Определяем гиперпараметры, которые будет оптимизировать Optuna
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])

    # Создаем модель RandomForestClassifier с подобранными параметрами
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0
    )

    # Обучаем модель
    model.fit(X_train, y_train)

    # Предсказываем и оцениваем точность на тестовой выборке
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    end_time = time.time()
    print(f'Время: {(end_time - start_time)/60:.2f} минут')

    return accuracy

# Создаем объект Optuna и запускаем оптимизацию
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)



[I 2024-08-22 18:50:33,422] A new study created in memory with name: no-name-2e362d7b-6ff7-442f-bfe3-817d57a1b136
[I 2024-08-22 18:52:55,706] Trial 0 finished with value: 0.8532 and parameters: {'n_estimators': 584, 'max_depth': 26, 'min_samples_split': 19, 'min_samples_leaf': 9, 'criterion': 'entropy'}. Best is trial 0 with value: 0.8532.


Время: 2.37 минут


[I 2024-08-22 18:55:00,755] Trial 1 finished with value: 0.8535 and parameters: {'n_estimators': 507, 'max_depth': 28, 'min_samples_split': 5, 'min_samples_leaf': 8, 'criterion': 'gini'}. Best is trial 1 with value: 0.8535.


Время: 2.08 минут


[I 2024-08-22 18:57:16,792] Trial 2 finished with value: 0.8523 and parameters: {'n_estimators': 563, 'max_depth': 27, 'min_samples_split': 3, 'min_samples_leaf': 9, 'criterion': 'log_loss'}. Best is trial 1 with value: 0.8535.


Время: 2.27 минут


[I 2024-08-22 18:58:34,910] Trial 3 finished with value: 0.8523 and parameters: {'n_estimators': 356, 'max_depth': 22, 'min_samples_split': 7, 'min_samples_leaf': 4, 'criterion': 'gini'}. Best is trial 1 with value: 0.8535.


Время: 1.30 минут


[I 2024-08-22 19:00:22,110] Trial 4 finished with value: 0.8501 and parameters: {'n_estimators': 731, 'max_depth': 15, 'min_samples_split': 8, 'min_samples_leaf': 4, 'criterion': 'gini'}. Best is trial 1 with value: 0.8535.


Время: 1.79 минут


[I 2024-08-22 19:01:00,621] Trial 5 finished with value: 0.8496 and parameters: {'n_estimators': 165, 'max_depth': 22, 'min_samples_split': 9, 'min_samples_leaf': 2, 'criterion': 'gini'}. Best is trial 1 with value: 0.8535.


Время: 0.64 минут


[I 2024-08-22 19:01:19,775] Trial 6 finished with value: 0.8389 and parameters: {'n_estimators': 382, 'max_depth': 5, 'min_samples_split': 14, 'min_samples_leaf': 5, 'criterion': 'entropy'}. Best is trial 1 with value: 0.8535.


Время: 0.32 минут


[I 2024-08-22 19:05:45,024] Trial 7 finished with value: 0.8574 and parameters: {'n_estimators': 855, 'max_depth': 29, 'min_samples_split': 5, 'min_samples_leaf': 4, 'criterion': 'log_loss'}. Best is trial 7 with value: 0.8574.


Время: 4.42 минут


[I 2024-08-22 19:07:47,080] Trial 8 finished with value: 0.8492 and parameters: {'n_estimators': 717, 'max_depth': 17, 'min_samples_split': 9, 'min_samples_leaf': 5, 'criterion': 'entropy'}. Best is trial 7 with value: 0.8574.


Время: 2.03 минут


[I 2024-08-22 19:08:08,869] Trial 9 finished with value: 0.8397 and parameters: {'n_estimators': 200, 'max_depth': 11, 'min_samples_split': 7, 'min_samples_leaf': 6, 'criterion': 'entropy'}. Best is trial 7 with value: 0.8574.


Время: 0.36 минут


[I 2024-08-22 19:12:27,668] Trial 10 finished with value: 0.8568 and parameters: {'n_estimators': 992, 'max_depth': 23, 'min_samples_split': 13, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 7 with value: 0.8574.


Время: 4.31 минут


[I 2024-08-22 19:18:19,430] Trial 11 finished with value: 0.8593 and parameters: {'n_estimators': 988, 'max_depth': 30, 'min_samples_split': 13, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 11 with value: 0.8593.


Время: 5.86 минут


[I 2024-08-22 19:23:19,041] Trial 12 finished with value: 0.8582 and parameters: {'n_estimators': 977, 'max_depth': 28, 'min_samples_split': 16, 'min_samples_leaf': 2, 'criterion': 'log_loss'}. Best is trial 11 with value: 0.8593.


Время: 4.99 минут


[I 2024-08-22 19:28:58,265] Trial 13 finished with value: 0.8612 and parameters: {'n_estimators': 983, 'max_depth': 30, 'min_samples_split': 17, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 5.65 минут


[I 2024-08-22 19:33:51,495] Trial 14 finished with value: 0.8589 and parameters: {'n_estimators': 869, 'max_depth': 30, 'min_samples_split': 20, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.89 минут


[I 2024-08-22 19:37:34,922] Trial 15 finished with value: 0.8558 and parameters: {'n_estimators': 841, 'max_depth': 25, 'min_samples_split': 17, 'min_samples_leaf': 2, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 3.72 минут


[I 2024-08-22 19:39:33,104] Trial 16 finished with value: 0.8481 and parameters: {'n_estimators': 737, 'max_depth': 12, 'min_samples_split': 12, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 1.97 минут


[I 2024-08-22 19:43:35,134] Trial 17 finished with value: 0.8525 and parameters: {'n_estimators': 996, 'max_depth': 20, 'min_samples_split': 15, 'min_samples_leaf': 7, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.03 минут


[I 2024-08-22 19:50:55,767] Trial 18 finished with value: 0.8592 and parameters: {'n_estimators': 882, 'max_depth': 30, 'min_samples_split': 18, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 7.34 минут


[I 2024-08-22 19:54:34,067] Trial 19 finished with value: 0.8553 and parameters: {'n_estimators': 653, 'max_depth': 25, 'min_samples_split': 10, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 3.64 минут


[I 2024-08-22 19:58:21,830] Trial 20 finished with value: 0.8526 and parameters: {'n_estimators': 906, 'max_depth': 19, 'min_samples_split': 12, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 3.80 минут


[I 2024-08-22 20:05:29,832] Trial 21 finished with value: 0.8588 and parameters: {'n_estimators': 902, 'max_depth': 30, 'min_samples_split': 18, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 7.13 минут


[I 2024-08-22 20:12:28,472] Trial 22 finished with value: 0.8603 and parameters: {'n_estimators': 791, 'max_depth': 30, 'min_samples_split': 16, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 6.98 минут


[I 2024-08-22 20:17:05,882] Trial 23 finished with value: 0.8548 and parameters: {'n_estimators': 789, 'max_depth': 24, 'min_samples_split': 16, 'min_samples_leaf': 2, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.62 минут


[I 2024-08-22 20:23:51,915] Trial 24 finished with value: 0.8582 and parameters: {'n_estimators': 936, 'max_depth': 27, 'min_samples_split': 14, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 6.77 минут


[I 2024-08-22 20:25:08,608] Trial 25 finished with value: 0.8449 and parameters: {'n_estimators': 802, 'max_depth': 7, 'min_samples_split': 15, 'min_samples_leaf': 10, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 1.28 минут


[I 2024-08-22 20:29:31,689] Trial 26 finished with value: 0.8574 and parameters: {'n_estimators': 658, 'max_depth': 27, 'min_samples_split': 20, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.38 минут


[I 2024-08-22 20:37:20,702] Trial 27 finished with value: 0.8595 and parameters: {'n_estimators': 942, 'max_depth': 30, 'min_samples_split': 11, 'min_samples_leaf': 2, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 7.82 минут


[I 2024-08-22 20:41:02,066] Trial 28 finished with value: 0.8536 and parameters: {'n_estimators': 801, 'max_depth': 20, 'min_samples_split': 17, 'min_samples_leaf': 2, 'criterion': 'entropy'}. Best is trial 13 with value: 0.8612.


Время: 3.69 минут


[I 2024-08-22 20:44:00,571] Trial 29 finished with value: 0.8559 and parameters: {'n_estimators': 619, 'max_depth': 26, 'min_samples_split': 11, 'min_samples_leaf': 2, 'criterion': 'gini'}. Best is trial 13 with value: 0.8612.


Время: 2.97 минут


[I 2024-08-22 20:45:57,546] Trial 30 finished with value: 0.8527 and parameters: {'n_estimators': 509, 'max_depth': 25, 'min_samples_split': 19, 'min_samples_leaf': 6, 'criterion': 'entropy'}. Best is trial 13 with value: 0.8612.


Время: 1.95 минут


[I 2024-08-22 20:51:20,491] Trial 31 finished with value: 0.8587 and parameters: {'n_estimators': 943, 'max_depth': 30, 'min_samples_split': 13, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 5.38 минут


[I 2024-08-22 20:56:16,684] Trial 32 finished with value: 0.8591 and parameters: {'n_estimators': 931, 'max_depth': 28, 'min_samples_split': 11, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.94 минут


[I 2024-08-22 21:01:12,538] Trial 33 finished with value: 0.8562 and parameters: {'n_estimators': 995, 'max_depth': 28, 'min_samples_split': 14, 'min_samples_leaf': 2, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.93 минут


[I 2024-08-22 21:05:11,276] Trial 34 finished with value: 0.8585 and parameters: {'n_estimators': 822, 'max_depth': 29, 'min_samples_split': 15, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 3.98 минут


[I 2024-08-22 21:08:35,874] Trial 35 finished with value: 0.8533 and parameters: {'n_estimators': 924, 'max_depth': 26, 'min_samples_split': 12, 'min_samples_leaf': 8, 'criterion': 'gini'}. Best is trial 13 with value: 0.8612.


Время: 3.41 минут


[I 2024-08-22 21:12:24,838] Trial 36 finished with value: 0.86 and parameters: {'n_estimators': 746, 'max_depth': 28, 'min_samples_split': 17, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 3.82 минут


[I 2024-08-22 21:15:07,874] Trial 37 finished with value: 0.8538 and parameters: {'n_estimators': 763, 'max_depth': 23, 'min_samples_split': 18, 'min_samples_leaf': 4, 'criterion': 'gini'}. Best is trial 13 with value: 0.8612.


Время: 2.72 минут


[I 2024-08-22 21:18:39,666] Trial 38 finished with value: 0.8572 and parameters: {'n_estimators': 701, 'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 2, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 3.53 минут


[I 2024-08-22 21:20:12,882] Trial 39 finished with value: 0.852 and parameters: {'n_estimators': 363, 'max_depth': 27, 'min_samples_split': 17, 'min_samples_leaf': 4, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 1.55 минут


[I 2024-08-22 21:22:47,013] Trial 40 finished with value: 0.8565 and parameters: {'n_estimators': 541, 'max_depth': 29, 'min_samples_split': 19, 'min_samples_leaf': 3, 'criterion': 'entropy'}. Best is trial 13 with value: 0.8612.


Время: 2.57 минут


[I 2024-08-22 21:27:34,598] Trial 41 finished with value: 0.8596 and parameters: {'n_estimators': 849, 'max_depth': 30, 'min_samples_split': 16, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.79 минут


[I 2024-08-22 21:32:16,288] Trial 42 finished with value: 0.8591 and parameters: {'n_estimators': 848, 'max_depth': 29, 'min_samples_split': 16, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.69 минут


[I 2024-08-22 21:35:53,127] Trial 43 finished with value: 0.8561 and parameters: {'n_estimators': 760, 'max_depth': 26, 'min_samples_split': 9, 'min_samples_leaf': 2, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 3.61 минут


[I 2024-08-22 21:37:33,796] Trial 44 finished with value: 0.8504 and parameters: {'n_estimators': 688, 'max_depth': 14, 'min_samples_split': 16, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 1.68 минут


[I 2024-08-22 21:39:57,007] Trial 45 finished with value: 0.8555 and parameters: {'n_estimators': 406, 'max_depth': 30, 'min_samples_split': 18, 'min_samples_leaf': 1, 'criterion': 'gini'}. Best is trial 13 with value: 0.8612.


Время: 2.39 минут


[I 2024-08-22 21:44:21,135] Trial 46 finished with value: 0.8577 and parameters: {'n_estimators': 877, 'max_depth': 27, 'min_samples_split': 7, 'min_samples_leaf': 2, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 4.40 минут


[I 2024-08-22 21:49:23,767] Trial 47 finished with value: 0.8588 and parameters: {'n_estimators': 959, 'max_depth': 28, 'min_samples_split': 15, 'min_samples_leaf': 1, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 5.04 минут


[I 2024-08-22 21:51:21,954] Trial 48 finished with value: 0.8566 and parameters: {'n_estimators': 434, 'max_depth': 29, 'min_samples_split': 17, 'min_samples_leaf': 5, 'criterion': 'log_loss'}. Best is trial 13 with value: 0.8612.


Время: 1.97 минут


[I 2024-08-22 21:53:40,939] Trial 49 finished with value: 0.8554 and parameters: {'n_estimators': 598, 'max_depth': 22, 'min_samples_split': 10, 'min_samples_leaf': 2, 'criterion': 'entropy'}. Best is trial 13 with value: 0.8612.


Время: 2.32 минут


In [None]:
print("Best trial:")
trial = study.best_trial

print(f"  Accuracy: {trial.value}")
print("  Best hyperparameters: ", trial.params)

best_model = RandomForestClassifier(**trial.params, random_state=0)
best_model.fit(X_train, y_train)

y_train_pred = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Accuracy(train) = {train_accuracy}')

y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Accuracy(test) = {test_accuracy}')

Best trial:
  Accuracy: 0.8612
  Best hyperparameters:  {'n_estimators': 983, 'max_depth': 30, 'min_samples_split': 17, 'min_samples_leaf': 1, 'criterion': 'log_loss'}
Accuracy(train) = 0.94625
Accuracy(test) = 0.8612


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
svc = SVC()
start_time = time.time()
# Определение параметров для поиска
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)
print("Наилучший результат:", grid_search.best_score_)

best_svc = grid_search.best_estimator_

y_pred = best_svc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

end_time = time.time()
print(f'Время: {(end_time - start_time)/60:.2f} минут')

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Лучшие параметры: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Наилучший результат: 0.8967999248728781
Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.92      0.91      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[4416  545]
 [ 416 4623]]
Время: 150.49 минут


### CountVectorizer

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_texts)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2, random_state=42)

In [None]:
# Обучение модели логистической регрессии
clf = LogisticRegression(max_iter=100)
clf.fit(X_train, y_train)



# Предсказание меток для тестовых данных
y_pred = clf.predict(X_test)

# Оценка точности модели
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      4961
           1       0.88      0.89      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

[[4359  602]
 [ 557 4482]]


In [None]:
model = RandomForestClassifier(n_estimators=1000,
                              criterion='log_loss',
                              max_depth=20,
                              min_samples_split=10,
                              min_samples_leaf=1,
                              random_state = 0)


model.fit(X_train, y_train)

y_pred = model.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f'Accuracy(train) = {accuracy}')

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy(test) = {accuracy}')

Accuracy(train) = 0.911275
Accuracy(test) = 0.8641


In [None]:
def objective(trial):
    start_time = time.time()
    # Определяем гиперпараметры, которые будет оптимизировать Optuna
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])

    # Создаем модель RandomForestClassifier с подобранными параметрами
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0
    )

    # Обучаем модель
    model.fit(X_train, y_train)

    # Предсказываем и оцениваем точность на тестовой выборке
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    end_time = time.time()
    print(f'Время: {(end_time - start_time)/60:.2f} минут')

    return accuracy

# Создаем объект Optuna и запускаем оптимизацию
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)



[I 2024-08-23 11:23:26,429] A new study created in memory with name: no-name-4e09598f-3d9c-4c13-b3b7-312d4af330b4
[I 2024-08-23 11:26:16,477] Trial 0 finished with value: 0.8613 and parameters: {'n_estimators': 897, 'max_depth': 19, 'min_samples_split': 20, 'min_samples_leaf': 1, 'criterion': 'gini'}. Best is trial 0 with value: 0.8613.


Время: 2.83 минут


[I 2024-08-23 11:27:57,995] Trial 1 finished with value: 0.8582 and parameters: {'n_estimators': 644, 'max_depth': 18, 'min_samples_split': 17, 'min_samples_leaf': 7, 'criterion': 'log_loss'}. Best is trial 0 with value: 0.8613.


Время: 1.69 минут


[I 2024-08-23 11:29:51,229] Trial 2 finished with value: 0.8604 and parameters: {'n_estimators': 578, 'max_depth': 24, 'min_samples_split': 4, 'min_samples_leaf': 9, 'criterion': 'gini'}. Best is trial 0 with value: 0.8613.


Время: 1.89 минут


[I 2024-08-23 11:30:55,118] Trial 3 finished with value: 0.8575 and parameters: {'n_estimators': 288, 'max_depth': 26, 'min_samples_split': 18, 'min_samples_leaf': 6, 'criterion': 'gini'}. Best is trial 0 with value: 0.8613.


Время: 1.06 минут


[I 2024-08-23 11:31:25,692] Trial 4 finished with value: 0.8485 and parameters: {'n_estimators': 441, 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 0 with value: 0.8613.


Время: 0.51 минут


[I 2024-08-23 11:31:59,690] Trial 5 finished with value: 0.85 and parameters: {'n_estimators': 445, 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 7, 'criterion': 'log_loss'}. Best is trial 0 with value: 0.8613.


Время: 0.57 минут


[I 2024-08-23 11:32:46,748] Trial 6 finished with value: 0.8559 and parameters: {'n_estimators': 288, 'max_depth': 19, 'min_samples_split': 7, 'min_samples_leaf': 7, 'criterion': 'gini'}. Best is trial 0 with value: 0.8613.


Время: 0.78 минут


[I 2024-08-23 11:33:17,174] Trial 7 finished with value: 0.8553 and parameters: {'n_estimators': 158, 'max_depth': 21, 'min_samples_split': 20, 'min_samples_leaf': 3, 'criterion': 'gini'}. Best is trial 0 with value: 0.8613.


Время: 0.51 минут


[I 2024-08-23 11:34:15,601] Trial 8 finished with value: 0.8533 and parameters: {'n_estimators': 980, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 10, 'criterion': 'log_loss'}. Best is trial 0 with value: 0.8613.


Время: 0.97 минут


[I 2024-08-23 11:34:53,999] Trial 9 finished with value: 0.8559 and parameters: {'n_estimators': 401, 'max_depth': 11, 'min_samples_split': 10, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 0 with value: 0.8613.


Время: 0.64 минут


[I 2024-08-23 11:40:11,467] Trial 10 finished with value: 0.8681 and parameters: {'n_estimators': 990, 'max_depth': 30, 'min_samples_split': 14, 'min_samples_leaf': 1, 'criterion': 'entropy'}. Best is trial 10 with value: 0.8681.


Время: 5.29 минут


[I 2024-08-23 11:45:27,549] Trial 11 finished with value: 0.8683 and parameters: {'n_estimators': 985, 'max_depth': 30, 'min_samples_split': 14, 'min_samples_leaf': 1, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8683.


Время: 5.27 минут


[I 2024-08-23 11:49:38,021] Trial 12 finished with value: 0.8674 and parameters: {'n_estimators': 777, 'max_depth': 30, 'min_samples_split': 14, 'min_samples_leaf': 1, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8683.


Время: 4.17 минут


[I 2024-08-23 11:53:28,704] Trial 13 finished with value: 0.8653 and parameters: {'n_estimators': 791, 'max_depth': 29, 'min_samples_split': 13, 'min_samples_leaf': 2, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8683.


Время: 3.84 минут


[I 2024-08-23 11:55:30,645] Trial 14 finished with value: 0.8575 and parameters: {'n_estimators': 997, 'max_depth': 14, 'min_samples_split': 15, 'min_samples_leaf': 4, 'criterion': 'entropy'}. Best is trial 11 with value: 0.8683.


Время: 2.03 минут


In [None]:
print("Best trial:")
trial = study.best_trial

print(f"  Accuracy: {trial.value}")
print("  Best hyperparameters: ", trial.params)

best_model = RandomForestClassifier(**trial.params, random_state=0)
best_model.fit(X_train, y_train)

y_train_pred = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Accuracy(train) = {train_accuracy}')

y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Accuracy(test) = {test_accuracy}')

Best trial:
  Accuracy: 0.8683
  Best hyperparameters:  {'n_estimators': 985, 'max_depth': 30, 'min_samples_split': 14, 'min_samples_leaf': 1, 'criterion': 'entropy'}
Accuracy(train) = 0.9367
Accuracy(test) = 0.8683
