In [110]:
import pandas as pd
import random

# Build a synthetic dataset of N_ROWS people.
# The generator is seeded so that Restart & Run All reproduces exactly the
# same frame (and therefore the same split and the same scores downstream).
SEED = 42
N_ROWS = 1000
random.seed(SEED)

# NOTE: 'target' is drawn independently of every other column, so the
# features carry no information about it; a classifier trained on this data
# is expected to score around 0.5 regardless of preprocessing or tuning.
data = {'salary': [random.randint(30000, 80000) for _ in range(N_ROWS)],
        'city': [random.choice(['Bishkek', 'London', 'Moscow']) for _ in range(N_ROWS)],
        'age': [random.randint(30, 65) for _ in range(N_ROWS)],
        'vacation_prefer': [random.choice(['Shopping', 'Beach holiday']) for _ in range(N_ROWS)],
        'transport_prefer': [random.choice(['auto', 'plane']) for _ in range(N_ROWS)],
        'target': [random.choice(['London', 'Moscow']) for _ in range(N_ROWS)]}

df = pd.DataFrame(data)


In [111]:
# One-hot encode the categorical feature columns. drop_first=True omits the
# first level of each encoded column; 'target' is not listed, so it is left
# as a plain label column.
categorical_columns = ['city', 'vacation_prefer', 'transport_prefer']
df_encoded = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
)

In [112]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
y = df_encoded['target']
X = df_encoded.drop(columns='target')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [113]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters. random_state is pinned so the
# forest's internal randomness, and hence every score reported below, is the
# same on each run of the notebook.
model = RandomForestClassifier(random_state=42)


In [114]:
# Train on the training split and report mean accuracy on the held-out split
# (fit returns the estimator itself, so the two calls can be chained).
accuracy = model.fit(X_train, y_train).score(X_test, y_test)
print(f'Model Accuracy: {accuracy:.2f}')


Model Accuracy: 0.51


In [115]:
# Build a single synthetic person to score with the trained model. Only the
# numeric attributes and the one-hot flags that are switched on are listed.
random_data = {
    'salary': [random.randint(30000, 80000)],
    'age': [random.randint(30, 65)],
    'vacation_prefer_Shopping': [1],
    'transport_prefer_plane': [1]
}

# Align the row with exactly the feature columns the model was trained on.
# Any training column not listed above (here: the city dummies) is filled
# with 0, i.e. the person falls into the baseline level that drop_first=True
# removed during encoding. This avoids hard-coding dummy column names that
# may not exist in X and avoids a KeyError if the encoding changes.
random_df = pd.DataFrame(random_data).reindex(columns=X.columns, fill_value=0)

prediction = model.predict(random_df)
print(f'Predicted Target: {prediction[0]}')


Predicted Target: London


In [116]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, GridSearchCV

# Numeric features to rescale; the remaining columns are one-hot flags and
# are left as they are.
numeric_features = ['salary', 'age']

scaler = MinMaxScaler()

# Work on explicit copies before assigning columns: the split results are
# selections from X, and writing into them can trigger pandas'
# SettingWithCopyWarning on some pandas / scikit-learn versions.
X_train = X_train.copy()
X_test = X_test.copy()

# Min-max scaling (not standardization): the range is learned from the
# training split only and then reused for the test split.
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Refit and re-evaluate on the scaled features.
model.fit(X_train, y_train)
accuracy_after_scaling = model.score(X_test, y_test)
print(f'Model Accuracy after Scaling: {accuracy_after_scaling:.2f}')

# Re-predict the random sample after applying the already-fitted scaler.
# random_df was produced by a column selection, so take an explicit copy
# before assigning into it to avoid pandas' SettingWithCopyWarning.
random_df = random_df.copy()
random_df[numeric_features] = scaler.transform(random_df[numeric_features])
prediction_after_scaling = model.predict(random_df)
print(f'Predicted Target after Scaling: {prediction_after_scaling[0]}')

# Hyperparameter grid to explore: 3 x 3 x 3 = 27 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}

# 5-fold cross-validation of the current model on the scaled training split.
cv_scores_after_scaling = cross_val_score(model, X_train, y_train, cv=5)
mean_cv_score_after_scaling = cv_scores_after_scaling.mean()

# Exhaustive grid search, also with 5-fold cross-validation
# (fit returns the search object itself).
grid_search_after_scaling = GridSearchCV(model, param_grid, cv=5).fit(X_train, y_train)

print(f'Mean Cross-Validation Score after Scaling: {mean_cv_score_after_scaling:.2f}')
print(f'Best Parameters after Scaling: {grid_search_after_scaling.best_params_}')

# Score the best estimator found by the search on the held-out test split.
best_model_after_scaling = grid_search_after_scaling.best_estimator_
best_accuracy_after_scaling = best_model_after_scaling.score(X_test, y_test)
print(f'Improved Model Accuracy after Scaling: {best_accuracy_after_scaling:.2f}')


Model Accuracy after Scaling: 0.49
Predicted Target after Scaling: London
Mean Cross-Validation Score after Scaling: 0.51
Best Parameters after Scaling: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Improved Model Accuracy after Scaling: 0.48


<h1>Результат</h1>
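Улучшить модель с помощью масштабирования, перекрёстной проверки и решётчатого поиска не получилось: точность осталась на уровне около 0.5. Причина — в качестве исходных данных: целевая переменная сгенерирована случайно и не зависит от признаков, поэтому модели нечему учиться.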