In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# 1. Data uploading
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')
sample_submission_df = pd.read_csv('sample_submition.csv')

print(f"Размер обучающей выборки: {train_df.shape}")
print(f"Размер тестовой выборки: {test_df.shape}")

test_ids = test_df['id']

train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)

# 2. Preprocessing
categorical_features = ['feature_34', 'feature_35', 'feature_36', 'feature_37']
numerical_features = [col for col in train_df.columns if col not in categorical_features + ['target']]


train_df.replace(-999, np.nan, inplace=True)
test_df.replace(-999, np.nan, inplace=True)

for col in numerical_features:
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    test_df[col].fillna(median_val, inplace=True)

for col in categorical_features:
    mode_val = train_df[col].mode()[0]
    train_df[col].fillna(mode_val, inplace=True)
    test_df[col].fillna(mode_val, inplace=True)
    
# One-Hot Encoding
train_df = pd.get_dummies(train_df, columns=categorical_features, dummy_na=False)
test_df = pd.get_dummies(test_df, columns=categorical_features, dummy_na=False)

train_labels = train_df['target']
train_ids = train_df.index
test_ids_indices = test_df.index

y = train_df['target']
X = train_df.drop('target', axis=1)

train_cols = X.columns
test_cols = test_df.columns

missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    test_df[c] = 0

missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train:
    X[c] = 0

test_df = test_df[train_cols] # Упорядочиваем столбцы в тестовой выборке так же, как в обучающей

print("Категориальные признаки преобразованы.")
print(f"Итоговое количество признаков после обработки: {len(X.columns)}")

# 3. Model training
print("\nНачало обучения модели LightGBM...")

# Using LightGBM
lgb_clf = lgb.LGBMClassifier(random_state=42, n_estimators=200, learning_rate=0.05, num_leaves=31)

lgb_clf.fit(X, y)

print("Модель успешно обучена.")

# 4. Model testing

predictions = lgb_clf.predict(test_df)

# 5. Creating file

submission_df = pd.DataFrame({'id': test_ids, 'target': predictions})
submission_df.to_csv('submission.csv', index=False)

print("\nФайл 'submission.csv' успешно создан и сохранен.")

FileNotFoundError: [Errno 2] No such file or directory: 'sample_submition.csv'

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.stats import loguniform
from sklearn.model_selection import ParameterSampler

TIME_LIMIT_SECONDS = 10 * 60  # 5 минут

# 1. Data uploading and preprocessing
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')

test_ids = test_df['id']
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)
categorical_features = ['feature_34', 'feature_35', 'feature_36', 'feature_37']

train_df.replace(-999, np.nan, inplace=True)
test_df.replace(-999, np.nan, inplace=True)
numerical_features = [col for col in train_df.columns if col not in categorical_features + ['target']]
for col in numerical_features:
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    test_df[col].fillna(median_val, inplace=True)
for col in categorical_features:
    mode_val = train_df[col].mode()[0]
    train_df[col].fillna(mode_val, inplace=True)
    test_df[col].fillna(mode_val, inplace=True)
train_df = pd.get_dummies(train_df, columns=categorical_features, dummy_na=False)
test_df = pd.get_dummies(test_df, columns=categorical_features, dummy_na=False)
y = train_df['target']
X = train_df.drop('target', axis=1)
train_cols = X.columns
test_cols = test_df.columns
missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test: test_df[c] = 0
missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train: X[c] = 0
test_df = test_df[train_cols]


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_df)

# 2. Selecting best features
K_BEST_FEATURES = 40

selector = SelectKBest(score_func=f_classif, k=K_BEST_FEATURES)

X_scaled_selected = selector.fit_transform(X_scaled, y)

test_scaled_selected = selector.transform(test_scaled)

print(f"Размерность данных уменьшена с {X_scaled.shape[1]} до {X_scaled_selected.shape[1]} признаков.")


# 3. Finding parameters

param_dist = {'C': loguniform(0.01, 100)} # Убираем gamma
param_sampler = ParameterSampler(param_dist, n_iter=200, random_state=42)
cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
best_score = -1.0
best_params = None
start_time = time.time()

for i, params in enumerate(param_sampler):
    if time.time() - start_time > TIME_LIMIT_SECONDS:
        print("\nЛимит времени исчерпан.")
        break
    
    print(f"Итерация {i+1}...")
    model = SVC(kernel='linear', class_weight='balanced', **params) # Меняем ядро
    
    # Оцениваем модель на данных с отобранными признаками
    score = np.mean(cross_val_score(model, X_scaled_selected, y, cv=cv_strategy, scoring='accuracy', n_jobs=-1))
    
    if score > best_score:
        best_score = score
        best_params = params
        print(f"  Новый лучший результат: {score:.5f} с параметрами {params}")

# 4. Model training and testing
if best_params:
    final_model = SVC(kernel='rbf', class_weight='balanced', **best_params, random_state=42)
    final_model.fit(X_scaled_selected, y)
    
    predictions = final_model.predict(test_scaled_selected)
    
    submission_df = pd.DataFrame({'id': test_ids, 'target': predictions})
    submission_df.to_csv('submission_svm_feature_selection.csv', index=False)
    print("\nГотово. Файл 'submission_svm_feature_selection.csv' создан.")
else:
    print("Не удалось завершить ни одной итерации поиска.")

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb

# 1. Data uploading
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')


test_ids = test_df['id']
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)

# 2. Data preprocessing
categorical_features = ['feature_34', 'feature_35', 'feature_36', 'feature_37']

train_df.replace(-999, np.nan, inplace=True)
test_df.replace(-999, np.nan, inplace=True)

numerical_features = [col for col in train_df.columns if col not in categorical_features + ['target']]
for col in numerical_features:
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    test_df[col].fillna(median_val, inplace=True)
for col in categorical_features:
    mode_val = train_df[col].mode()[0]
    train_df[col].fillna(mode_val, inplace=True)
    test_df[col].fillna(mode_val, inplace=True)

train_df = pd.get_dummies(train_df, columns=categorical_features, dummy_na=False)
test_df = pd.get_dummies(test_df, columns=categorical_features, dummy_na=False)

y = train_df['target']
X = train_df.drop('target', axis=1)

train_cols = X.columns
test_cols = test_df.columns
missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test: test_df[c] = 0
missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train: X[c] = 0
test_df = test_df[train_cols]

# 3. Model training
print("\nНачало обучения модели XGBoost...")


xgb_clf = xgb.XGBClassifier(
    n_estimators=300,        
    learning_rate=0.05,      
    max_depth=5,             
    subsample=0.8,           
    colsample_bytree=0.8,    
    use_label_encoder=False, 
    eval_metric='logloss',   
    random_state=42,
    n_jobs=-1                
)


xgb_clf.fit(X, y)

print("Модель XGBoost успешно обучена.")

# 4. Model training
predictions = xgb_clf.predict(test_df)

submission_df = pd.DataFrame({'id': test_ids, 'target': predictions})
submission_df.to_csv('submission_xgboost.csv', index=False)

print("\nГотово. Файл 'submission_xgboost.csv' успешно создан.")

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

# 1. Data uploading
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')


test_ids = test_df['id']
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)

# 2. Data preprocessing
categorical_features = ['feature_34', 'feature_35', 'feature_36', 'feature_37']

train_df.replace(-999, np.nan, inplace=True)
test_df.replace(-999, np.nan, inplace=True)

numerical_features = [col for col in train_df.columns if col not in categorical_features + ['target']]
for col in numerical_features:
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    test_df[col].fillna(median_val, inplace=True)
for col in categorical_features:
    mode_val = train_df[col].mode()[0]
    train_df[col].fillna(mode_val, inplace=True)
    test_df[col].fillna(mode_val, inplace=True)

train_df = pd.get_dummies(train_df, columns=categorical_features, dummy_na=False)
test_df = pd.get_dummies(test_df, columns=categorical_features, dummy_na=False)

y = train_df['target']
X = train_df.drop('target', axis=1)

train_cols = X.columns
test_cols = test_df.columns
missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test: test_df[c] = 0
missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train: X[c] = 0
test_df = test_df[train_cols]

# 3. Models training

# --- Model 1: LightGBM ---
print("\nОбучение модели LightGBM...")
lgbm_clf = lgb.LGBMClassifier(random_state=42, n_estimators=200, learning_rate=0.05, num_leaves=31)
lgbm_clf.fit(X, y)
print("Модель LightGBM обучена.")

# --- Model 2: XGBoost ---
print("\nОбучение модели XGBoost...")
xgb_clf = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb_clf.fit(X, y)
print("Модель XGBoost обучена.")


# 4. Analyzing results from 2 models

lgbm_probs = lgbm_clf.predict_proba(test_df)[:, 1]
xgb_probs = xgb_clf.predict_proba(test_df)[:, 1]

averaged_probs = (lgbm_probs + xgb_probs) / 2
print("Вероятности усреднены.")


# Final prob > 0.5 -> 1, in other way -> 0
final_predictions = (averaged_probs > 0.5).astype(int)

submission_df = pd.DataFrame({'id': test_ids, 'target': final_predictions})
submission_df.to_csv('submission_ensemble_lgbm_xgb.csv', index=False)

print("\nГотово. Файл 'submission_ensemble_lgbm_xgb.csv' успешно создан.")

In [2]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import ParameterSampler

TIME_LIMIT_SECONDS = 10 * 60 
MAX_ITERATIONS = 50          
# 1. Data uploading
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')

test_ids = test_df['id']
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)
categorical_features = ['feature_34', 'feature_35', 'feature_36', 'feature_37']

train_df.replace(-999, np.nan, inplace=True)
test_df.replace(-999, np.nan, inplace=True)
numerical_features = [col for col in train_df.columns if col not in categorical_features + ['target']]
for col in numerical_features:
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    test_df[col].fillna(median_val, inplace=True)
for col in categorical_features:
    mode_val = train_df[col].mode()[0]
    train_df[col].fillna(mode_val, inplace=True)
    test_df[col].fillna(mode_val, inplace=True)
train_df = pd.get_dummies(train_df, columns=categorical_features, dummy_na=False)
test_df = pd.get_dummies(test_df, columns=categorical_features, dummy_na=False)
y = train_df['target']
X = train_df.drop('target', axis=1)
train_cols = X.columns
test_cols = test_df.columns
missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test: test_df[c] = 0
missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train: X[c] = 0
test_df = test_df[train_cols]

# 2. Finding parameters for LightGBM

param_dist = {
    'n_estimators': [500, 700, 1000],       
    'learning_rate': [0.01, 0.02, 0.03],    
    'num_leaves': [25, 31, 35],             
    'max_depth': [-1, 10, 15],
    'subsample': [0.8, 0.85, 0.9],
    'colsample_bytree': [0.8, 0.85, 0.9]
}

param_sampler = ParameterSampler(param_dist, n_iter=MAX_ITERATIONS, random_state=42)
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_score = -1.0
best_params = None
start_time = time.time()

for i, params in enumerate(param_sampler):
    if time.time() - start_time > TIME_LIMIT_SECONDS:
        print("\nЛимит времени исчерпан.")
        break

    print(f"\nИтерация {i + 1}/{MAX_ITERATIONS} (прошло {time.time() - start_time:.0f} сек)")
    print(f"  Пробуем параметры: {params}")
    
    model = lgb.LGBMClassifier(random_state=42, n_jobs=-1, **params)
    
    score = np.mean(cross_val_score(model, X, y, cv=cv_strategy, scoring='accuracy'))
    
    print(f"  Accuracy на CV: {score:.5f}")

    if score > best_score:
        best_score = score
        best_params = params

# 3. Model training
if best_params:
    print("\nПоиск завершен")
    print(f"Лучший результат на кросс-валидации: {best_score:.5f}")
    print(f"Лучшие найденные параметры: {best_params}")

    print("\nОбучение финальной модели...")
    final_model = lgb.LGBMClassifier(random_state=42, n_jobs=-1, **best_params)
    final_model.fit(X, y)
    
    predictions = final_model.predict(test_df)
    
    submission_df = pd.DataFrame({'id': test_ids, 'target': predictions})
    submission_df.to_csv('submission_lgbm_final_tuned.csv', index=False)
    print("\nГотово. Файл 'submission_lgbm_final_tuned.csv' создан.")
else:
    print("Не удалось завершить ни одной итерации.")


Итерация 1/50 (прошло 0 сек)
  Пробуем параметры: {'subsample': 0.8, 'num_leaves': 25, 'n_estimators': 700, 'max_depth': 15, 'learning_rate': 0.03, 'colsample_bytree': 0.85}
[LightGBM] [Info] Number of positive: 3976, number of negative: 6786
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7352
[LightGBM] [Info] Number of data points in the train set: 10762, number of used features: 101
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.369448 -> initscore=-0.534585
[LightGBM] [Info] Start training from score -0.534585
[LightGBM] [Info] Number of positive: 3976, number of negative: 6786
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7352
[LightGBM] [Info] Number of data points in the train set: 10762, nu