In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, ParameterGrid

# Load the training and testing datasets
df_train = pd.read_parquet('G:/Facultate/Practica/child-mind-institute-detect-sleep-states/train_series1.parquet')
df_test = pd.read_parquet('test_series_sub2.parquet')

# Define the features available in the training dataset
features = ['mean_enmo1', 'mean_anglez1', 'std_enmo1', 'std_anglez1', 'cv_enmo1', 'cv_anglez1', 'skewness_enmo1', 'skewness_anglez1', 'kurtosis_enmo1', 'kurtosis_anglez1', 'median_enmo1', 'median_anglez1', 'min_enmo1', 'min_anglez1', 'max_enmo1', 'max_anglez1', 'corr_enmo_anglez']

# Check if all features exist in the training dataset
existing_features = [feature for feature in features if feature in df_train.columns]

# Separate features and target variable for training dataset
X = df_train[existing_features]
y = df_train['event']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill missing values
X_train = X_train.fillna(X_train.mean())
X_val = X_val.fillna(X_val.mean())
X_test = df_test[existing_features].fillna(df_test[existing_features].mean())

# Apply SMOTE to the training dataset
smote = SMOTE()
X_res, y_res = smote.fit_resample(X_train, y_train)

# Define parameter grids for both classifiers
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Function to evaluate a model
def evaluate_model(model, X, y):
    predictions = model.predict(X)
    accuracy = accuracy_score(y, predictions)
    precision = precision_score(y, predictions, average='weighted')
    recall = recall_score(y, predictions, average='weighted')
    f1 = f1_score(y, predictions, average='weighted')
    return accuracy, precision, recall, f1

# Function to perform grid search without cross-validation
def grid_search_no_cv(clf, param_grid, X_train, y_train, X_val, y_val):
    best_score = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        clf.set_params(**params)
        clf.fit(X_train, y_train)
        accuracy, precision, recall, f1 = evaluate_model(clf, X_val, y_val)
        if accuracy > best_score:
            best_score = accuracy
            best_params = params
    return best_params, best_score

# Perform grid search for RandomForestClassifier
best_rf_params, best_rf_score = grid_search_no_cv(RandomForestClassifier(), rf_param_grid, X_res, y_res, X_val, y_val)
best_rf_clf = RandomForestClassifier(**best_rf_params)
best_rf_clf.fit(X_res, y_res)

# Perform grid search for GradientBoostingClassifier
best_gb_params, best_gb_score = grid_search_no_cv(GradientBoostingClassifier(), gb_param_grid, X_res, y_res, X_val, y_val)
best_gb_clf = GradientBoostingClassifier(**best_gb_params)
best_gb_clf.fit(X_res, y_res)

print("Best Random Forest Parameters:", best_rf_params)
print("Best Gradient Boosting Parameters:", best_gb_params)

# Make predictions on the test dataset using the best estimators
rf_pred = best_rf_clf.predict(X_test)
gb_pred = best_gb_clf.predict(X_test)

# Add the predictions to the test dataset
df_test['rf_predicted_event'] = rf_pred
df_test['gb_predicted_event'] = gb_pred

# Save the test dataset with predictions
df_test.to_parquet('test_series_with_predictions3.parquet')

# Optionally, print the first few rows of the test dataset with predictions
#print(df_test.head())

# Random Forest Scores
rf_predictions = best_rf_clf.predict(X_train)
rf_accuracy = accuracy_score(y_train, rf_predictions)
rf_precision = precision_score(y_train, rf_predictions, average='weighted')
rf_recall = recall_score(y_train, rf_predictions, average='weighted')
rf_f1 = f1_score(y_train, rf_predictions, average='weighted')

print("Random Forest Scores:")
print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")
print(f"F1 Score: {rf_f1}")

# Gradient Boosting Scores
gb_predictions = best_gb_clf.predict(X_train)
gb_accuracy = accuracy_score(y_train, gb_predictions)
gb_precision = precision_score(y_train, gb_predictions, average='weighted')
gb_recall = recall_score(y_train, gb_predictions, average='weighted')
gb_f1 = f1_score(y_train, gb_predictions, average='weighted')

print("Gradient Boosting Scores:")
print(f"Accuracy: {gb_accuracy}")
print(f"Precision: {gb_precision}")
print(f"Recall: {gb_recall}")
print(f"F1 Score: {gb_f1}")

Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Gradient Boosting Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest Scores:
Accuracy: 0.9046080681054025
Precision: 0.9048673401891056
Recall: 0.9046080681054025
F1 Score: 0.9046867797298302
Gradient Boosting Scores:
Accuracy: 0.5947784009372812
Precision: 0.6051094934830182
Recall: 0.5947784009372812
F1 Score: 0.5959270609892057
