In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# Load the training and testing datasets
df_train = pd.read_parquet('G:/Facultate/Practica/child-mind-institute-detect-sleep-states/train_series1.parquet')
df_test = pd.read_parquet('test_series_sub2.parquet')

# Define the features available in the training dataset
features = ['mean_enmo1', 'mean_anglez1', 'std_enmo1', 'std_anglez1', 'cv_enmo1', 'cv_anglez1', 'skewness_enmo1', 'skewness_anglez1', 'kurtosis_enmo1', 'kurtosis_anglez1', 'median_enmo1', 'median_anglez1', 'min_enmo1', 'min_anglez1', 'max_enmo1', 'max_anglez1', 'corr_enmo_anglez']

# Check if all features exist in the training dataset
existing_features = [feature for feature in features if feature in df_train.columns]

# Separate features and target variable for training dataset
X_train = df_train[existing_features]
y_train = df_train['event']

X_train = X_train.fillna(X_train.mean())
# Separate features for testing dataset
X_test = df_test[existing_features]

# Apply SMOTE to the training dataset
smote = SMOTE()
X_res, y_res = smote.fit_resample(X_train, y_train)

# Define parameter grids for both classifiers
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize the classifiers
rf_clf = RandomForestClassifier()
gb_clf = GradientBoostingClassifier()

# Initialize GridSearchCV for both classifiers
rf_grid_search = GridSearchCV(estimator=rf_clf, param_grid=rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
gb_grid_search = GridSearchCV(estimator=gb_clf, param_grid=gb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV to the resampled training dataset
rf_grid_search.fit(X_res, y_res)
gb_grid_search.fit(X_res, y_res)

# Retrieve the best parameters and scores
best_rf_clf = rf_grid_search.best_estimator_
best_gb_clf = gb_grid_search.best_estimator_

print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print("Best Gradient Boosting Parameters:", gb_grid_search.best_params_)

# Make predictions on the test dataset using the best estimators
rf_pred = best_rf_clf.predict(X_test)
gb_pred = best_gb_clf.predict(X_test)

# Add the predictions to the test dataset
df_test['rf_predicted_event'] = rf_pred
df_test['gb_predicted_event'] = gb_pred

# Save the test dataset with predictions
df_test.to_parquet('test_series_with_predictions3.parquet')

# Optionally, print the first few rows of the test dataset with predictions
#print(df_test.head())

# Random Forest Scores
rf_predictions = best_rf_clf.predict(X_train)
rf_accuracy = accuracy_score(y_train, rf_predictions)
rf_precision = precision_score(y_train, rf_predictions, average='weighted')
rf_recall = recall_score(y_train, rf_predictions, average='weighted')
rf_f1 = f1_score(y_train, rf_predictions, average='weighted')

print("Random Forest Scores:")
print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")
print(f"F1 Score: {rf_f1}")

# Gradient Boosting Scores
gb_predictions = best_gb_clf.predict(X_train)
gb_accuracy = accuracy_score(y_train, gb_predictions)
gb_precision = precision_score(y_train, gb_predictions, average='weighted')
gb_recall = recall_score(y_train, gb_predictions, average='weighted')
gb_f1 = f1_score(y_train, gb_predictions, average='weighted')

print("Gradient Boosting Scores:")
print(f"Accuracy: {gb_accuracy}")
print(f"Precision: {gb_precision}")
print(f"Recall: {gb_recall}")

Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Gradient Boosting Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest Scores:
Accuracy: 0.9045630065096675
Precision: 0.9048436046615747
Recall: 0.9045630065096675
F1 Score: 0.9046461168325169
Gradient Boosting Scores:
Accuracy: 0.5953988644477874
Precision: 0.6056284494680293
Recall: 0.5953988644477874
