In [3]:
!pip install xgboost
!pip install -U scikit-learn
!pip install -U imbalanced-learn

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/150.0 MB 2.4 MB/s eta 0:01:04
   ---------------------------------------- 1.0/150.0 MB 1.9 MB/s eta 0:01:21
   ---------------------------------------- 1.8/150.0 MB 2.3 MB/s eta 0:01:04
    --------------------------------------- 2.4/150.0 MB 2.4 MB/s eta 0:01:03
    --------------------------------------- 2.9/150.0 MB 2.4 MB/s eta 0:01:02
    --------------------------------------- 3.1/150.0 MB 2.4 MB/s eta 0:01:02
    --------------------------------------- 3.7/150.0 MB 2.4 MB/s eta 0:01:02
   - -------------------------------------- 4.2/150.0 MB 2.4 MB/s eta 0:01:02
   - -------------------------------------- 4.7/150.0 MB 2.4 MB/s eta 0:01:01
   - ---

In [16]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
import xgboost as xgb

# Define the correct file paths for the engineered datasets
file_paths = [
    r"C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_all_outliers_dropped.csv",
    r"C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_2_or_more_outliers_dropped.csv",
    r"C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_3_or_more_outliers_dropped.csv"
]

# Create directories for saving results
plots_dir = r"C:\Users\reicd\Downloads\MY_ML_PROJECT\plots\F1 score default algorithms"
os.makedirs(plots_dir, exist_ok=True)

results_dir = r"C:\Users\reicd\Downloads\MY_ML_PROJECT\tests\f1 scores default table"
os.makedirs(results_dir, exist_ok=True)

# List of models to evaluate
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "Support Vector Machine": SVC(),
    "Extra Trees": ExtraTreesClassifier(),
    "Naive Bayes": GaussianNB(),
    "Perceptron": Perceptron()
}

# Initialize results dictionary
f1_scores = {model: [] for model in models.keys()}

# Loop through each dataset
for file_path in file_paths:
    # Read the dataset
    df = pd.read_csv(file_path)

    # Print the column names to check if 'Phase' is present
    print(f"Columns in {file_path}: {df.columns.tolist()}")

    # Set the target column to 'Phase'
    target_column = 'Phase'
    
    # Check unique values in target column
    unique_classes = df[target_column].unique()
    print(f"Unique classes in {target_column}: {unique_classes}")

    # Remap the target classes if needed (map to 0-based integer labels)
    df[target_column] = pd.factorize(df[target_column])[0]

    # Split features and target variable
    X = df.drop(columns=target_column)
    y = df[target_column]

    # Initialize KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)  # You can change n_splits to 10 for 10-fold

    # Evaluate each model using k-fold cross-validation
    for model_name, model in models.items():
        f1_fold_scores = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            f1 = f1_score(y_test, y_pred, average='weighted')  # use 'weighted' F1 score
            f1_fold_scores.append(f1)

        # Store the average F1 score across folds
        f1_scores[model_name].append(sum(f1_fold_scores) / len(f1_fold_scores))

# Create a DataFrame for results
f1_scores_df = pd.DataFrame(f1_scores, index=[f'Dataset {i + 1}' for i in range(len(file_paths))])

# Save F1 scores table
table_file_path = os.path.join(results_dir, "f1_scores_kfold.csv")
f1_scores_df.to_csv(table_file_path)

print(f"F1 scores with k-fold cross-validation saved at: {table_file_path}")

Columns in C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_all_outliers_dropped.csv: ['lhx', 'lhy', 'rhx', 'rhy', 'hy', 'hz', 'sy', 'timestamp', '1', '3', '4', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', 'Phase', 'Subject_A', 'Subject_B', 'Subject_C', 'Story_1', 'Story_2', 'Story_3', 'sy_timestamp_product', 'sy_timestamp_sum', 'sy_timestamp_difference', 'sy_rhx_product', 'sy_rhx_sum', 'sy_rhx_difference', 'sy_rhy_product', 'sy_rhy_sum', 'sy_rhy_difference', 'sy_hz_product', 'sy_hz_sum', 'sy_hz_difference', 'sy_lhx_product', 'sy_lhx_sum', 'sy_lhx_difference', 'sy_lhy_product', 'sy_lhy_sum', 'sy_lhy_difference', 'timestamp_rhx_product', 'timestamp_rhx_sum', 'timestamp_rhx_difference', 'timestamp_rhy_product', 'timestamp_rhy_sum', 'timestamp_rhy_difference', 'timestamp_hz_product', 'timestamp_hz_sum', 'timestamp_hz_difference', 'timestamp_lhx_prod

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Columns in C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_2_or_more_outliers_dropped.csv: ['lhx', 'lhy', 'rhx', 'rhy', 'hy', 'hz', 'sy', 'timestamp', '1', '3', '4', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', 'Phase', 'Subject_A', 'Subject_B', 'Subject_C', 'Story_1', 'Story_2', 'Story_3', 'sy_timestamp_product', 'sy_timestamp_sum', 'sy_timestamp_difference', 'sy_rhx_product', 'sy_rhx_sum', 'sy_rhx_difference', 'sy_rhy_product', 'sy_rhy_sum', 'sy_rhy_difference', 'sy_hz_product', 'sy_hz_sum', 'sy_hz_difference', 'sy_lhx_product', 'sy_lhx_sum', 'sy_lhx_difference', 'sy_lhy_product', 'sy_lhy_sum', 'sy_lhy_difference', 'timestamp_rhx_product', 'timestamp_rhx_sum', 'timestamp_rhx_difference', 'timestamp_rhy_product', 'timestamp_rhy_sum', 'timestamp_rhy_difference', 'timestamp_hz_product', 'timestamp_hz_sum', 'timestamp_hz_difference', 'timestamp_lh

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Columns in C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_3_or_more_outliers_dropped.csv: ['lhx', 'lhy', 'rhx', 'rhy', 'hy', 'hz', 'sy', 'timestamp', '1', '3', '4', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', 'Phase', 'Subject_A', 'Subject_B', 'Subject_C', 'Story_1', 'Story_2', 'Story_3', 'sy_timestamp_product', 'sy_timestamp_sum', 'sy_timestamp_difference', 'sy_rhx_product', 'sy_rhx_sum', 'sy_rhx_difference', 'sy_rhy_product', 'sy_rhy_sum', 'sy_rhy_difference', 'sy_hz_product', 'sy_hz_sum', 'sy_hz_difference', 'sy_lhx_product', 'sy_lhx_sum', 'sy_lhx_difference', 'sy_lhy_product', 'sy_lhy_sum', 'sy_lhy_difference', 'timestamp_rhx_product', 'timestamp_rhx_sum', 'timestamp_rhx_difference', 'timestamp_rhy_product', 'timestamp_rhy_sum', 'timestamp_rhy_difference', 'timestamp_hz_product', 'timestamp_hz_sum', 'timestamp_hz_difference', 'timestamp_lh

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


F1 scores with k-fold cross-validation saved at: C:\Users\reicd\Downloads\MY_ML_PROJECT\tests\f1 scores default table\f1_scores_kfold.csv


In [32]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Define the dataset paths
file_paths = [
    r"C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_all_outliers_dropped.csv",
    r"C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_2_or_more_outliers_dropped.csv",
    r"C:\Users\reicd\Downloads\MY_ML_PROJECT\data\engineered data\prepared_narrowed_3_or_more_outliers_dropped.csv"
]

# Assuming f1_scores has been populated correctly
# Here f1_scores = { "Model A": [score1, score2, score3], "Model B": [score1, score2, score3], ... }

# Simplified filenames for x-axis labels
simplified_labels = [os.path.basename(path).replace("prepared_narrowed_", "").replace(".csv", "") for path in file_paths]

# Create a DataFrame from the f1_scores
f1_scores_df = pd.DataFrame(f1_scores, index=simplified_labels)

# Print F1 scores DataFrame for verification
print(f1_scores_df)

# Define the directory to save the plot
plots_dir = r"C:\Users\reicd\Downloads\MY_ML_PROJECT\plots\F1 score default algorithms"

# Ensure the plots directory exists
os.makedirs(plots_dir, exist_ok=True)

# Plotting the F1 Scores for Each Model for Each Dataset
fig, ax = plt.subplots(figsize=(15, 8))  # Increased figure size for better visibility
f1_scores_df.plot(kind='bar', width=0.8, ax=ax)

# Adding labels on top of the bars
for p in ax.patches:
    ax.annotate(f"{p.get_height():.2f}", 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='bottom', fontsize=10)

# Improve the aesthetics
plt.title('Average F1 Scores for Each Model Across Datasets', fontsize=18)
plt.xlabel('Datasets', fontsize=14)
plt.ylabel('Average F1 Score', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.ylim(0, 1)  # Set y-axis limit to match F1 score range
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Adjusting the legend position outside the plot area
ax.legend(title='Models', loc='upper left', bbox_to_anchor=(1, 1), fontsize=10)

plt.tight_layout()  # Adjust layout to make sure everything fits without overlapping

# Save the chart
average_chart_file_path = os.path.join(plots_dir, "average_f1_scores_per_dataset_labeled.png")
plt.savefig(average_chart_file_path, bbox_inches='tight')
plt.close()

print(f"Bar chart of average F1 scores for each model across datasets saved at: {average_chart_file_path}")

                            Decision Tree  Random Forest  Gradient Boosting  \
all_outliers_dropped             0.808182       0.916457           0.842582   
2_or_more_outliers_dropped       0.805392       0.918214           0.833996   
3_or_more_outliers_dropped       0.814209       0.921071           0.834963   

                             XGBoost  Support Vector Machine  Extra Trees  \
all_outliers_dropped        0.913766                0.734062      0.92716   
2_or_more_outliers_dropped  0.917964                0.724028      0.92899   
3_or_more_outliers_dropped  0.920728                0.719386      0.93237   

                            Naive Bayes  Perceptron  
all_outliers_dropped           0.531379    0.583712  
2_or_more_outliers_dropped     0.517191    0.548322  
3_or_more_outliers_dropped     0.516871    0.556858  
Bar chart of average F1 scores for each model across datasets saved at: C:\Users\reicd\Downloads\MY_ML_PROJECT\plots\F1 score default algorithms\average_f1_sc

In [14]:
!pip install joblib




In [22]:
import os
import joblib  # Library to save models

# Define the directory to save models
models_dir = r"C:\Users\reicd\Downloads\MY_ML_PROJECT\models"
default_models_dir = os.path.join(models_dir, "default")

# Ensure the "default" models directory exists
os.makedirs(default_models_dir, exist_ok=True)

# Save each trained model in the "default" folder
for model_name, model in models.items():
    # Create a file name based on model name
    model_file_path = os.path.join(default_models_dir, f"default_run_{model_name}.joblib")
    
    # Save the model
    joblib.dump(model, model_file_path)
    
    # Print confirmation
    print(f"Saved {model_name} model at: {model_file_path}")

Saved Decision Tree model at: C:\Users\reicd\Downloads\MY_ML_PROJECT\models\default\default_run_Decision Tree.joblib
Saved Random Forest model at: C:\Users\reicd\Downloads\MY_ML_PROJECT\models\default\default_run_Random Forest.joblib
Saved Gradient Boosting model at: C:\Users\reicd\Downloads\MY_ML_PROJECT\models\default\default_run_Gradient Boosting.joblib
Saved XGBoost model at: C:\Users\reicd\Downloads\MY_ML_PROJECT\models\default\default_run_XGBoost.joblib
Saved Support Vector Machine model at: C:\Users\reicd\Downloads\MY_ML_PROJECT\models\default\default_run_Support Vector Machine.joblib
Saved Extra Trees model at: C:\Users\reicd\Downloads\MY_ML_PROJECT\models\default\default_run_Extra Trees.joblib
Saved Naive Bayes model at: C:\Users\reicd\Downloads\MY_ML_PROJECT\models\default\default_run_Naive Bayes.joblib
Saved Perceptron model at: C:\Users\reicd\Downloads\MY_ML_PROJECT\models\default\default_run_Perceptron.joblib
