In [20]:
def _imputation_values(X_train, strategy):
    """Return per-column fill values computed from the training rows only, or None if there are none."""
    if strategy == "mean":
        values = X_train.mean(numeric_only=True)
    elif strategy == "median":
        values = X_train.median(numeric_only=True)
    elif strategy == "most_frequent":
        modes = X_train.mode(dropna=True)
        if len(modes) == 0:
            return None
        # mode() yields one row per tied value; the first row is used for every column.
        values = modes.iloc[0]
    else:
        raise ValueError(
            f"Unsupported imputer_strategy {strategy!r}; "
            "expected 'mean', 'median' or 'most_frequent'"
        )

    # Columns with no usable statistic (e.g. entirely missing in the training rows) are left as-is.
    values = values.dropna()
    return values if len(values) > 0 else None


def preprocess_data_linear_features(data, feature_prefix, target_column, test_ratio=0.25, imputer_strategy="median"):
    """
    Select feature columns, split the rows in order into train/test sets, and impute missing values.

    No polynomial feature augmentation is applied; the selected columns are used as-is.

    Parameters:
    - data: pandas DataFrame holding both the feature columns and the target column.
    - feature_prefix: Text identifying feature columns. Every string column name that
      contains this text is selected (substring match, not strictly a prefix). The
      target column is never selected, even if its name matches.
    - target_column: Name of the target column; its values are cast to bool.
    - test_ratio: Fraction of rows (between 0 and 1) for the test set. The first
      int((1 - test_ratio) * len(data)) rows form the training set and the remaining
      rows form the test set; rows are not shuffled.
    - imputer_strategy: 'mean', 'median' or 'most_frequent'. Only consulted when the
      selected features contain missing values.

    Returns:
    - X_train, y_train: Training data and labels
    - X_test, y_test: Test data and labels
    (returned in the order X_train, y_train, X_test, y_test)

    Raises:
    - ValueError: if test_ratio is outside [0, 1], if no feature column matches, or if
      imputation is needed and imputer_strategy is not supported.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # Extract features and target variable. The target is excluded explicitly so a
    # label whose name happens to match the feature text cannot leak into X.
    feature_cols = [
        col for col in data.columns
        if isinstance(col, str) and feature_prefix in col and col != target_column
    ]
    if not feature_cols:
        raise ValueError(f"No feature columns containing {feature_prefix!r} were found")

    X = data[feature_cols]
    y = data[target_column].astype(bool)

    # Split the data into training and test sets (positional, order-preserving).
    train_size = int((1 - test_ratio) * len(data))
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    # Impute missing values. Statistics come from the training rows only and are then
    # applied to both splits, so nothing from the test set influences the fill values.
    if X.isna().any().any():
        fill_values = _imputation_values(X_train, imputer_strategy)
        if fill_values is not None:
            X_train = X_train.fillna(fill_values)
            X_test = X_test.fillna(fill_values)

    return X_train, y_train, X_test, y_test
def train_and_evaluate_classifier(X_train, y_train, X_test, y_test, classifier, imputer_strategy="median"):
    """
    Train a classifier inside an impute -> scale -> classify pipeline and score it on the test set.

    Parameters:
    - X_train, y_train: Training data and labels
    - X_test, y_test: Test data and labels
    - classifier: The machine learning classifier to be trained (used as the final pipeline step)
    - imputer_strategy: Strategy passed to SimpleImputer ('mean', 'median',
      'most_frequent' or 'constant'). The imputer is fitted on the training data only
      and then applied to both training and test features.

    Returns:
    - accuracy: Accuracy of the classifier on the test set
    - report: Classification report (text, with zero_division=0)
    - conf_matrix: Confusion matrix
    """
    # Imported locally so this function does not depend on the notebook's import cell
    # listing the imputer.
    from sklearn.impute import SimpleImputer

    # Build the pipeline: imputation first so the scaler and classifier never see
    # missing values, then standardisation, then the classifier itself.
    pipeline = make_pipeline(
        SimpleImputer(strategy=imputer_strategy),
        StandardScaler(),
        classifier,
    )

    # Convert target variables to boolean type
    y_train = y_train.astype(bool)
    y_test = y_test.astype(bool)

    # Fit every pipeline step on the training data only
    pipeline.fit(X_train, y_train)

    # Predict on the test set
    y_pred = pipeline.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)

    return accuracy, report, conf_matrix

import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset and discard every row that has a missing value, so the features
# passed on below contain no NaNs.
data = pd.read_csv("eth1h_withlabel_andfeatures.csv").dropna()

# Ordered split (no shuffling, no polynomial feature augmentation): columns whose
# name contains "Feature" are the inputs, "label" is the target, and the last 25%
# of the rows are held out for testing.
X_train_linear, y_train, X_test_linear, y_test = preprocess_data_linear_features(
    data,
    feature_prefix="Feature",
    target_column="label",
    test_ratio=0.25,
)

# Fit an SVC with default settings on the linear features and score it on the held-out rows.
accuracy_linear, report_linear, conf_matrix_linear = train_and_evaluate_classifier(
    X_train=X_train_linear,
    y_train=y_train,
    X_test=X_test_linear,
    y_test=y_test,
    classifier=SVC(),
)

# Display accuracy, the classification report and the confusion matrix as the cell output.
accuracy_linear, report_linear, conf_matrix_linear


(0.6346300533943554,
 '              precision    recall  f1-score   support\n\n       False       0.63      0.94      0.76       791\n        True       0.65      0.17      0.28       520\n\n    accuracy                           0.63      1311\n   macro avg       0.64      0.56      0.52      1311\nweighted avg       0.64      0.63      0.57      1311\n',
 array([[741,  50],
        [429,  91]], dtype=int64))

In [17]:
# NOTE(review): this cell's execution count (17) is lower than the training cell's (20),
# and the saved output below differs from the confusion matrix shown for that cell, so it
# appears to come from an earlier run. Re-run this cell after the training cell to refresh it.
conf_matrix_linear

array([[888, 119],
       [467,  99]], dtype=int64)

In [2]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.3.0-cp39-cp39-win_amd64.whl (9.3 MB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.3.0 threadpoolctl-3.2.0
