In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, 
    classification_report, mean_squared_error, r2_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
import joblib  # For saving models
from notebooks.functions import BasicDataDHandelFunctions
# Configure logging
# Root logger at INFO level; every record is rendered as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# For handling warnings
import warnings
# NOTE(review): 'ignore' with no category/module filter silences every warning
# raised after this point, including ones emitted by the libraries used below.
# Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# SEED is passed as random_state to the splits and estimators in later cells;
# np.random.seed additionally seeds NumPy's global random state.
SEED = 42
np.random.seed(SEED)


In [None]:
# Relative path of the CSV file to load.
file_path = '../data/sample.csv'
# Load through the project helper imported from notebooks.functions.
# NOTE(review): a later cell reassigns both `file_path` and `df` using the
# notebook-local load_dataset — confirm which of the two loads is intended.
df = BasicDataDHandelFunctions.load_dataset(file_path)

In [None]:
# Load Dataset with Error Handling
def load_dataset(file_path):
    """Read a CSV file into a DataFrame, logging the outcome.

    Args:
        file_path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame`` on success, or ``None`` when the file
        is missing or any other exception occurs while reading. Failures are
        logged at ERROR level rather than raised, so callers must check for
        ``None``.
    """
    try:
        dataset = pd.read_csv(file_path)
    except FileNotFoundError:
        logging.error(f"File not found at {file_path}")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        logging.error(f"An error occurred: {e}")
        return None
    else:
        logging.info(f"Dataset loaded successfully from {file_path}")
        return dataset

# Replace with your dataset path or URL
# NOTE(review): these two assignments rebind `file_path` and `df`, replacing
# whatever an earlier cell stored in them. While the placeholder path below is
# left unchanged (and no file exists there), load_dataset logs an error and
# `df` becomes None.
file_path = 'path/to/your_dataset.csv'
df = load_dataset(file_path)

# Display the first few rows if the dataset was loaded successfully
# `display` is not imported in this notebook; presumably the IPython-provided
# builtin — confirm if this code is ever run outside a notebook kernel.
if df is not None:
    display(df.head())


In [None]:
# Data Preparation - Splitting the Data
def prepare_data(df, target_column, test_size=0.2, val_size=0.1):
    """Split a DataFrame into train, validation, and test features/targets.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_column: Name of the column to predict.
        test_size: Share of all rows held out for the test set.
        val_size: Share of all rows held out for the validation set
            (assumes both sizes are fractions of the full dataset).

    Returns:
        A 6-tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        When ``target_column`` is not a column of ``df`` an error is logged
        and a tuple of six ``None`` values is returned instead.
    """
    # Guard clause: nothing can be split without the target column.
    if target_column not in df.columns:
        logging.error(f"Target column '{target_column}' not found in DataFrame.")
        return (None,) * 6

    features = df.drop(columns=[target_column])
    labels = df[target_column]

    # Stage 1: carve the test set off the full data.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=SEED
    )

    # Stage 2: val_size refers to the full dataset, so rescale it to a
    # fraction of the rows that remain after the test split.
    val_fraction = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_fraction, random_state=SEED
    )

    logging.info("Data split into train, validation, and test sets.")
    return X_train, X_val, X_test, y_train, y_val, y_test

# Specify the target column
target_column = 'target'  # Replace with your actual target column

# Pre-bind every split to None so the `is not None` guards in the following
# cells skip cleanly (instead of raising NameError) when the dataset failed to
# load and prepare_data is therefore never called.
X_train = X_val = X_test = y_train = y_val = y_test = None

if df is not None:
    X_train, X_val, X_test, y_train, y_val, y_test = prepare_data(df, target_column)


In [None]:
# Model Selection - Choosing Classifier Models
def get_models():
    """Build the candidate classifiers, keyed by a human-readable name.

    Returns:
        dict mapping display name to a fresh, unfitted estimator. The random
        forest and the SVM receive ``random_state=SEED``; the logistic
        regression is created with its default arguments.
    """
    candidates = {}
    candidates['Logistic Regression'] = LogisticRegression()
    candidates['Random Forest'] = RandomForestClassifier(random_state=SEED)
    candidates['Support Vector Machine'] = SVC(random_state=SEED)
    return candidates

# Get the models
# Instantiate the candidate estimators once; this dict is what the training
# cell iterates over.
models = get_models()
# Log the display names so the run log records which candidates were compared.
logging.info(f"Available models: {list(models.keys())}")


In [None]:
# Train and Evaluate Models
def train_and_evaluate(models, X_train, X_val, y_train, y_val):
    """Fit each candidate on the training split and score it on validation.

    Args:
        models: Mapping of display name to estimator exposing ``fit`` and
            ``predict``. Each estimator is fitted in place.
        X_train, y_train: Training features and targets.
        X_val, y_val: Validation features and targets.

    Returns:
        A DataFrame with one row per model and the columns ``Model``,
        ``Accuracy``, ``Precision``, ``Recall`` and ``F1 Score``. Precision,
        recall and F1 use ``average='weighted'``. An empty mapping yields an
        empty DataFrame.
    """
    def _fit_and_score(name, estimator):
        # Train on the training split, then predict the validation split.
        estimator.fit(X_train, y_train)
        val_predictions = estimator.predict(X_val)

        acc = accuracy_score(y_val, val_predictions)
        # The three class-weighted metrics share one call signature.
        precision, recall, f1 = (
            scorer(y_val, val_predictions, average='weighted')
            for scorer in (precision_score, recall_score, f1_score)
        )

        logging.info(f"{name} - Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

        return {
            'Model': name,
            'Accuracy': acc,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
        }

    return pd.DataFrame(
        [_fit_and_score(name, estimator) for name, estimator in models.items()]
    )

# Compare the candidates only when the training and validation splits are not
# None, then render the per-model metrics table.
if X_train is not None and X_val is not None:
    results_df = train_and_evaluate(models, X_train, X_val, y_train, y_val)
    display(results_df)


In [None]:
# Hyperparameter Tuning with Grid Search
def hyperparameter_tuning(model, param_grid, X_train, y_train, cv=5, scoring='accuracy'):
    """Performs hyperparameter tuning using Grid Search.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter name to the list of values to try.
        X_train: Training features used for the cross-validated search.
        y_train: Training targets.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (defaults to 5, the previously hard-coded value).
        scoring: Metric forwarded to ``GridSearchCV`` to rank candidates
            (defaults to ``'accuracy'``, the previously hard-coded value).

    Returns:
        ``best_estimator_`` of the fitted search, i.e. the estimator built
        with the best-scoring parameter combination.
    """
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, verbose=1)
    grid_search.fit(X_train, y_train)

    logging.info(f"Best Parameters: {grid_search.best_params_}")
    # Record the winning cross-validated score so the choice can be audited.
    logging.info(f"Best CV score ({scoring}): {grid_search.best_score_:.4f}")
    return grid_search.best_estimator_

# Example: Tuning Random Forest
# 3 x 3 x 3 = 27 parameter combinations are searched; each one is
# cross-validated inside hyperparameter_tuning, so this cell can be slow.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# `best_rf` is only bound when a training split exists; later cells that use
# it must be guarded accordingly.
if X_train is not None:
    best_rf = hyperparameter_tuning(RandomForestClassifier(random_state=SEED), rf_param_grid, X_train, y_train)


In [None]:
# Evaluate the Best Model on Test Data
def evaluate_on_test(model, X_test, y_test):
    """Report how a fitted model performs on the held-out test split.

    Prints the classification report and shows the confusion matrix as an
    annotated heatmap. Nothing is returned.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True test targets.
    """
    predictions = model.predict(X_test)

    print("Classification Report:")
    print(classification_report(y_test, predictions))

    # Draw the confusion matrix on the current axes and label it through the
    # Axes handle returned by seaborn.
    ax = sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt='d',
        cmap='Blues',
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()

# Final check on the held-out test split, using the tuned random forest.
# The test split is not passed to the comparison or tuning cells above.
if X_test is not None:
    evaluate_on_test(best_rf, X_test, y_test)


In [None]:
# Save the Best Model
# `best_rf` only exists when the tuning cell ran (X_train is not None), so
# mirror that guard; `df is not None` is checked first so X_train is never
# evaluated when no dataset was loaded.
# The fitted estimator is serialized with joblib (imported at the top for this
# purpose) rather than being handed to a CSV dataset writer.
if df is not None and X_train is not None:
    joblib.dump(best_rf, 'best_random_forest.pkl')
    logging.info("Best model saved to best_random_forest.pkl")
