In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import joblib
import os
from datetime import datetime

# Global variables for hyperparameters
LOGISTIC_REGRESSION_PARAMS = {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear'], 'max_iter': [1000]}
LDA_PARAMS = {'solver': ['svd', 'lsqr']}
QDA_PARAMS = {}
KNN_PARAMS = {'n_neighbors': [3, 5, 7]}
DECISION_TREE_PARAMS = {'max_depth': [3, 5, 7]}
RANDOM_FOREST_PARAMS = {'n_estimators': [50, 100], 'max_depth': [3, 5, 7]}
EXTRA_TREES_PARAMS = {'n_estimators': [50, 100], 'max_depth': [3, 5, 7]}
GRADIENT_BOOSTING_PARAMS = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}
XGBOOST_PARAMS = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}
LIGHTGBM_PARAMS = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5], 'verbose': [-1]}
CATBOOST_PARAMS = {'iterations': [50, 100], 'learning_rate': [0.01, 0.1], 'depth': [3, 5]}
SVM_PARAMS = {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']}
NU_SVC_PARAMS = {'nu': [0.1, 0.5], 'kernel': ['rbf', 'linear']}
LINEAR_SVC_PARAMS = {'C': [0.1, 1, 10]}
GAUSSIAN_NB_PARAMS = {}
MLP_PARAMS = {'hidden_layer_sizes': [(50,), (100,)], 'max_iter': [500, 1000]}
GAUSSIAN_PROCESS_PARAMS = {'max_iter_predict': [100], 'kernel': [1.0 * RBF(1.0)]}
SGD_PARAMS = {'max_iter': [500, 1000], 'tol': [1e-3, 1e-4]}
PASSIVE_AGGRESSIVE_PARAMS = {'C': [0.1, 1, 10], 'max_iter': [500, 1000], 'tol': [1e-3, 1e-4]}
PERCEPTRON_PARAMS = {'max_iter': [500, 1000], 'tol': [1e-3, 1e-4]}
RIDGE_PARAMS = {'alpha': [0.1, 1, 10]}
BAGGING_PARAMS = {'n_estimators': [5, 10]}
ADABOOST_PARAMS = {'n_estimators': [50, 100], 'learning_rate': [0.1, 1.0]}
HISTOGRAM_GBM_PARAMS = {'max_iter': [50, 100], 'learning_rate': [0.1, 1.0]}

# Flag to enable grid search
ENABLE_GRID_SEARCH = False

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    print("Loading and preprocessing data...")
    try:
        data = pd.read_csv(file_path)
        X = data.drop('label', axis=1)
        y = data['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("Data loaded and preprocessed successfully.")
        return X_train_scaled, X_test_scaled, y_train, y_test
    except Exception as e:
        print(f"Error in loading and preprocessing data: {str(e)}")
        raise

# Function to train and evaluate a model
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name, params):
    print(f"Training and evaluating {model_name}...")
    try:
        if ENABLE_GRID_SEARCH and hasattr(model, 'get_params'):
            try:
                grid_search = GridSearchCV(model, params, cv=5, n_jobs=-1)
                grid_search.fit(X_train, y_train)
                model = grid_search.best_estimator_
                print(f"Best parameters for {model_name}: {grid_search.best_params_}")
            except Exception as e:
                print(f"Grid search failed for {model_name}: {str(e)}")
                print("Falling back to default parameters.")
                model.fit(X_train, y_train)
        else:
            model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        print(f"{model_name} trained and evaluated successfully. Accuracy: {accuracy:.4f}")
        return model, accuracy, report
    except Exception as e:
        print(f"Error in training and evaluating {model_name}: {str(e)}")
        return None, None, None

# Function to save model
def save_model(model, model_name, params):
    print(f"Saving {model_name}...")
    try:
        if not os.path.exists('Models'):
            os.makedirs('Models')
        param_str = '_'.join([f"{k}_{v}" for k, v in params.items() if k != 'estimators'])[:100]  # Limit filename length
        filename = f"Models/{model_name}_{param_str}.joblib"
        joblib.dump(model, filename)
        print(f"{model_name} saved successfully.")
    except Exception as e:
        print(f"Error in saving {model_name}: {str(e)}")

# Function to create VotingClassifier
def create_voting_classifier(X_train, y_train):
    estimators = []
    for name, model in [('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('svm', SVC())]:
        try:
            model.fit(X_train, y_train)
            estimators.append((name, model))
        except Exception as e:
            print(f"Error fitting {name} for VotingClassifier: {str(e)}")
    return VotingClassifier(estimators=estimators)

# Main pipeline function
def run_pipeline(input_file):
    print("Starting the pipeline...")
    try:
        X_train, X_test, y_train, y_test = load_and_preprocess_data(input_file)
        
        # Check number of classes
        n_classes = len(np.unique(y_train))
        print(f"Number of classes in the dataset: {n_classes}")
        if n_classes < 2:
            raise ValueError("The dataset must contain at least 2 classes for classification tasks.")
        
        models = [
            (LogisticRegression(), "LogisticRegression", LOGISTIC_REGRESSION_PARAMS),
            (LinearDiscriminantAnalysis(), "LDA", LDA_PARAMS),
            (QuadraticDiscriminantAnalysis(), "QDA", QDA_PARAMS),
            (KNeighborsClassifier(), "KNN", KNN_PARAMS),
            (DecisionTreeClassifier(), "DecisionTree", DECISION_TREE_PARAMS),
            (RandomForestClassifier(), "RandomForest", RANDOM_FOREST_PARAMS),
            (ExtraTreesClassifier(), "ExtraTrees", EXTRA_TREES_PARAMS),
            (GradientBoostingClassifier(), "GradientBoosting", GRADIENT_BOOSTING_PARAMS),
            (XGBClassifier(), "XGBoost", XGBOOST_PARAMS),
            (LGBMClassifier(), "LightGBM", LIGHTGBM_PARAMS),
            (CatBoostClassifier(verbose=False), "CatBoost", CATBOOST_PARAMS),
            (SVC(), "SVM", SVM_PARAMS),
            (NuSVC(), "NuSVC", NU_SVC_PARAMS),
            (LinearSVC(), "LinearSVC", LINEAR_SVC_PARAMS),
            (GaussianNB(), "GaussianNB", GAUSSIAN_NB_PARAMS),
            (MLPClassifier(), "MLP", MLP_PARAMS),
            (GaussianProcessClassifier(), "GaussianProcess", GAUSSIAN_PROCESS_PARAMS),
            (SGDClassifier(), "SGD", SGD_PARAMS),
            (PassiveAggressiveClassifier(), "PassiveAggressive", PASSIVE_AGGRESSIVE_PARAMS),
            (Perceptron(), "Perceptron", PERCEPTRON_PARAMS),
            (RidgeClassifier(), "Ridge", RIDGE_PARAMS),
            (create_voting_classifier(X_train, y_train), "Voting", {}),
            (BaggingClassifier(), "Bagging", BAGGING_PARAMS),
            (AdaBoostClassifier(), "AdaBoost", ADABOOST_PARAMS),
            (HistGradientBoostingClassifier(), "HistGradientBoosting", HISTOGRAM_GBM_PARAMS),
        ]
        
        results = []
        
        for model, model_name, params in models:
            try:
                trained_model, accuracy, report = train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name, params)
                if trained_model is not None:
                    save_model(trained_model, model_name, params)
                    results.append({
                        'Model': model_name,
                        'Accuracy': accuracy,
                        'Report': report
                    })
            except Exception as e:
                print(f"Error occurred while processing {model_name}: {str(e)}")
        
        results_df = pd.DataFrame(results)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_df.to_csv(f"Results_{timestamp}.csv", index=False)
        print(f"Results saved to Results_{timestamp}.csv")
        print("Pipeline completed successfully.")
    except Exception as e:
        print(f"Error in running the pipeline: {str(e)}")

# Run the pipeline
if __name__ == "__main__":
    input_file = "../dataset/final_dataset.csv"  
    run_pipeline(input_file)

Starting the pipeline...
Loading and preprocessing data...
Data loaded and preprocessed successfully.
Number of classes in the dataset: 6


In [5]:
data = pd.read_csv(input_file)
X = data.drop('label', axis=1)
y = data['label']

print(np.isnan(y))

# print("Indices of NaN values:", nan_indices)


0         False
1         False
2         False
3         False
4         False
          ...  
622981     True
622982     True
622983     True
622984     True
622985     True
Name: label, Length: 622986, dtype: bool
