In [2]:
import os
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE  # For balancing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the dataset.
# The CSV location can be overridden with the DATA_PUBLIC_CSV environment
# variable. When it is not set, the file is looked up in the current user's
# Downloads folder, so the notebook is not tied to one machine's absolute path.
DATA_PATH = Path(
    os.environ.get("DATA_PUBLIC_CSV", Path.home() / "Downloads" / "data_public.csv")
)
data = pd.read_csv(DATA_PATH)

In [4]:
# Pull out the 'Class' column as the target; every other column is a feature
y = data['Class']
X = data.drop(columns='Class')

In [5]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Balancing the training dataset (currently disabled)
# NOTE(review): if this is re-enabled, consider resampling inside the CV folds
# (e.g. with imblearn's Pipeline) so synthetic rows do not leak into validation folds.
#smote = SMOTE(random_state=42)
#X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

In [6]:
# Random Forest pipeline, applied in order:
#   scaler     - standardise the features
#   pca        - project onto 4 principal components
#   selector   - keep the 2 highest-scoring components (SelectKBest's default score function)
#   classifier - Random Forest with a fixed seed for repeatable fits
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=4)),
    ('selector', SelectKBest(k=2)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [None]:
# Define the parameter grid for hyperparameter tuning.
# Keys use the '<step>__<param>' form, so each one is routed to the pipeline's
# 'classifier' step.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Use GridSearchCV for hyperparameter tuning.
# The grid has 3 * 4 * 3 = 36 candidates, each fitted on 5 folds (180 fits plus
# the final refit). n_jobs=-1 spreads those independent fits over all CPU
# cores. The fold assignment and scoring are unaffected, so the search should
# select the same parameters as a serial run, only faster.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Report the hyperparameter combination the grid search selected
print("Best parameters found: ", grid_search.best_params_)

# Predict on the held-out test split through the fitted search object, then
# print the text report comparing predictions with the true labels
y_pred = grid_search.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)