In [11]:
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
train_data = pd.read_csv("archive/Train_data.csv")

# Separate features and target variable
X_train = train_data.drop(columns=["class"])
y_train = train_data["class"]

# Encode labels if they are strings
label_encoders = {}
for column in X_train.columns:
    if X_train[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        X_train[column] = label_encoders[column].fit_transform(X_train[column])

# Encode target variable if it's a string
if y_train.dtype == 'object':
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)

# # Normalize the features
# scaler = StandardScaler()
# X_normalized = scaler.fit_transform(X)

# Initialize models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier()
}

# Train and evaluate models using cross-validation
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"{name}: Mean MSE: {scores.mean()}, Std Dev: {scores.std()}")



Decision Tree: Mean MSE: 0.9953953761690967, Std Dev: 0.0005382895772571146
Random Forest: Mean MSE: 0.9971419356175464, Std Dev: 0.0005968586197382097
KNN: Mean MSE: 0.9886074801542017, Std Dev: 0.0008194810597688928


In [None]:
# Perform hyperparameter tuning for the best performing model
best_model = RandomForestClassifier()  # Example: Replace with the best performing model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best mean cross-validation score:", -grid_search.best_score_)

In [None]:
# Evaluate the best model on the test set
test_data = pd.read_csv("archive/Test_data.csv")

# Separate features and target variable
X_test = train_data.drop(columns=["class"])
y_test = train_data["class"]

for column in X_test.columns:
    if X_test[column].dtype == 'object':
        X_test[column] = label_encoders[column].fit_transform(X_test[column])

# Encode target variable if it's a string
if y_test.dtype == 'object':
    y_test = label_encoder.fit_transform(y_test)

best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test set score: {test_score}")
