In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score



In [2]:
# Load the data
train_data = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e2/sample_submission.csv')

In [3]:
# Separate features and target variable
X = train_data.drop(columns=['id', 'NObeyesdad'])
y = train_data['NObeyesdad']

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Preprocessing
# Define categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()



# Preprocessing
# One-hot encode categorical features with handling of unknown categories
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)


In [5]:
# Model Building
# Define clustering techniques
kmeans = KMeans(n_clusters=5, random_state=42)
agglomerative = AgglomerativeClustering(n_clusters=5)

# Define classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Pipeline for clustering and classification
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('cluster', kmeans),  # Use KMeans for clustering
    ('classifier', knn_classifier)  # Use KNN classifier
])


In [6]:
# Train the model
pipeline.fit(X_train, y_train)



In [7]:
# Predict on validation set
val_preds = pipeline.predict(X_val)

In [8]:
# Evaluate model
accuracy = accuracy_score(y_val, val_preds)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.7950385356454721


In [9]:
# Make predictions on test set
test_preds = pipeline.predict(test_data.drop(columns=['id']))


In [10]:
# Create submission file
submission_df = pd.DataFrame({'id': test_data['id'], 'NObeyesdad': test_preds})
submission_df.to_csv('submission.csv', index=False)