In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [19]:
# Read the labelled training set and the unlabelled test set from disk
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/train.csv', 'Data/test.csv')
)

In [20]:
# Pull out the target column, then keep every column except the row
# identifier and the target as model inputs
y = train_data['Class']
X = train_data.drop(columns=['Id', 'Class'])

In [21]:
# Encode the target variable as integer class codes.
# The encoder is fitted on the raw 'Class' column rather than on `y` itself,
# so re-running this cell never re-fits it on already-encoded integers
# (which would make le.inverse_transform hand back codes, not class labels).
le = LabelEncoder()
y = le.fit_transform(train_data['Class'])

In [22]:
# Expand the input features with pandas' dummy (one-hot) encoding
X_encoded = pd.get_dummies(data=X)

In [23]:
# Fill missing feature values with the per-column mean.
# NOTE(review): the means are learned from every row of X_encoded, including
# rows that a later cell holds out for validation, so the validation score may
# be slightly optimistic -- confirm whether that is acceptable.
imputer = SimpleImputer(strategy='mean').fit(X_encoded)
X_imputed = imputer.transform(X_encoded)

In [24]:
# Hold out 20% of the rows for validation; the fixed seed keeps the
# partition the same from run to run
X_train, X_val, y_train, y_val = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Fit a 100-tree random forest on the training split (seeded for
# repeatability); the trailing expression displays the fitted estimator
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [26]:
# Predict encoded class labels for the held-out validation rows
y_pred = clf.predict(X=X_val)

In [27]:
# Score the validation predictions: fraction of rows classified correctly
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.9354838709677419


In [28]:
# Prepare the test features: drop the row identifier and one-hot encode,
# mirroring what was done to the training features
test_X = test_data.drop(columns='Id')
test_X_encoded = pd.get_dummies(data=test_X)

In [29]:
# Align test data with training data.
# reindex does the whole job in one step: dummy columns the test set lacks
# are added and filled with 0, columns the training set never produced are
# dropped, and the column order is made identical to X_encoded. Doing it in
# one call also avoids inserting columns one at a time, which fragments the
# frame when many columns are missing.
# `missing_cols` is kept so the absent columns can still be inspected.
missing_cols = set(X_encoded.columns) - set(test_X_encoded.columns)
test_X_encoded = test_X_encoded.reindex(columns=X_encoded.columns, fill_value=0)

In [30]:
# Fill missing test values using the imputer that was fitted earlier
# (transform only -- the imputer is not re-fitted on the test set)
test_X_imputed = imputer.transform(X=test_X_encoded)

In [31]:
# Predict encoded class labels for the test set
test_predictions = clf.predict(X=test_X_imputed)

In [32]:
# Decode the predicted labels back to the original class values.
# The codes are taken straight from the classifier instead of from
# `test_predictions` itself, so the cell yields the same result on every run;
# otherwise a second run would push already-decoded labels through
# inverse_transform again.
test_predictions = le.inverse_transform(clf.predict(test_X_imputed))

In [33]:
# Pair each test Id with its predicted class and write the result to CSV
# without the DataFrame index
submission = pd.DataFrame(
    {
        'Id': test_data['Id'],
        'Class': test_predictions,
    }
)
submission.to_csv('submission.csv', index=False)