In [None]:
import pandas as pd
# Read the training split from the notebook's working directory.
train_csv_path = 'train.csv'
train_data = pd.read_csv(train_csv_path)

# Cell output: summary statistics for every column, not only the numeric
# ones (include='all').
train_data.describe(include='all')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Drop rows with missing values.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the encoding loop below cannot raise SettingWithCopyWarning
# (seen on some pandas versions when writing to the result of dropna()).
train_data_clean = train_data.dropna().copy()

# Encode categorical (object-dtype) columns as integer codes.
# A fresh LabelEncoder is fitted for each column and stored in
# `label_encoders` (column name -> fitted encoder), so every mapping stays
# available for re-use or inverse_transform. Re-fitting a single shared
# encoder would keep only the mapping of the last encoded column.
label_encoders = {}
le = LabelEncoder()  # keeps `le` bound even if no object column is present
for col in train_data_clean.columns:
    if train_data_clean[col].dtype == 'object':
        le = LabelEncoder()
        train_data_clean[col] = le.fit_transform(train_data_clean[col])
        label_encoders[col] = le

# Split data into features and target ('Transported' is the label column).
X = train_data_clean.drop('Transported', axis=1)
y = train_data_clean['Transported']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier (100 trees, seeded).
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Feature importances of the fitted forest, labelled with the feature names
# and sorted from most to least important. Displayed as the cell output.
importances = clf.feature_importances_
feature_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)
feature_importances

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Imputer used to fill missing values; iteration cap and seed are fixed so
# repeated runs give the same result.
imp = IterativeImputer(random_state=42, max_iter=10)

# Fit the imputer on the test features and fill their missing values in one
# step, then re-wrap the output in a DataFrame that carries the original
# column labels.
# NOTE(review): test_data_clean is not defined in the cells visible here;
# confirm it is created (and encoded like the training data) earlier.
test_data_clean_imputed = pd.DataFrame(
    imp.fit_transform(test_data_clean),
    columns=test_data_clean.columns,
)

# Predict 'Transported' for every imputed test row with the trained classifier.
predictions = clf.predict(test_data_clean_imputed)

# Build the submission from a copy of the sample file, with its 'Transported'
# column set to the predictions.
# NOTE(review): sample_submission is not defined in the cells visible here.
submission = sample_submission.assign(Transported=predictions)

# Write the result to 'predict.csv' without the index column, then display it.
submission.to_csv('predict.csv', index=False)
submission