# In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Read the training and test tables from their CSV files (relative paths,
# training file first).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

# Data preprocessing
def preprocess_data(data):
    """Return a cleaned, fully numeric version of a passenger table.

    Steps, in order:
      1. Remove the 'Name', 'Ticket' and 'Cabin' columns.
      2. One-hot encode 'Sex' and 'Embarked', dropping the first level of
         each so the indicator columns are not redundant.
      3. Replace missing 'Age' and 'Fare' values with the mean of the
         respective column, computed from ``data`` itself.

    The frame passed in is left untouched; a new frame is returned.
    """
    discarded_cols = ['Name', 'Ticket', 'Cabin']
    categorical_cols = ['Sex', 'Embarked']
    imputed_cols = ['Age', 'Fare']

    # Drop and encode in one expression; both calls return new frames.
    cleaned = pd.get_dummies(
        data.drop(columns=discarded_cols),
        columns=categorical_cols,
        drop_first=True,
    )

    # Mean imputation is fitted on whatever frame is passed in.
    mean_imputer = SimpleImputer(strategy='mean')
    cleaned[imputed_cols] = mean_imputer.fit_transform(cleaned[imputed_cols])

    return cleaned

# Apply preprocessing to training and test data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Split the training data into features and labels
# NOTE(review): only 'Survived' is dropped, so 'PassengerId' stays in X as a
# feature. It looks like a row identifier; consider excluding it from the
# features (confirm before changing, as it alters the reported accuracy).
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features. The scaler is fitted on the training split only and
# then reused, unchanged, for every other split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
predictions = model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, predictions)
print(f"Validation Accuracy: {accuracy}")

# Make predictions on the test set.
# The model was fitted on scaled features, so the test features must go
# through the same fitted scaler; predicting on the raw frame would compare
# unscaled values against thresholds learned in standardized units.
# Columns are first aligned to the training feature layout because the
# one-hot encoding is computed separately for each frame: an indicator column
# missing from the test frame is added as all zeros, and any extra column is
# dropped.
X_test = scaler.transform(test_data.reindex(columns=X.columns, fill_value=0))
test_predictions = model.predict(X_test)

# Create a DataFrame with the predictions and save to CSV
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions})
submission.to_csv('titanic_predictions.csv', index=False)

# Output: Validation Accuracy: 0.8324022346368715


