In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline: load the train/test CSVs, impute, engineer features,
# one-hot encode, scale, and produce X_train / y_train / X_test for modeling.
# Every statistic (medians, scaler mean/std) is learned from the training set
# only and then applied unchanged to the test set.

# Load the dataset
train_df = pd.read_csv("/kaggle/input/dataset/train.csv")
test_df = pd.read_csv("/kaggle/input/dataset/test.csv")

# Handle missing values for the 'Age' and 'Fare' columns. The median is
# computed on the training data and reused for the test data, so both splits
# are imputed with the same value (consistent with how the scaler is fitted).
for column in ('Age', 'Fare'):
    train_median = train_df[column].median()
    train_df[column] = train_df[column].fillna(train_median)
    test_df[column] = test_df[column].fillna(train_median)

# Drop Cabin column (too many missing values)
train_df = train_df.drop(columns='Cabin')
test_df = test_df.drop(columns='Cabin')

# Feature engineering - FamilySize counts siblings/spouses + parents/children
# + the passenger themself.
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# Encode categorical variables (Sex and Embarked). The two frames are encoded
# independently here; their columns are aligned further down.
categorical_columns = ['Sex', 'Embarked']
train_df = pd.get_dummies(train_df, columns=categorical_columns)
test_df = pd.get_dummies(test_df, columns=categorical_columns)

# Feature scaling - fit on the training data only, then apply the same
# transformation to the test data.
scaled_columns = ['Age', 'Fare']
scaler = StandardScaler()
train_df[scaled_columns] = scaler.fit_transform(train_df[scaled_columns])
test_df[scaled_columns] = scaler.transform(test_df[scaled_columns])

# Final data preparation - remove identifier/free-text columns and the raw
# columns already summarized by FamilySize.
drop_columns = ['PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch']
train_df = train_df.drop(columns=drop_columns)
test_df = test_df.drop(columns=drop_columns)

# Separate features and target variable
X_train = train_df.drop(columns='Survived')
y_train = train_df['Survived']

# Align the test features with the training features: same columns, same
# order. A dummy column missing from the test set (category never seen there)
# is added as all-False; a dummy column that exists only in the test set is
# dropped, since a model fitted on X_train cannot use it.
X_test = test_df.reindex(columns=X_train.columns, fill_value=False)

# Check final data before modeling
print(X_train.head())


   Pclass       Age      Fare  FamilySize  Sex_female  Sex_male  Embarked_C  \
0       3 -0.565736 -0.502445           2       False      True       False   
1       1  0.663861  0.786845           2        True     False        True   
2       3 -0.258337 -0.488854           1        True     False       False   
3       1  0.433312  0.420730           2        True     False       False   
4       3  0.433312 -0.486337           1       False      True       False   

   Embarked_Q  Embarked_S  
0       False        True  
1       False       False  
2       False        True  
3       False        True  
4       False        True  
