In [None]:
# Import Required Libraries
import pandas as pd  # DataFrame loading and preprocessing
from sklearn.ensemble import RandomForestClassifier  # classifier trained further down
from sklearn.model_selection import train_test_split  # hold-out split for evaluation
from sklearn.metrics import accuracy_score, classification_report  # evaluation metrics

# Confirmation message so the cell produces visible output when it runs
print("Libraries imported successfully")

Libraries imported successfully


## 1. Load Data

In [76]:
# Read the training CSV into a DataFrame
train_path = 'train.csv'
df = pd.read_csv(train_path)

# Preview the first five rows to check how the columns were parsed
print(df.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


## 2. Clean Missing Values and Duplicates

In [77]:
# Count missing entries per column and the number of fully duplicated rows
missing_per_column = df.isna().sum()
duplicate_rows = df.duplicated().sum()

print("Missing values:")
print(missing_per_column)
print(f"\nDuplicates: {duplicate_rows}")

Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Duplicates: 0


In [78]:
# Drop exact duplicate rows before computing any fill values
df = df.drop_duplicates()

# Fill values: median for the numeric columns, most frequent value for Embarked
age_fill = df['Age'].median()
embarked_fill = df['Embarked'].mode()[0]
fare_fill = df['Fare'].median()

df['Age'] = df['Age'].fillna(age_fill)
df['Embarked'] = df['Embarked'].fillna(embarked_fill)
df['Fare'] = df['Fare'].fillna(fare_fill)

# Cabin is not imputed in this cell, so its gaps still count toward the total below
shape_after = df.shape
missing_left = df.isnull().sum().sum()
print(f"Shape after cleaning: {shape_after}")
print(f"Missing values remaining: {missing_left}")

Shape after cleaning: (891, 12)
Missing values remaining: 687


## 3. Separate Useful and Useless Features

In [79]:
# Columns that are excluded from the feature set
useless_features = ['Name', 'Ticket', 'Cabin', 'PassengerId']
df = df.drop(useless_features, axis=1)

remaining_columns = list(df.columns)
print(f"Remaining columns: {remaining_columns}")

Remaining columns: ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


## 4. Convert Categorical Values to Numeric (Mapping)

In [80]:
# Integer codes for the two text columns
sex_mapping = {'male': 1, 'female': 2}
embarked_mapping = {'S': 1, 'C': 2, 'Q': 3}

# Swap every label for its code; a label absent from its table would become NaN
for column, codes in (('Sex', sex_mapping), ('Embarked', embarked_mapping)):
    df[column] = df[column].map(codes)

print("After mapping:")
print(df.head())

After mapping:
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  22.0      1      0   7.2500         1
1         1       1    2  38.0      1      0  71.2833         2
2         1       3    2  26.0      0      0   7.9250         1
3         1       1    2  35.0      1      0  53.1000         1
4         0       3    1  35.0      0      0   8.0500         1


## 5. Bin Age Values into Groups

In [81]:
# Bucket Age into eight intervals and store the bucket number as a plain int
age_bin_edges = [0, 12, 20, 30, 40, 50, 60, 70, 80]
age_bin_labels = [1, 2, 3, 4, 5, 6, 7, 8]
age_buckets = pd.cut(df['Age'], bins=age_bin_edges, labels=age_bin_labels)
df['Age_Group'] = age_buckets.astype(int)  # categorical labels -> integers

print("Age groups (1: 0-12, 2: 13-20, 3: 21-30, 4: 31-40, 5: 41-50, 6: 51-60, 7: 61-70, 8: 71-80)")

Age groups (1: 0-12, 2: 13-20, 3: 21-30, 4: 31-40, 5: 41-50, 6: 51-60, 7: 61-70, 8: 71-80)


## Preprocessing Result

In [82]:
# Show the first rows of the training frame after all preprocessing steps
print("Preprocessed dataset:", df.head(), sep="\n")

Preprocessed dataset:
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Age_Group
0         0       3    1  22.0      1      0   7.2500         1          3
1         1       1    2  38.0      1      0  71.2833         2          4
2         1       3    2  26.0      0      0   7.9250         1          3
3         1       1    2  35.0      1      0  53.1000         1          4
4         0       3    1  35.0      0      0   8.0500         1          4


## 6. Split Data (90% Train / 10% Test)

In [83]:
# Target is the Survived column; every other remaining column is a feature
y = df['Survived']
X = df.drop('Survived', axis=1)

# Hold out 10% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

train_shape = X_train.shape
test_shape = X_test.shape
print(f"Training set: {train_shape}")
print(f"Test set: {test_shape}")
print("Target distribution in training set:")
print(y_train.value_counts())

Training set: (801, 8)
Test set: (90, 8)
Target distribution in training set:
Survived
0    495
1    306
Name: count, dtype: int64


## 7. Train the Model

In [84]:
# Random forest with 1000 trees, each capped at depth 10, seeded for repeatability
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

print("Model trained successfully")

Model trained successfully


## 8. Test the Model on Remaining 10%

In [85]:
# Predict the held-out rows and score them against the true labels
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100

print(f"Test Accuracy: {accuracy:.4f} ({accuracy_pct:.2f}%)")
print("Classification Report:")
class_names = ['Not Survived', 'Survived']  # display names for labels 0 and 1
print(classification_report(y_test, y_pred, target_names=class_names))

Test Accuracy: 0.8444 (84.44%)
Classification Report:
              precision    recall  f1-score   support

Not Survived       0.88      0.85      0.87        54
    Survived       0.79      0.83      0.81        36

    accuracy                           0.84        90
   macro avg       0.84      0.84      0.84        90
weighted avg       0.85      0.84      0.85        90



## 9. Apply Model to test.csv

In [86]:
# Read test.csv into its own DataFrame
test_path = 'test.csv'
df_test = pd.read_csv(test_path)

# Keep an independent copy of the ids; the column is dropped during preprocessing
passenger_ids = df_test.loc[:, 'PassengerId'].copy()

print(df_test.head(5))

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


In [87]:
# Apply the same preprocessing steps to test.csv.
#
# Rows are deliberately NOT de-duplicated here: passenger_ids was captured from
# the full file before this cell, and the submission needs exactly one prediction
# per PassengerId. Dropping any row would leave the predictions shorter than
# passenger_ids and break the submission DataFrame.
#
# NOTE(review): the fill values below are computed from test.csv itself rather
# than from the training data; consider reusing the training statistics so that
# both frames are imputed identically.
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].median())
df_test['Embarked'] = df_test['Embarked'].fillna(df_test['Embarked'].mode()[0])
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

# Remove the columns that are not used as features
df_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

# Encode the text columns with the lookup tables defined for the training data
df_test['Sex'] = df_test['Sex'].map(sex_mapping)
df_test['Embarked'] = df_test['Embarked'].map(embarked_mapping)

# Bucket Age with the same edges and labels as the training frame
df_test['Age_Group'] = pd.cut(df_test['Age'], 
                               bins=[0, 12, 20, 30, 40, 50, 60, 70, 80], 
                               labels=[1, 2, 3, 4, 5, 6, 7, 8])
df_test['Age_Group'] = df_test['Age_Group'].astype(int)

# Guard the row alignment that the submission step relies on
assert len(df_test) == len(passenger_ids), "test rows no longer match passenger_ids"

print("Test dataset preprocessed")
print(df_test.head())

Test dataset preprocessed
   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Age_Group
0       3    1  34.5      0      0   7.8292         3          4
1       3    2  47.0      1      0   7.0000         1          5
2       2    1  62.0      0      0   9.6875         3          7
3       3    1  27.0      0      0   8.6625         1          3
4       3    2  22.0      1      1  12.2875         1          3


In [88]:
# Predict a label for every row of the preprocessed test.csv frame
predictions = rf_model.predict(df_test)

# Tally how many rows were assigned to each class
n_not_survived = (predictions == 0).sum()
n_survived = (predictions == 1).sum()

print("Prediction distribution:")
print(f"Not Survived (0): {n_not_survived}")
print(f"Survived (1): {n_survived}")

Prediction distribution:
Not Survived (0): 278
Survived (1): 140


In [89]:
# Pair each saved PassengerId with its predicted label
submission_columns = {
    'PassengerId': passenger_ids,
    'Survived': predictions,
}
submission = pd.DataFrame(submission_columns)

# Write the result without the index column
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")
print("Submission preview:")
print(submission.head(n=20))

Submission file created: submission.csv
Submission preview:
    PassengerId  Survived
0           892         0
1           893         0
2           894         0
3           895         0
4           896         0
5           897         0
6           898         0
7           899         0
8           900         1
9           901         0
10          902         0
11          903         0
12          904         1
13          905         0
14          906         1
15          907         1
16          908         0
17          909         0
18          910         0
19          911         0
