In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

In [3]:
train.isnull().sum()
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [4]:
# handiling 86 missingg age values 
train['Age']=train['Age'].fillna(train['Age'].median())
test['Age']=test['Age'].fillna(test['Age'].median())
test['Fare'] = test['Fare'].fillna(train['Fare'].median())
train=train.drop('Cabin',axis=1)
test=test.drop('Cabin',axis=1)
train.isnull().sum()
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [5]:

def feature_engineering_data(train, test):
    # Family features
    train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
    test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
    
    train['IsAlone'] = (train['FamilySize'] == 1).astype(int)
    test['IsAlone'] = (test['FamilySize'] == 1).astype(int)
    
    # Title extraction and encoding
    train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 
                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    
    for df in [train, test]:
        df['Title'] = df['Title'].replace(rare_titles, 'Rare')
        df['Title'] = df['Title'].replace('Mlle', 'Miss')
        df['Title'] = df['Title'].replace('Ms', 'Miss')
        df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    train['Title'] = le.fit_transform(train['Title'])
    test['Title'] = le.transform(test['Title'])
    
    # FIX: Handle Sex encoding properly (fill NaN first)
    # Option 1: Fill NaN with mode before encoding
    train['Sex'] = train['Sex'].fillna(train['Sex'].mode()[0])
    test['Sex'] = test['Sex'].fillna(train['Sex'].mode()[0])
    
    train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
    test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})
    
    # FIX: Handle Embarked encoding (fill NaN first)
    train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
    test['Embarked'] = test['Embarked'].fillna(train['Embarked'].mode()[0])
    
    train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    test['Embarked'] = test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    
    # Drop text columns
    train = train.drop(['Name', 'Ticket', 'PassengerId'], axis=1)
    test_ids = test['PassengerId'].copy()
    test = test.drop(['Name', 'Ticket', 'PassengerId'], axis=1)
    
    # Final check: ensure all numeric
    print("✓ Feature engineering complete")
    print(f"Train dtypes:\n{train.dtypes}")
    
    return train, test, test_ids
train, test, test_ids = feature_engineering_data(train, test)

✓ Feature engineering complete
Train dtypes:
Survived        int64
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked        int64
FamilySize      int64
IsAlone         int64
Title           int64
dtype: object


In [6]:
# Check Sex column before encoding
print("Unique values in train Sex:", train['Sex'].unique())
print("Unique values in test Sex:", test['Sex'].unique())

# Check for NaN
print("\nNaN in train Sex:", train['Sex'].isnull().sum())
print("NaN in test Sex:", test['Sex'].isnull().sum())

# Check if there are unexpected values
print("\nValue counts train:", train['Sex'].value_counts())
print("Value counts test:", test['Sex'].value_counts())


Unique values in train Sex: [0 1]
Unique values in test Sex: [0 1]

NaN in train Sex: 0
NaN in test Sex: 0

Value counts train: Sex
0    577
1    314
Name: count, dtype: int64
Value counts test: Sex
0    266
1    152
Name: count, dtype: int64


In [7]:
# Check current state
print("Train columns:", train.columns.tolist())
print("Train shape:", train.shape)
print("Train dtypes:\n", train.dtypes)

# If all columns are numeric, you can proceed directly!
if train.dtypes.apply(lambda x: x.kind in 'biufc').all():
    print("\n✓ All columns are numeric - ready for modeling!")
    
    # Split and train
    X = train.drop('Survived', axis=1)
    y = train['Survived']
    
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    
    val_pred = rf.predict(X_val)
    print(f"\nValidation Accuracy: {accuracy_score(y_val, val_pred):.4f}")
    
    # Make final predictions
    final_pred = rf.predict(test)
    
    submission = pd.DataFrame({
        'PassengerId': test_ids,
        'Survived': final_pred
    })
    submission.to_csv('submission.csv', index=False)
    print("\n✓ Submission file created!")


Train columns: ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'IsAlone', 'Title']
Train shape: (891, 11)
Train dtypes:
 Survived        int64
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked        int64
FamilySize      int64
IsAlone         int64
Title           int64
dtype: object

✓ All columns are numeric - ready for modeling!

Validation Accuracy: 0.8380

✓ Submission file created!
