In [34]:
import pandas as pd
titanic_file = 'train.csv'
test_file = 'test.csv'
data = pd.read_csv(titanic_file)
test_data = pd.read_csv(test_file)
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [35]:
# Step 1: Handle missing values

# Fill missing Age with median grouped by Pclass and Sex

data['Age'] = data.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))



# Fill missing Embarked with the mode

data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)



# Drop the Cabin column due to high missingness

data.drop('Cabin', axis=1, inplace=True)



# Step 2: Convert categorical variables

# Encode 'Sex' and 'Embarked'

data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})



# Step 3: Drop unnecessary columns

data_cleaned = data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)



# Check the cleaned data

data_cleaned.info(), data_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)


(None,
    Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
 0         0       3    0  22.0      1      0   7.2500         2
 1         1       1    1  38.0      1      0  71.2833         0
 2         1       3    1  26.0      0      0   7.9250         2
 3         1       1    1  35.0      1      0  53.1000         2
 4         0       3    0  35.0      0      0   8.0500         2)

In [36]:
# Step 1: Handle missing values

# Fill missing Age with median grouped by Pclass and Sex

test_data['Age'] = test_data.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))



# Fill missing Embarked with the mode

test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)

# Fill missing Fare with the mode

test_data['Fare'].fillna(test_data['Fare'].mode()[0], inplace=True)


# Drop the Cabin column due to high missingness

test_data.drop('Cabin', axis=1, inplace=True)



# Step 2: Convert categorical variables

# Encode 'Sex' and 'Embarked'

test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})

test_data['Embarked'] = test_data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})



# Step 3: Drop unnecessary columns

test_data_cleaned = test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)



# Check the cleaned data

test_data_cleaned.info(), test_data_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    int64  
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 23.0 KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Fare'].fillna(test_data['Fare'].mode()[0], inplace=True)


(None,
    Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
 0       3    0  34.5      0      0   7.8292         1
 1       3    1  47.0      1      0   7.0000         2
 2       2    0  62.0      0      0   9.6875         1
 3       3    0  27.0      0      0   8.6625         2
 4       3    1  22.0      1      1  12.2875         2)

In [37]:
# Step 1: Create FamilySize feature

data_cleaned['FamilySize'] = data_cleaned['SibSp'] + data_cleaned['Parch'] + 1



# Step 2: Bin Age into categories

bins_age = [0, 12, 18, 35, 60, 100]  # Age groups: Child, Teen, Adult, Senior

labels_age = ['Child', 'Teen', 'Adult', 'Middle_Aged', 'Senior']

data_cleaned['AgeGroup'] = pd.cut(data_cleaned['Age'], bins=bins_age, labels=labels_age)



# Encode AgeGroup numerically

data_cleaned['AgeGroup'] = data_cleaned['AgeGroup'].cat.codes



# Step 3: Bin Fare into categories

bins_fare = [-1, 7.91, 14.454, 31.0, 512.3292]  # Fare categories based on dataset distribution

labels_fare = ['Low', 'Mid-Low', 'Mid-High', 'High']

data_cleaned['FareCategory'] = pd.cut(data_cleaned['Fare'], bins=bins_fare, labels=labels_fare)



# Encode FareCategory numerically

data_cleaned['FareCategory'] = data_cleaned['FareCategory'].cat.codes



# Step 4: Drop SibSp and Parch as FamilySize replaces them

data_final = data_cleaned.drop(['SibSp', 'Parch'], axis=1)



# Check the final feature set

data_final.info(), data_final.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Pclass        891 non-null    int64  
 2   Sex           891 non-null    int64  
 3   Age           891 non-null    float64
 4   Fare          891 non-null    float64
 5   Embarked      891 non-null    int64  
 6   FamilySize    891 non-null    int64  
 7   AgeGroup      891 non-null    int8   
 8   FareCategory  891 non-null    int8   
dtypes: float64(2), int64(5), int8(2)
memory usage: 50.6 KB


(None,
    Survived  Pclass  Sex   Age     Fare  Embarked  FamilySize  AgeGroup  \
 0         0       3    0  22.0   7.2500         2           2         2   
 1         1       1    1  38.0  71.2833         0           2         3   
 2         1       3    1  26.0   7.9250         2           1         2   
 3         1       1    1  35.0  53.1000         2           2         2   
 4         0       3    0  35.0   8.0500         2           1         2   
 
    FareCategory  
 0             0  
 1             3  
 2             1  
 3             3  
 4             1  )

In [38]:
# Step 1: Create FamilySize feature

test_data_cleaned['FamilySize'] = test_data_cleaned['SibSp'] + test_data_cleaned['Parch'] + 1



# Step 2: Bin Age into categories

bins_age = [0, 12, 18, 35, 60, 100]  # Age groups: Child, Teen, Adult, Senior

labels_age = ['Child', 'Teen', 'Adult', 'Middle_Aged', 'Senior']

test_data_cleaned['AgeGroup'] = pd.cut(test_data_cleaned['Age'], bins=bins_age, labels=labels_age)



# Encode AgeGroup numerically

test_data_cleaned['AgeGroup'] = test_data_cleaned['AgeGroup'].cat.codes



# Step 3: Bin Fare into categories

bins_fare = [-1, 7.91, 14.454, 31.0, 512.3292]  # Fare categories based on dataset distribution

labels_fare = ['Low', 'Mid-Low', 'Mid-High', 'High']

test_data_cleaned['FareCategory'] = pd.cut(test_data_cleaned['Fare'], bins=bins_fare, labels=labels_fare)



# Encode FareCategory numerically

test_data_cleaned['FareCategory'] = test_data_cleaned['FareCategory'].cat.codes



# Step 4: Drop SibSp and Parch as FamilySize replaces them

test_data_final = test_data_cleaned.drop(['SibSp', 'Parch'], axis=1)



# Check the final feature set

test_data_final.info(), test_data_final.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Pclass        418 non-null    int64  
 1   Sex           418 non-null    int64  
 2   Age           418 non-null    float64
 3   Fare          418 non-null    float64
 4   Embarked      418 non-null    int64  
 5   FamilySize    418 non-null    int64  
 6   AgeGroup      418 non-null    int8   
 7   FareCategory  418 non-null    int8   
dtypes: float64(2), int64(4), int8(2)
memory usage: 20.5 KB


(None,
    Pclass  Sex   Age     Fare  Embarked  FamilySize  AgeGroup  FareCategory
 0       3    0  34.5   7.8292         1           1         2             0
 1       3    1  47.0   7.0000         2           2         3             0
 2       2    0  62.0   9.6875         1           1         4             1
 3       3    0  27.0   8.6625         2           1         2             1
 4       3    1  22.0  12.2875         2           3         2             1)

In [45]:

from sklearn.preprocessing import StandardScaler



# Define features (X) and target (y)

X_train = data_final.drop('Survived', axis=1)

y_train = data_final['Survived']


X_test = test_data_final

# Scale numerical features

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)



X_train_scaled[:5], X_test_scaled[:5]  # Display a few scaled samples for verification

(array([[ 0.82737724, -0.73769513, -0.53489116, -0.50244517,  0.58595414,
          0.05915988, -0.06848951, -1.34677659],
        [-1.56610693,  1.35557354,  0.66839176,  0.78684529, -1.9423032 ,
          0.05915988,  1.10505182,  1.33773782],
        [ 0.82737724,  1.35557354, -0.23407043, -0.48885426,  0.58595414,
         -0.56097483, -0.06848951, -0.45193845],
        [-1.56610693,  1.35557354,  0.44277621,  0.42073024,  0.58595414,
          0.05915988, -0.06848951,  1.33773782],
        [ 0.82737724, -0.73769513,  0.44277621, -0.48633742,  0.58595414,
         -0.56097483, -0.06848951, -0.45193845]]),
 array([[ 0.82737724, -0.73769513,  0.40517362, -0.49078316, -0.67817453,
         -0.56097483, -0.06848951, -1.34677659],
        [ 0.82737724,  1.35557354,  1.3452384 , -0.50747884,  0.58595414,
          0.05915988,  1.10505182, -1.34677659],
        [-0.36936484, -0.73769513,  2.47331614, -0.45336687, -0.67817453,
         -0.56097483,  2.27859315, -0.45193845],
        [ 0.82

In [None]:
X_test = test_data_final[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']]

KeyError: "['PassengerId'] not in index"

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)

In [53]:
print("Rows in test_data_final:", test_data_final.shape[0])
print("Rows in X_test_scaled:", X_test_scaled.shape[0])
print("Length of y_pred_test:", len(y_pred))

Rows in test_data_final: 418
Rows in X_test_scaled: 418
Length of y_pred_test: 418


In [56]:
test_data['Survived'] = y_pred
test_data[['PassengerId', 'Survived']].to_csv("predictions.csv", index=False)