In [3]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np

In [5]:
!pip install seaborn 



In [6]:
!pip install matplotlib



In [7]:
 !pip install scikit-learn



In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [10]:
# Step 2: Read the CSV file from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='tested.csv')

In [11]:
# Step 3: Preview the first five records to see which columns are present
print("First 5 rows of the dataset:", df.head(), sep="\n")

First 5 rows of the dataset:
   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  


In [12]:
# Structural overview: column names, non-null counts, dtypes and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly -- wrapping it in print() emits a stray "None" line.
print("\nDataset information:")
df.info()


Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB
None


In [13]:
# Descriptive statistics for every column; include='all' adds the non-numeric
# columns (count / unique / top / freq) alongside the numeric summaries.
print("\nSummary statistics:", df.describe(include='all'), sep="\n")


Summary statistics:
        PassengerId    Survived      Pclass                      Name   Sex  \
count    418.000000  418.000000  418.000000                       418   418   
unique          NaN         NaN         NaN                       418     2   
top             NaN         NaN         NaN  Peter, Master. Michael J  male   
freq            NaN         NaN         NaN                         1   266   
mean    1100.500000    0.363636    2.265550                       NaN   NaN   
std      120.810458    0.481622    0.841838                       NaN   NaN   
min      892.000000    0.000000    1.000000                       NaN   NaN   
25%      996.250000    0.000000    1.000000                       NaN   NaN   
50%     1100.500000    0.000000    3.000000                       NaN   NaN   
75%     1204.750000    1.000000    3.000000                       NaN   NaN   
max     1309.000000    1.000000    3.000000                       NaN   NaN   

               Age       SibSp

In [16]:
# Step 4: Count the missing entries in each column
print("\nMissing values in each column:", df.isna().sum(), sep="\n")


Missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [17]:
# Drop columns with too many missing values or uninformative.
# errors='ignore' keeps this cell re-runnable: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'], errors='ignore')

# Fill missing values.
# Assign the filled Series back to the column rather than calling
# fillna(..., inplace=True) on df[col]: that form operates on an intermediate
# object, triggers a pandas FutureWarning, and will not modify df at all from
# pandas 3.0 onwards.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Verify no more missing values
print("\nMissing values after filling:")
print(df.isnull().sum())



Missing values after filling:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

In [18]:
# Pick out every column still stored with the object dtype
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"\nCategorical columns to encode: {list(categorical_cols)}")

# One fitted LabelEncoder per column, kept so the integer codes can be
# translated back to the original labels later on
le_dict = {}

for col in categorical_cols:
    # Register the encoder and bind it to `le` in a single step
    le = le_dict[col] = LabelEncoder()
    # Replace the labels with their integer codes
    df[col] = le.fit_transform(df[col])
    print(f"Encoded '{col}' with mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")



Categorical columns to encode: ['Sex', 'Embarked']
Encoded 'Sex' with mapping: {'female': np.int64(0), 'male': np.int64(1)}
Encoded 'Embarked' with mapping: {'C': np.int64(0), 'Q': np.int64(1), 'S': np.int64(2)}


In [19]:
# Final check: show the dtype of each column and a sample of the result
print("\nData types after encoding:", df.dtypes, sep="\n")
print("\nFinal preprocessed dataset:", df.head(), sep="\n")

# Write the preprocessed frame to disk; index=False leaves the row index out
df.to_csv("titanic_labeled.csv", index=False)
print("\n✅ Labeled dataset saved as 'titanic_labeled.csv'")



Data types after encoding:
Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int64
dtype: object

Final preprocessed dataset:
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  34.5      0      0   7.8292         1
1         1       3    0  47.0      1      0   7.0000         2
2         0       2    1  62.0      0      0   9.6875         1
3         0       3    1  27.0      0      0   8.6625         2
4         1       3    0  22.0      1      1  12.2875         2

✅ Labeled dataset saved as 'titanic_labeled.csv'
