In [6]:
#  Titanic Analysis - Part 1: Dataset Inspection

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load train.csv into `df`, the notebook-wide frame that the later cells read
# and modify.
df = pd.read_csv('/content/train.csv')

# --- Step 1: preview both ends of the table ---------------------------------
# Passing the frame as a second argument with sep="\n" emits the heading and
# the table on separate lines, exactly like two consecutive print() calls.
print("🔹 First 5 rows:", df.head(), sep="\n")
print("\n🔹 Last 5 rows:", df.tail(), sep="\n")

# --- Step 2: dimensions and column labels -----------------------------------
n_rows_n_cols = df.shape
column_labels = df.columns.tolist()
print("\n🔹 Dataset Shape:", n_rows_n_cols)
print("🔹 Column Names:", column_labels)

# --- Step 3: dtypes / non-null counts, then summary statistics --------------
print("\n🔹 Dataset Info:")
df.info()  # info() prints its report itself, so it is not wrapped in print()

summary_table = df.describe(include='all')  # include='all' also summarises non-numeric columns
print("\n🔹 Dataset Description:", summary_table, sep="\n")

# --- Step 4: count fully duplicated rows ------------------------------------
is_repeat = df.duplicated()  # True for each row identical to an earlier row
duplicate_rows = is_repeat.sum()
print("\n🔹 Number of duplicate rows:", duplicate_rows)


🔹 First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   N

In [5]:
# Colab-only module (not available outside Google Colab).
from google.colab import files
# Prompt for a file upload; the call's return value is kept in `uploaded`,
# which none of the cells shown in this notebook read afterwards.
# NOTE(review): the Part 1 cell reads '/content/train.csv' but is placed
# before this cell. The execution counts (In [5] here, In [6] there) suggest
# this upload was run first — presumably it must be on a fresh runtime;
# consider moving this cell to the top so Run All works in order.
uploaded = files.upload()


Saving train.csv to train.csv


In [9]:
#  Part 2: Data Exploration
# Read-only tour of `df`: column selection, positional and boolean indexing,
# category summaries and two filters. Nothing in this cell modifies `df`.
# Each print() passes heading and result with sep="\n", which produces the
# same text as printing the heading and the result in two separate calls.

# 1. Passenger names alongside Sex and Age
identity_cols = ['Name', 'Sex', 'Age']
print("🔹 First 5 rows of Name, Sex, Age:", df[identity_cols].head(), sep="\n")

# 2. Positional access with iloc, boolean-mask access with loc
fourth_row = df.iloc[3]  # a single position returns that row as a Series
print("\n🔹 Row at index 3 using iloc:", fourth_row, sep="\n")

rows_10_to_14 = df.iloc[10:15]  # slice end is exclusive, so positions 10..14
print("\n🔹 Rows 10 to 14 using iloc:", rows_10_to_14, sep="\n")

in_first_class = df['Pclass'] == 1
print("\n🔹 Passengers with Pclass = 1 using loc:", df.loc[in_first_class].head(), sep="\n")

# 3. Distinct values found in the 'Sex' column
sex_values = df['Sex'].unique()
print("\n🔹 Unique values in 'Sex':", sex_values, sep="\n")

# 4. How often each 'Embarked' value occurs
embarked_counts = df['Embarked'].value_counts()
print("\n🔹 Value counts for 'Embarked':", embarked_counts, sep="\n")

# 5. Passengers older than 60, with Name, Age and Survived
over_sixty = df['Age'] > 60
print("\n🔹 Passengers older than 60:", df.loc[over_sixty, ['Name', 'Age', 'Survived']], sep="\n")

# 6. Female passengers travelling in 1st class, with Name, Sex and Pclass
is_female = df['Sex'] == 'female'
print("\n🔹 Female passengers in 1st class:", df.loc[is_female & in_first_class, ['Name', 'Sex', 'Pclass']], sep="\n")


🔹 First 5 rows of Name, Sex, Age:
                                                Name     Sex   Age
0                            Braund, Mr. Owen Harris    male  22.0
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0
2                             Heikkinen, Miss. Laina  female  26.0
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0
4                           Allen, Mr. William Henry    male  35.0

🔹 Row at index 3 using iloc:
PassengerId                                               4
Survived                                                  1
Pclass                                                    1
Name           Futrelle, Mrs. Jacques Heath (Lily May Peel)
Sex                                                  female
Age                                                    35.0
SibSp                                                     1
Parch                                                     0
Ticket                                               1

In [7]:
#  Part 3: Handling Missing Data (Updated Version)
# Reports missing values per column, drops the mostly-empty 'Cabin' column and
# any all-NaN rows, then imputes 'Embarked' (mode) and 'Age' (median).
# Every step rebinds or assigns into the notebook-wide `df`.

# 1. Count missing values per column and calculate percentages
print("🔹 Missing values per column:")
missing_count = df.isnull().sum()
missing_percent = (missing_count / len(df)) * 100
missing_df = pd.DataFrame({'Missing Count': missing_count, 'Percentage': missing_percent})
print(missing_df)

# 2. Drop the 'Cabin' column (too many missing values)
# The drop used to be commented out, so the message below was printed while
# the column stayed in `df`. errors='ignore' makes the drop a no-op when
# 'Cabin' is already gone, so re-running this cell does not raise KeyError.
print("\n🔹 Dropping 'Cabin' column...")
df = df.drop(columns=['Cabin'], errors='ignore')

# 3. Drop rows that are completely empty
# how='all' removes a row only when every one of its values is missing.
print("\n🔹 Dropping completely empty rows...")
df = df.dropna(how='all')

# 4. Fill missing 'Embarked' values with the most frequent category
# mode() returns a Series (there can be ties); [0] takes its first entry.
most_common_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)
print(f"\n🔹 Filled missing 'Embarked' with: {most_common_embarked}")

# 5. Fill missing 'Age' values with the median
# nanmedian ignores the NaN entries when computing the median.
median_age = np.nanmedian(df['Age'])
df['Age'] = df['Age'].fillna(median_age)
print(f"🔹 Filled missing 'Age' with median value: {median_age}")


🔹 Missing values per column:
             Missing Count  Percentage
PassengerId              0    0.000000
Survived                 0    0.000000
Pclass                   0    0.000000
Name                     0    0.000000
Sex                      0    0.000000
Age                    177   19.865320
SibSp                    0    0.000000
Parch                    0    0.000000
Ticket                   0    0.000000
Fare                     0    0.000000
Cabin                  687   77.104377
Embarked                 2    0.224467

🔹 Dropping 'Cabin' column...

🔹 Dropping completely empty rows...

🔹 Filled missing 'Embarked' with: S
🔹 Filled missing 'Age' with median value: 28.0


In [8]:
#  Part 4: Statistical Summaries
# Descriptive statistics for Age and Fare, grouped summaries, and correlations.
# Read-only with respect to `df`.

import numpy as np


def _print_basic_stats(label, values, lead=""):
    """Print mean, median, std dev and the 25/50/75th percentiles of `values`.

    Every output line starts with `label`; `lead` is prepended to the first
    line only (pass "\n" to emit a blank separator line before the group).
    """
    print(f"{lead}{label} - Mean:", values.mean())
    print(f"{label} - Median:", values.median())
    print(f"{label} - Std Dev:", values.std())
    print(f"{label} - 25th, 50th, 75th Percentiles:", np.percentile(values, [25, 50, 75]))


# 1. Basic stats for Age and Fare
print("🔹 Age and Fare - Mean, Median, Std:")
_print_basic_stats("Age", df['Age'])
_print_basic_stats("Fare", df['Fare'], lead="\n")

# 2. Survival rate by Sex (mean of the Survived column within each group)
survival_by_sex = df.groupby('Sex')['Survived'].mean()
print("\n🔹 Survival Rate by Sex:", survival_by_sex, sep="\n")

# 3. Average Age for every (Pclass, Sex) combination
mean_age_by_class_sex = df.groupby(['Pclass', 'Sex'])['Age'].mean()
print("\n🔹 Average Age by Pclass and Sex:", mean_age_by_class_sex, sep="\n")

# 4. Fare summary per Embarked value
fare_by_embarked = df.groupby('Embarked')['Fare'].agg(['mean', 'median', 'min', 'max'])
print("\n🔹 Fare stats by Embarked:", fare_by_embarked, sep="\n")

# 5. Several aggregations over Age and Fare in a single .agg() call
age_fare_summary = df[['Age', 'Fare']].agg(['mean', 'median', 'std'])
print("\n🔹 Aggregated stats (Age and Fare):", age_fare_summary, sep="\n")

# 6. Correlations: all numeric columns via pandas, then Age vs Fare via NumPy
numeric_corr = df.corr(numeric_only=True)
print("\n🔹 Correlation using Pandas:", numeric_corr, sep="\n")

print("\n🔹 Correlation using NumPy (Age vs Fare):")
corr_matrix = np.corrcoef(df['Age'], df['Fare'])  # 2x2 matrix for the two inputs
age_fare_corr = corr_matrix[0, 1]                 # off-diagonal entry = Age-vs-Fare coefficient
print("Correlation between Age and Fare:", age_fare_corr)


🔹 Age and Fare - Mean, Median, Std:
Age - Mean: 29.36158249158249
Age - Median: 28.0
Age - Std Dev: 13.019696550973194
Age - 25th, 50th, 75th Percentiles: [22. 28. 35.]

Fare - Mean: 32.204207968574636
Fare - Median: 14.4542
Fare - Std Dev: 49.693428597180905
Fare - 25th, 50th, 75th Percentiles: [ 7.9104 14.4542 31.    ]

🔹 Survival Rate by Sex:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

🔹 Average Age by Pclass and Sex:
Pclass  Sex   
1       female    33.978723
        male      38.995246
2       female    28.703947
        male      30.512315
3       female    23.572917
        male      26.911873
Name: Age, dtype: float64

🔹 Fare stats by Embarked:
               mean  median     min       max
Embarked                                     
C         59.954144   29.70  4.0125  512.3292
Q         13.276030    7.75  6.7500   90.0000
S         27.243651   13.00  0.0000  263.0000

🔹 Aggregated stats (Age and Fare):
              Age       Fare
mean    29.361

In [10]:
# 🔍 Part 5: Data Filtering
# Row filters (boolean masks and .query()) plus two sorts. Each step names its
# row selector first and then picks the display columns; `df` is not modified.
# Only the first rows of each result are printed (.head()).

# 1. Passengers who paid a fare over 100
paid_over_100 = df['Fare'] > 100
print("🔹 Passengers with Fare > 100:", df.loc[paid_over_100, ['Name', 'Fare']].head(), sep="\n")

# 2. Female passengers with their survival status
is_female = df['Sex'] == 'female'
print("\n🔹 Female passengers and survival:", df.loc[is_female, ['Name', 'Survived']].head(), sep="\n")

# 3. Male passengers in 1st class, with fare and age
is_male = df['Sex'] == 'male'
is_first_class = df['Pclass'] == 1
print("\n🔹 Male passengers in 1st class:", df.loc[is_male & is_first_class, ['Name', 'Fare', 'Age']].head(), sep="\n")

# 4. Passengers younger than 18 or older than 60
under_18 = df['Age'] < 18
over_60 = df['Age'] > 60
print("\n🔹 Passengers younger than 18 or older than 60:", df.loc[under_18 | over_60, ['Name', 'Age']].head(), sep="\n")

# 5. The same kind of filtering expressed with .query()
embarked_s_survivors = df.query("Embarked == 'S' and Survived == 1")
print("\n🔹 Passengers who embarked from 'S' and survived:", embarked_s_survivors[['Name', 'Embarked', 'Survived']].head(), sep="\n")

third_class_over_50 = df.query("Pclass == 3 and Fare > 50")
print("\n🔹 Passengers in 3rd class who paid over 50:", third_class_over_50[['Name', 'Pclass', 'Fare']].head(), sep="\n")

# 6. Passengers ordered from highest to lowest Fare
fare_ranked = df.sort_values(by='Fare', ascending=False)
print("\n🔹 Passengers sorted by Fare (descending):", fare_ranked[['Name', 'Fare']].head(), sep="\n")

# 7. Youngest surviving passenger: sort survivors by Age ascending, keep the top row
survivors = df[df['Survived'] == 1]
youngest_survivor = survivors.sort_values(by='Age')[['Name', 'Age', 'Survived']].head(1)
print("\n🔹 Youngest surviving passenger:", youngest_survivor, sep="\n")

# 8. Boolean mask: Fare > 100 and in 1st class
print("\n🔹 Fare > 100 and Pclass = 1:")
mask = paid_over_100 & is_first_class
print(df.loc[mask, ['Name', 'Fare', 'Pclass']].head())


🔹 Passengers with Fare > 100:
                                               Name      Fare
27                   Fortune, Mr. Charles Alexander  263.0000
31   Spencer, Mrs. William Augustus (Marie Eugenie)  146.5208
88                       Fortune, Miss. Mabel Helen  263.0000
118                        Baxter, Mr. Quigg Edmond  247.5208
195                            Lurette, Miss. Elise  146.5208

🔹 Female passengers and survival:
                                                Name  Survived
1  Cumings, Mrs. John Bradley (Florence Briggs Th...         1
2                             Heikkinen, Miss. Laina         1
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)         1
8  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)         1
9                Nasser, Mrs. Nicholas (Adele Achem)         1

🔹 Male passengers in 1st class:
                              Name      Fare   Age
6          McCarthy, Mr. Timothy J   51.8625  54.0
23    Sloper, Mr. William Thompson   35.5000  28.0

In [11]:
#  Part 6: Feature Engineering
# Adds derived columns to the notebook-wide `df`: family size flags, a cleaned
# Title, binned Age/Fare, a z-scored Fare, encoded Sex/Embarked, fare per
# person and the passenger's surname. Written so the cell can be re-run
# without piling up duplicate columns.

# 1. Create FamilySize = SibSp + Parch + 1
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
print("🔹 FamilySize column:")
print(df[['SibSp', 'Parch', 'FamilySize']].head())

# 2. Create IsAlone = 1 if FamilySize == 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
print("\n🔹 IsAlone column:")
print(df[['FamilySize', 'IsAlone']].head())

# 3. Extract Title from Name
# The pattern captures the run of letters that follows a space and ends with
# a period (e.g. "Mr" in "Braund, Mr. Owen Harris"). It is a raw string
# because '\.' inside a normal string literal is an invalid escape sequence,
# which recent Python versions warn about.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print("\n🔹 Title extracted from Name:")
print(df[['Name', 'Title']].head())

# 4. Group rare titles into 'Rare' and normalize others
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
df['Title'] = df['Title'].replace(rare_titles, 'Rare')
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
print("\n🔹 Cleaned Title values:")
print(df['Title'].value_counts())

# 5. Create AgeBin using pd.cut()
# pd.cut bins are right-closed by default: (0, 12], (12, 18], (18, 40], ...
df['AgeBin'] = pd.cut(df['Age'], bins=[0, 12, 18, 40, 60, 100],
                      labels=['Child', 'Teen', 'Adult', 'MidAge', 'Senior'])
print("\n🔹 AgeBin column:")
print(df[['Age', 'AgeBin']].head())

# 6. Create FareBin using pd.qcut() (four quantile-based bins)
df['FareBin'] = pd.qcut(df['Fare'], 4, labels=['Low', 'Mid', 'High', 'VeryHigh'])
print("\n🔹 FareBin column:")
print(df[['Fare', 'FareBin']].head())

# 7. Normalize Fare (Z-score)
df['Fare_Normalized'] = (df['Fare'] - df['Fare'].mean()) / df['Fare'].std()
print("\n🔹 Fare_Normalized column:")
print(df[['Fare', 'Fare_Normalized']].head())

# 8. Encode Sex as binary
df['Sex_Code'] = df['Sex'].map({'male': 0, 'female': 1})
print("\n🔹 Sex_Code column:")
print(df[['Sex', 'Sex_Code']].head())

# 9. One-hot encode Embarked
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
# Remove dummy columns left behind by an earlier run of this cell; without
# this, every re-run would append another copy of each Embarked_* column.
df = df.drop(columns=embarked_dummies.columns, errors='ignore')
df = pd.concat([df, embarked_dummies], axis=1)
print("\n🔹 One-hot encoded Embarked:")
print(embarked_dummies.head())

# 10. Create FarePerPerson = Fare / FamilySize
# FamilySize is SibSp + Parch + 1, so the +1 guards against a zero divisor
# as long as SibSp and Parch are non-negative.
df['FarePerPerson'] = df['Fare'] / df['FamilySize']
print("\n🔹 FarePerPerson column:")
print(df[['Fare', 'FamilySize', 'FarePerPerson']].head())

# 11. Extract LastName and show repeated ones
# The surname is everything before the first comma, trimmed of surrounding
# whitespace. The earlier letters-only pattern stopped at the first
# apostrophe, hyphen or space, so "O'Brien" became "O" and multi-word
# surnames were cut to their first word, merging unrelated families.
df['LastName'] = df['Name'].str.extract(r'^\s*([^,]+?)\s*,', expand=False)
repeated_names = df['LastName'].value_counts()
print("\n🔹 Surnames with more than one occurrence:")
print(repeated_names[repeated_names > 1])


🔹 FamilySize column:
   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1

🔹 IsAlone column:
   FamilySize  IsAlone
0           2        0
1           2        0
2           1        1
3           2        0
4           1        1

🔹 Title extracted from Name:
                                                Name Title
0                            Braund, Mr. Owen Harris    Mr
1  Cumings, Mrs. John Bradley (Florence Briggs Th...   Mrs
2                             Heikkinen, Miss. Laina  Miss
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)   Mrs
4                           Allen, Mr. William Henry    Mr

🔹 Cleaned Title values:
Title
Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: count, dtype: int64

🔹 AgeBin column:
    Age AgeBin
0  22.0  Adult
1  38.0  Adult
2  26.0  Adult
3  35.0  Adult
4  35.0  Adult

🔹 FareBin column:
      Fare   Far

In [None]:

# Write the current contents of `df` to a CSV file; index=False leaves the
# row index out of the file.
cleaned_csv_name = 'titanic_cleaned.csv'
df.to_csv(cleaned_csv_name, index=False)

# Reached only if to_csv() did not raise.
print(f"✅ File '{cleaned_csv_name}' has been saved successfully.")


✅ File 'titanic_cleaned.csv' has been saved successfully.


In [None]:
# Colab-only module (not available outside Google Colab); imported again here,
# presumably so this cell can run without the upload cell having been executed.
from google.colab import files
# Hand the named file to the browser for download. The name matches the file
# written by the df.to_csv(...) call in the preceding cell.
files.download('titanic_cleaned.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>