In [1]:
import pandas as pd

In [2]:
# Load the Titanic training data from the notebook's working directory.
titanic_df = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Overview of the frame: column names, non-null counts and dtypes.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Specific Information

In [4]:
# Narrow the frame to the two continuous columns of interest.
titanic_df.loc[:, ["Age", "Fare"]]

Unnamed: 0,Age,Fare
0,22.0,7.2500
1,38.0,71.2833
2,26.0,7.9250
3,35.0,53.1000
4,35.0,8.0500
...,...,...
886,27.0,13.0000
887,19.0,30.0000
888,,23.4500
889,26.0,30.0000


In [5]:
# Column-wise averages of Age and Fare (missing values are skipped by default).
titanic_df.loc[:, ["Age", "Fare"]].mean(axis=0)

Age     29.699118
Fare    32.204208
dtype: float64

In [6]:
# Count, mean, spread and quartiles for Age and Fare.
titanic_df.loc[:, ["Age", "Fare"]].describe()

Unnamed: 0,Age,Fare
count,714.0,891.0
mean,29.699118,32.204208
std,14.526497,49.693429
min,0.42,0.0
25%,20.125,7.9104
50%,28.0,14.4542
75%,38.0,31.0
max,80.0,512.3292


### Summary of Age and Fare (Titanic Dataset)

The **Age** distribution is roughly symmetric, centered around **28–30 years** (median 28, mean 29.7), with moderate spread. In contrast, the **Fare** distribution is **highly right-skewed** (mean $32.20 vs. median $14.45), with significant outliers — some passengers paid over **$500**, far above the average.


### Age and Fare Summary (Titanic Dataset)

- **Age:**
  - Average age is approximately **29.7 years**.
  - Middle 50% of passengers are aged between **20.1** and **38 years**.
  - Youngest passenger is **0.42 years**, and oldest is **80 years**.
  - Standard deviation is **14.5**, indicating moderate spread.
  - **714** non-null age entries (some missing values).

- **Fare:**
  - Average fare is approximately **$32.20**.
  - Middle 50% of fares range between **$7.91** and **$31.00**.
  - Minimum fare is **$0.00**; maximum is a high **$512.33**.
  - High **standard deviation of 49.7**, indicating wide variation.
  - Distribution is **highly skewed**, with a few passengers paying extremely high fares.


In [7]:
# How many passengers died (0) versus survived (1).
titanic_df["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [8]:
# Passenger head-count per sex.
titanic_df.value_counts("Sex")

Sex
male      577
female    314
Name: count, dtype: int64

In [9]:
# Head-count for every (Sex, Survived) combination, largest group first.
titanic_df[["Sex", "Survived"]].value_counts()

Sex     Survived
male    0           468
female  1           233
male    1           109
female  0            81
Name: count, dtype: int64

In [10]:
# Share of ALL passengers falling in each (Sex, Survived) combination,
# listed from the smallest share to the largest.
titanic_df[["Sex", "Survived"]].value_counts(normalize=True, ascending=True)

Sex     Survived
female  0           0.090909
male    1           0.122334
female  1           0.261504
male    0           0.525253
Name: proportion, dtype: float64

Interestingly, men who died make up about 52.5% of all passengers — roughly double the share of women who survived (26.2%).

In [11]:
# Survival proportions among female passengers only.
titanic_df.loc[titanic_df["Sex"].eq("female"), "Survived"].value_counts(normalize=True)

Survived
1    0.742038
0    0.257962
Name: proportion, dtype: float64

"The data shows that approximately three-quarters (74.2%) of the female passengers survived the accident, while around 25.8% did not."



In [12]:
# Survival counts for passengers whose Embarked value is "S".
titanic_df.loc[titanic_df["Embarked"].eq("S"), "Survived"].value_counts()

Survived
0    427
1    217
Name: count, dtype: int64

In [13]:
# Same split as proportions of the passengers whose Embarked value is "S".
titanic_df.loc[titanic_df["Embarked"].eq("S"), "Survived"].value_counts(normalize=True)

Survived
0    0.663043
1    0.336957
Name: proportion, dtype: float64

In [14]:
# Among passengers with Embarked == "S": head-count per (Survived, Sex) pair.
titanic_df.loc[titanic_df["Embarked"].eq("S"), ["Survived", "Sex"]].value_counts()

Survived  Sex   
0         male      364
1         female    140
          male       77
0         female     63
Name: count, dtype: int64

In [16]:
# unique() lists the distinct values in order of first appearance and keeps NaN;
# nunique() returns only how many distinct values there are.
titanic_df.loc[:, "Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [17]:
# Number of distinct embarkation codes; NaN is not counted (dropna defaults to True).
titanic_df["Embarked"].nunique(dropna=True)

3

In [20]:
# Total number of missing values in each column.
titanic_df.isnull().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [26]:
# Columns used by the per-sex grouping.
titanic_df.loc[:, ["Sex", "Fare", "Age", "Survived"]]

Unnamed: 0,Sex,Fare,Age,Survived
0,male,7.2500,22.0,0
1,female,71.2833,38.0,1
2,female,7.9250,26.0,1
3,female,53.1000,35.0,1
4,male,8.0500,35.0,0
...,...,...,...,...
886,male,13.0000,27.0,0
887,female,30.0000,19.0,1
888,female,23.4500,,0
889,male,30.0000,26.0,1


In [24]:
# Group the selected columns by sex; aggregations on this object run once per sex.
group_by_gender = titanic_df.loc[:, ["Sex", "Fare", "Age", "Survived"]].groupby("Sex")

In [25]:
# Per-sex averages; the mean of the 0/1 Survived column is the survival rate.
group_by_gender.agg("mean")

Unnamed: 0_level_0,Fare,Age,Survived
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,44.479818,27.915709,0.742038
male,25.523893,30.726645,0.188908


In [27]:
# Survived / not-survived head-count within each sex.
group_by_gender.value_counts(subset=["Survived"])

Sex     Survived
female  1           233
        0            81
male    0           468
        1           109
Name: count, dtype: int64

In [37]:
# Group Fare and Age by embarkation port and survival outcome.
group_by_embarked = titanic_df.loc[:, ["Embarked", "Fare", "Age", "Survived"]].groupby(
    ["Embarked", "Survived"]
)

In [38]:
# Average Fare and Age for each (Embarked, Survived) group.
group_by_embarked.agg("mean")

Unnamed: 0_level_0,Unnamed: 1_level_0,Fare,Age
Embarked,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1
C,0,35.443335,33.666667
C,1,79.720926,28.973671
Q,0,13.335904,30.325
Q,1,13.182227,22.5
S,0,20.743987,30.203966
S,1,39.547081,28.113184


In [31]:
# Median Fare and Age for each (Embarked, Survived) group.
group_by_embarked.agg("median")

Unnamed: 0_level_0,Unnamed: 1_level_0,Fare,Age
Embarked,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1
C,0,14.4583,30.0
C,1,56.9292,27.0
Q,0,7.75,30.75
Q,1,7.80835,20.5
S,0,10.5,28.0
S,1,26.0,28.0


In [32]:
# Highest Fare and Age observed in each (Embarked, Survived) group.
group_by_embarked.agg("max")

Unnamed: 0_level_0,Unnamed: 1_level_0,Fare,Age
Embarked,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1
C,0,247.5208,71.0
C,1,512.3292,60.0
Q,0,90.0,70.5
Q,1,90.0,33.0
S,0,263.0,74.0
S,1,263.0,80.0


In [40]:
# Rebinds group_by_embarked: this grouping covers every column of titanic_df,
# not just a Fare/Age selection.
group_by_embarked = titanic_df.groupby(["Embarked", "Survived"])

In [41]:
# GroupBy.mean() does not accept a list of columns: its first positional
# parameter is `numeric_only`, so the list previously passed here was only
# interpreted as a truthy flag. Spell the flag out explicitly; the result is
# unchanged (the mean of every numeric column for each group).
# To restrict the columns instead, select them before aggregating, e.g.
# group_by_embarked[["Fare", "Age"]].mean().
group_by_embarked.mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Pclass,Age,SibSp,Parch,Fare
Embarked,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C,0,409.853333,2.2,33.666667,0.253333,0.253333,35.443335
C,1,473.989247,1.634409,28.973671,0.494624,0.451613,79.720926
Q,0,480.404255,2.93617,30.325,0.510638,0.276596,13.335904
Q,1,319.966667,2.866667,22.5,0.3,0.0,13.182227
S,0,449.868852,2.545667,30.203966,0.611241,0.348946,20.743987
S,1,448.857143,1.967742,28.113184,0.493088,0.539171,39.547081


In [42]:
## Correlation and covariance

In [43]:
# Covariance matrix of Age and Fare (diagonal entries are the variances).
titanic_df.loc[:, ["Age", "Fare"]].cov()

Unnamed: 0,Age,Fare
Age,211.019125,73.84903
Fare,73.84903,2469.436846


In [44]:
# Pearson correlation between Age and Fare ("pearson" is the default method).
titanic_df.loc[:, ["Age", "Fare"]].corr(method="pearson")

Unnamed: 0,Age,Fare
Age,1.0,0.096067
Fare,0.096067,1.0


In [49]:
# Draw 55 random rows; the fixed random_state makes the draw repeatable.
titanic_df.sample(n=55, random_state=99)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
247,248,1,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,,S
823,824,1,3,"Moor, Mrs. (Beila)",female,27.0,0,1,392096,12.475,E121,S
566,567,0,3,"Stoytcheff, Mr. Ilia",male,19.0,0,0,349205,7.8958,,S
666,667,0,2,"Butler, Mr. Reginald Fenton",male,25.0,0,0,234686,13.0,,S
199,200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24.0,0,0,248747,13.0,,S
480,481,0,3,"Goodwin, Master. Harold Victor",male,9.0,5,2,CA 2144,46.9,,S
86,87,0,3,"Ford, Mr. William Neal",male,16.0,1,3,W./C. 6608,34.375,,S
136,137,1,1,"Newsom, Miss. Helen Monypeny",female,19.0,0,2,11752,26.2833,D47,S
253,254,0,3,"Lobb, Mr. William Arthur",male,30.0,1,0,A/5. 3336,16.1,,S
809,810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33.0,1,0,113806,53.1,E8,S


In [47]:
# Keep a 55-row random sample (fixed random_state, so repeatable) for the
# correlation comparison on a subset.
sampled = titanic_df.sample(n=55, random_state=99)

In [48]:
# Age/Fare correlation computed on the 55-row sample only.
sampled.loc[:, ["Age", "Fare"]].corr()

Unnamed: 0,Age,Fare
Age,1.0,0.134789
Fare,0.134789,1.0


In [51]:
# Contingency table of Sex against Survived; margins=True appends the "All" totals.
pd.crosstab(index=titanic_df["Sex"], columns=titanic_df["Survived"], margins=True)

Survived,0,1,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,81,233,314
male,468,109,577
All,549,342,891
