In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [9]:
df = pd.DataFrame({
    "name": ["a", "b", "c"],
    "age": [10, 20, 30],
    "gender": ["male", "female", "male"],
    "city": ["a", "b", "c"]
})
df.head()


Unnamed: 0,name,age,gender,city
0,a,10,male,a
1,b,20,female,b
2,c,30,male,c


In [10]:
df['age']

0    10
1    20
2    30
Name: age, dtype: int64

In [11]:
ages = pd.Series([1, 2, 3], name= "age")
ages


0    1
1    2
2    3
Name: age, dtype: int64

In [12]:
df['age'].max()

np.int64(30)

In [13]:
ages.max()

np.int64(3)

In [14]:
df.describe()

Unnamed: 0,age
count,3.0
mean,20.0
std,10.0
min,10.0
25%,15.0
50%,20.0
75%,25.0
max,30.0


In [16]:
titanic = pd.read_csv("titanic_sample_data.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1,Passenger_1,male,20,0,0,TK_001,10.5,C123,S
1,2,1,2,Passenger_2,male,25,1,1,TK_002,20.0,,C
2,3,0,3,Passenger_3,male,30,2,0,TK_003,30.5,B456,Q
3,4,1,1,Passenger_4,male,35,0,1,TK_004,40.0,,S
4,5,0,2,Passenger_5,male,40,1,0,TK_005,50.5,A789,C


In [18]:
titanic.dtypes


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age              int64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [20]:
titanic.to_excel('titanic_sample_data.xlsx', sheet_name='passengers', index=False)

In [21]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  51 non-null     int64  
 1   Survived     51 non-null     int64  
 2   Pclass       51 non-null     int64  
 3   Name         51 non-null     object 
 4   Sex          51 non-null     object 
 5   Age          51 non-null     int64  
 6   SibSp        51 non-null     int64  
 7   Parch        51 non-null     int64  
 8   Ticket       51 non-null     object 
 9   Fare         51 non-null     float64
 10  Cabin        31 non-null     object 
 11  Embarked     51 non-null     object 
dtypes: float64(1), int64(6), object(5)
memory usage: 4.9+ KB


In [26]:
ages = titanic['Age']
ages.head()

0    20
1    25
2    30
3    35
4    40
Name: Age, dtype: int64

In [27]:
type(titanic['Age'])

pandas.core.series.Series

In [29]:
titanic['Age'].shape

(51,)

In [30]:
age_sex = titanic[['Age', 'Sex']]
age_sex.head()

Unnamed: 0,Age,Sex
0,20,male
1,25,male
2,30,male
3,35,male
4,40,male


In [31]:
age_sex.shape

(51, 2)

In [32]:
above_35 = titanic[titanic['Age'] > 35]
above_35.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,5,0,2,Passenger_5,male,40,1,0,TK_005,50.5,A789,C
5,6,1,3,Passenger_6,male,45,2,1,TK_006,60.0,C123,Q
6,7,0,1,Passenger_7,male,50,0,0,TK_007,70.5,,S
7,8,1,2,Passenger_8,male,55,1,1,TK_008,80.0,B456,C
8,9,0,3,Passenger_9,male,60,2,0,TK_009,90.5,,Q


# Filter the titanic dataset to include only passengers from 2nd and 3rd class
# This creates a subset containing passengers where Pclass is either 2 or 3

In [33]:
class_23 = titanic[titanic['Pclass'].isin([2, 3])]
class_23.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,2,Passenger_2,male,25,1,1,TK_002,20.0,,C
2,3,0,3,Passenger_3,male,30,2,0,TK_003,30.5,B456,Q
4,5,0,2,Passenger_5,male,40,1,0,TK_005,50.5,A789,C
5,6,1,3,Passenger_6,male,45,2,1,TK_006,60.0,C123,Q
7,8,1,2,Passenger_8,male,55,1,1,TK_008,80.0,B456,C


In [None]:
## Filtering Data with Non-Null Age Values

 In this section, we'll filter the Titanic dataset to include only passengers who have valid age information. This is important for data analysis as missing age values can affect statistical calculations and visualizations.

### Steps:
1. Filter out rows where the 'Age' column contains null/NaN values
2. Display the first few rows to verify the filtering worked correctly

In [35]:
age_not_null = titanic[titanic['Age'].notna()]
age_not_null


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1,Passenger_1,male,20,0,0,TK_001,10.5,C123,S
1,2,1,2,Passenger_2,male,25,1,1,TK_002,20.0,,C
2,3,0,3,Passenger_3,male,30,2,0,TK_003,30.5,B456,Q
3,4,1,1,Passenger_4,male,35,0,1,TK_004,40.0,,S
4,5,0,2,Passenger_5,male,40,1,0,TK_005,50.5,A789,C
5,6,1,3,Passenger_6,male,45,2,1,TK_006,60.0,C123,Q
6,7,0,1,Passenger_7,male,50,0,0,TK_007,70.5,,S
7,8,1,2,Passenger_8,male,55,1,1,TK_008,80.0,B456,C
8,9,0,3,Passenger_9,male,60,2,0,TK_009,90.5,,Q
9,10,1,1,Passenger_10,male,65,0,1,TK_010,100.0,A789,S


In [39]:
adult_name = titanic.loc[titanic['Age'] > 35, 'Name']
adult_name.head()

4    Passenger_5
5    Passenger_6
6    Passenger_7
7    Passenger_8
8    Passenger_9
Name: Name, dtype: object

# Select a specific subset of the titanic dataset using integer-based indexing
# This extracts rows 9 through 24 (inclusive) and columns 2 through 4 (inclusive)
# iloc[row_start:row_end, column_start:column_end] - uses zero-based indexing
titanic.iloc[9:25, 2:5]

In [40]:
titanic.iloc[9:25, 2:5]

Unnamed: 0,Pclass,Name,Sex
9,1,Passenger_10,male
10,2,Passenger_11,male
11,3,Passenger_12,male
12,1,Passenger_13,male
13,2,Passenger_14,male
14,3,Passenger_15,male
15,1,Passenger_16,male
16,2,Passenger_17,male
17,3,Passenger_18,male
18,1,Passenger_19,male


In [None]:
titanic = pd.read_csv("titanic_sample_data.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1,Passenger_1,male,20,0,0,TK_001,10.5,C123,S
1,2,1,2,Passenger_2,male,25,1,1,TK_002,20.0,,C
2,3,0,3,Passenger_3,male,30,2,0,TK_003,30.5,B456,Q
3,4,1,1,Passenger_4,male,35,0,1,TK_004,40.0,,S
4,5,0,2,Passenger_5,male,40,1,0,TK_005,50.5,A789,C


In [42]:
titanic.iloc[0, 3] = "anonymous"
anon = titanic.iloc[0, 3]
anon

'anonymous'

In [43]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1,anonymous,male,20,0,0,TK_001,10.5,C123,S
1,2,1,2,Passenger_2,male,25,1,1,TK_002,20.0,,C
2,3,0,3,Passenger_3,male,30,2,0,TK_003,30.5,B456,Q
3,4,1,1,Passenger_4,male,35,0,1,TK_004,40.0,,S
4,5,0,2,Passenger_5,male,40,1,0,TK_005,50.5,A789,C


In [44]:
titanic['Age'].mean()

np.float64(42.09803921568628)

In [46]:
titanic[['Age', 'Fare']].median()

Age     40.0
Fare    50.5
dtype: float64

In [47]:
titanic[['Age', 'Fare']].describe()

Unnamed: 0,Age,Fare
count,51.0,51.0
mean,42.098039,54.460784
std,14.645484,29.228897
min,20.0,10.5
25%,30.0,30.5
50%,40.0,50.5
75%,55.0,80.0
max,65.0,100.0


In [49]:
titanic.agg({
    'Age': ['min', 'max', 'median', 'skew'],
    'Fare': ['min', 'max', 'median', 'mean']
})

Unnamed: 0,Age,Fare
min,20.0,10.5
max,65.0,100.0
median,40.0,50.5
skew,0.028632,
mean,,54.460784


# Calculate the average age by gender in the titanic dataset
# Step 1: Select only 'Sex' and 'Age' columns from the titanic DataFrame
# Step 2: Group the data by 'Sex' (male/female)
# Step 3: Calculate the mean (average) of all numeric columns for each group
titanic[['Sex', 'Age']].groupby('Sex').mean()

In [50]:
titanic[['Sex', 'Age']].groupby('Sex').mean()

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,44.115385
male,40.0


In [51]:
titanic[['Sex', 'Age']].groupby('Sex').max()

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,65
male,65


In [52]:
titanic[['Sex', 'Age']].groupby('Sex').first()

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,45
male,20


In [53]:
titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1,anonymous,male,20,0,0,TK_001,10.5,C123,S
1,2,1,2,Passenger_2,male,25,1,1,TK_002,20.0,,C


In [54]:
titanic.groupby('Sex')['Age'].mean()

Sex
female    44.115385
male      40.000000
Name: Age, dtype: float64

In [55]:
titanic.groupby(['Sex','Pclass'])['Fare'].mean()

Sex     Pclass
female  1         60.250000
        2         58.000000
        3         57.444444
male    1         52.500000
        2         50.250000
        3         47.750000
Name: Fare, dtype: float64

In [56]:
titanic['Pclass'].value_counts()

Pclass
1    17
2    17
3    17
Name: count, dtype: int64

In [57]:
titanic.groupby('Pclass')['Pclass'].count()

Pclass
1    17
2    17
3    17
Name: Pclass, dtype: int64

In [58]:
titanic.sort_values(by='Age', ascending=False).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
9,10,1,1,Passenger_10,male,65,0,1,TK_010,100.0,A789,S
29,30,1,3,Passenger_30,female,65,2,1,TK_030,100.0,A789,Q
19,20,1,2,Passenger_20,male,65,1,1,TK_020,100.0,A789,C
39,40,1,1,Passenger_40,female,65,0,1,TK_040,100.0,A789,S
49,50,1,2,Passenger_50,female,65,1,1,TK_050,100.0,A789,C


In [60]:
titanic.sort_values(by=['Pclass', 'Age'], ascending=False).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
29,30,1,3,Passenger_30,female,65,2,1,TK_030,100.0,A789,Q
8,9,0,3,Passenger_9,male,60,2,0,TK_009,90.5,,Q
38,39,0,3,Passenger_39,female,60,2,0,TK_039,90.5,,Q
17,18,1,3,Passenger_18,male,55,2,1,TK_018,80.0,B456,Q
47,48,1,3,Passenger_48,female,55,2,1,TK_048,80.0,B456,Q


In [62]:
titanic['Name'].str.lower()

0        anonymous
1      passenger_2
2      passenger_3
3      passenger_4
4      passenger_5
5      passenger_6
6      passenger_7
7      passenger_8
8      passenger_9
9     passenger_10
10    passenger_11
11    passenger_12
12    passenger_13
13    passenger_14
14    passenger_15
15    passenger_16
16    passenger_17
17    passenger_18
18    passenger_19
19    passenger_20
20    passenger_21
21    passenger_22
22    passenger_23
23    passenger_24
24    passenger_25
25    passenger_26
26    passenger_27
27    passenger_28
28    passenger_29
29    passenger_30
30    passenger_31
31    passenger_32
32    passenger_33
33    passenger_34
34    passenger_35
35    passenger_36
36    passenger_37
37    passenger_38
38    passenger_39
39    passenger_40
40    passenger_41
41    passenger_42
42    passenger_43
43    passenger_44
44    passenger_45
45    passenger_46
46    passenger_47
47    passenger_48
48    passenger_49
49    passenger_50
50    passenger_51
Name: Name, dtype: object

In [68]:
titanic["Name"].str.split('_')

0         [anonymous]
1      [Passenger, 2]
2      [Passenger, 3]
3      [Passenger, 4]
4      [Passenger, 5]
5      [Passenger, 6]
6      [Passenger, 7]
7      [Passenger, 8]
8      [Passenger, 9]
9     [Passenger, 10]
10    [Passenger, 11]
11    [Passenger, 12]
12    [Passenger, 13]
13    [Passenger, 14]
14    [Passenger, 15]
15    [Passenger, 16]
16    [Passenger, 17]
17    [Passenger, 18]
18    [Passenger, 19]
19    [Passenger, 20]
20    [Passenger, 21]
21    [Passenger, 22]
22    [Passenger, 23]
23    [Passenger, 24]
24    [Passenger, 25]
25    [Passenger, 26]
26    [Passenger, 27]
27    [Passenger, 28]
28    [Passenger, 29]
29    [Passenger, 30]
30    [Passenger, 31]
31    [Passenger, 32]
32    [Passenger, 33]
33    [Passenger, 34]
34    [Passenger, 35]
35    [Passenger, 36]
36    [Passenger, 37]
37    [Passenger, 38]
38    [Passenger, 39]
39    [Passenger, 40]
40    [Passenger, 41]
41    [Passenger, 42]
42    [Passenger, 43]
43    [Passenger, 44]
44    [Passenger, 45]
45    [Pas

In [65]:
titanic['Surname'] = titanic['Name'].str.rsplit('_').str.get(0)
titanic['Surname']

0     anonymous
1     Passenger
2     Passenger
3     Passenger
4     Passenger
5     Passenger
6     Passenger
7     Passenger
8     Passenger
9     Passenger
10    Passenger
11    Passenger
12    Passenger
13    Passenger
14    Passenger
15    Passenger
16    Passenger
17    Passenger
18    Passenger
19    Passenger
20    Passenger
21    Passenger
22    Passenger
23    Passenger
24    Passenger
25    Passenger
26    Passenger
27    Passenger
28    Passenger
29    Passenger
30    Passenger
31    Passenger
32    Passenger
33    Passenger
34    Passenger
35    Passenger
36    Passenger
37    Passenger
38    Passenger
39    Passenger
40    Passenger
41    Passenger
42    Passenger
43    Passenger
44    Passenger
45    Passenger
46    Passenger
47    Passenger
48    Passenger
49    Passenger
50    Passenger
Name: Surname, dtype: object

In [67]:
titanic['Name_main']= titanic['Name'].str.split('_').str.get(1)
titanic['Name_main']

0     NaN
1       2
2       3
3       4
4       5
5       6
6       7
7       8
8       9
9      10
10     11
11     12
12     13
13     14
14     15
15     16
16     17
17     18
18     19
19     20
20     21
21     22
22     23
23     24
24     25
25     26
26     27
27     28
28     29
29     30
30     31
31     32
32     33
33     34
34     35
35     36
36     37
37     38
38     39
39     40
40     41
41     42
42     43
43     44
44     45
45     46
46     47
47     48
48     49
49     50
50     51
Name: Name_main, dtype: object

In [69]:
titanic['Name'].str.split('_')

0         [anonymous]
1      [Passenger, 2]
2      [Passenger, 3]
3      [Passenger, 4]
4      [Passenger, 5]
5      [Passenger, 6]
6      [Passenger, 7]
7      [Passenger, 8]
8      [Passenger, 9]
9     [Passenger, 10]
10    [Passenger, 11]
11    [Passenger, 12]
12    [Passenger, 13]
13    [Passenger, 14]
14    [Passenger, 15]
15    [Passenger, 16]
16    [Passenger, 17]
17    [Passenger, 18]
18    [Passenger, 19]
19    [Passenger, 20]
20    [Passenger, 21]
21    [Passenger, 22]
22    [Passenger, 23]
23    [Passenger, 24]
24    [Passenger, 25]
25    [Passenger, 26]
26    [Passenger, 27]
27    [Passenger, 28]
28    [Passenger, 29]
29    [Passenger, 30]
30    [Passenger, 31]
31    [Passenger, 32]
32    [Passenger, 33]
33    [Passenger, 34]
34    [Passenger, 35]
35    [Passenger, 36]
36    [Passenger, 37]
37    [Passenger, 38]
38    [Passenger, 39]
39    [Passenger, 40]
40    [Passenger, 41]
41    [Passenger, 42]
42    [Passenger, 43]
43    [Passenger, 44]
44    [Passenger, 45]
45    [Pas

In [70]:
titanic['Real_name']= titanic['Name'].str.split('_').str.get(0)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname,Name_main,Real_name
0,1,0,1,anonymous,male,20,0,0,TK_001,10.5,C123,S,anonymous,,anonymous
1,2,1,2,Passenger_2,male,25,1,1,TK_002,20.0,,C,Passenger,2.0,Passenger
2,3,0,3,Passenger_3,male,30,2,0,TK_003,30.5,B456,Q,Passenger,3.0,Passenger
3,4,1,1,Passenger_4,male,35,0,1,TK_004,40.0,,S,Passenger,4.0,Passenger
4,5,0,2,Passenger_5,male,40,1,0,TK_005,50.5,A789,C,Passenger,5.0,Passenger


In [71]:
titanic['Name'].str.contains('Mr')

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
Name: Name, dtype: bool

In [74]:
titanic['Age'] == 30


0     False
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22     True
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32     True
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42     True
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
Name: Age, dtype: bool

In [75]:
titanic['Name'].str.len()

0      9
1     11
2     11
3     11
4     11
5     11
6     11
7     11
8     11
9     12
10    12
11    12
12    12
13    12
14    12
15    12
16    12
17    12
18    12
19    12
20    12
21    12
22    12
23    12
24    12
25    12
26    12
27    12
28    12
29    12
30    12
31    12
32    12
33    12
34    12
35    12
36    12
37    12
38    12
39    12
40    12
41    12
42    12
43    12
44    12
45    12
46    12
47    12
48    12
49    12
50    12
Name: Name, dtype: int64