In [1]:
import pandas as pd

# Pandas data structures

In [7]:
# Series
# A one-dimensional labeled array; a pandas Series can hold any data type.
# range(10) is passed directly: wrapping it in a list comprehension adds nothing.
series = pd.Series(range(10))
series

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [8]:
# dtype of the values held by the Series
series.dtype

dtype('int64')

In [9]:
# Print a concise summary: index range, non-null count, dtype and memory usage
series.info()

<class 'pandas.core.series.Series'>
RangeIndex: 10 entries, 0 to 9
Series name: None
Non-Null Count  Dtype
--------------  -----
10 non-null     int64
dtypes: int64(1)
memory usage: 208.0 bytes


In [10]:
# Element-wise boolean mask: True wherever a value is missing
series.isnull()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [11]:
# isna() produces the same missing-value mask as isnull()
series.isna()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [12]:
# Slice by position: elements 1 through 3 (the stop position 4 is excluded)
series[1:4]

1    1
2    2
3    3
dtype: int64

In [23]:
# The underlying data as a NumPy array
series.values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [24]:
# astype() returns a new Series, so the name is rebound to the string-typed copy
series = series.astype(str)

In [25]:
# After the conversion above the array holds strings instead of integers
series.values

array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype=object)

In [29]:
# DataFrame
# A two-dimensional labeled table: every dict key becomes a column and
# every list supplies that column's values, one per row.
df = pd.DataFrame(
    data={
        'Name': ['Alice', 'Bob'],
        'Age': [21, 16],
    }
)
df

Unnamed: 0,Name,Age
0,Alice,21
1,Bob,16


In [30]:
# Print a per-column summary: non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    2 non-null      object
 1   Age     2 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 160.0+ bytes


In [31]:
# First row only
df.head(1)

Unnamed: 0,Name,Age
0,Alice,21


In [32]:

# Last row only
df.tail(1)

Unnamed: 0,Name,Age
1,Bob,16


In [33]:
# Summary statistics; by default only the numeric columns (here: Age) are described
df.describe()

Unnamed: 0,Age
count,2.0
mean,18.5
std,3.535534
min,16.0
25%,17.25
50%,18.5
75%,19.75
max,21.0


In [34]:
# For a text (object) column describe() reports count, unique, top and freq
df['Name'].describe()

count         2
unique        2
top       Alice
freq          1
Name: Name, dtype: object

In [35]:
# For a numeric column describe() reports count, mean, std, min, quartiles and max
df['Age'].describe()

count     2.000000
mean     18.500000
std       3.535534
min      16.000000
25%      17.250000
50%      18.500000
75%      19.750000
max      21.000000
Name: Age, dtype: float64

In [37]:
# Get the shape as a (number of rows, number of columns) tuple
df.shape

(2, 2)

In [38]:
# Column labels of the frame
df.columns

Index(['Name', 'Age'], dtype='object')

In [39]:
# dtype of each column
df.dtypes

Name    object
Age      int64
dtype: object

In [40]:
# Returns a float copy of the Age column; the result is not assigned back,
# so df itself keeps its original dtype
df['Age'].astype(float)

0    21.0
1    16.0
Name: Age, dtype: float64

# Read the dataset to analyse

In [41]:
# Using the Titanic training data from Kaggle: https://www.kaggle.com/c/titanic/data
# The path is relative, so train.csv is presumably expected under data/titanic/ next to
# the notebook — confirm against your working directory.
# Note: this rebinds `df`, replacing the two-row example frame built earlier.
df = pd.read_csv('data/titanic/train.csv')

# Take a look at the data

In [42]:
# First five rows (head() defaults to n=5)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Describe the data

In [43]:
# Summary statistics for the numeric columns; a count lower than the number of rows
# signals missing values in that column
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [44]:
# Name is a text column, so describe() reports count, unique, top and freq
df['Name'].describe()

count                     891
unique                    891
top       Dooley, Mr. Patrick
freq                        1
Name: Name, dtype: object

In [45]:
# Distinct Age values in order of first appearance; the missing value (NaN) is included
df['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [48]:
# Label-based selection: the row whose index label is 0, returned as a Series
df.loc[0]

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                               22.0
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [52]:
# Position-based (integer) selection
df.iloc[0, 0]  # value at row position 0, column position 0

np.int64(1)

In [59]:
# Number of rows per distinct Sex value, most frequent first
df['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

# Handling Missing Data

In [63]:
# Number of missing values in each column
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [64]:
# Rows that have a missing value in at least one column
df[df.isnull().any(axis=1)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [67]:
# Drop all rows with any missing value.
# dropna() returns a new DataFrame; the result is not assigned, so df is left unchanged.
df.dropna()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [68]:
# Fill all missing values with 0.
# fillna() returns a new DataFrame; the result is not assigned, so df is left unchanged.
# Note that text columns such as Cabin also receive the integer 0.
df.fillna(0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,0,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,0,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0.0,1,2,W./C. 6607,23.4500,0,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [71]:
# Impute per column: missing Age -> mean of the observed ages, missing Cabin -> 0.
# Embarked is not listed in the mapping, so its missing values remain NaN.
# NOTE(review): Cabin is a text column, so filling it with the integer 0 leaves mixed
# str/int values; a string placeholder may be preferable — confirm nothing downstream
# relies on the 0.
df = df.fillna({'Age': df['Age'].mean(), 'Cabin': 0})

In [72]:
# Re-check the missing-value counts after the fill
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       2
dtype: int64

# Analyse the Titanic dataset

In [73]:
# Summary statistics again, now computed on the frame with imputed Age values
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [86]:
# The stats above tell us that the average age in the given dataset is about 29.7.
# Let's see what buckets we can put the ages in and check the counts.
# NOTE: ages of 20 and below are not covered by any bucket here. Rows whose Age was
# filled with the mean earlier all fall into the 20-30 bucket.
def count_ages_between(frame, low, high):
    """Return the number of rows in `frame` whose 'Age' satisfies low < Age <= high."""
    ages = frame['Age']
    return frame[(ages > low) & (ages <= high)].shape[0]


ages_20_30 = count_ages_between(df, 20, 30)
ages_30_40 = count_ages_between(df, 30, 40)
ages_40_50 = count_ages_between(df, 40, 50)
ages_50_60 = count_ages_between(df, 50, 60)
ages_60_70 = count_ages_between(df, 60, 70)
ages_70_80 = count_ages_between(df, 70, 80)

print(f'''Ages and counts:
      20-30: {ages_20_30}
      30-40: {ages_30_40}
      40-50: {ages_40_50}
      50-60: {ages_50_60}
      60-70: {ages_60_70}
      70-80: {ages_70_80}
     ''')

Ages and counts:
      20-30: 407
      30-40: 155
      40-50: 86
      50-60: 42
      60-70: 17
      70-80: 5
     


In [88]:
# A more concise way to achieve the result above: bin the ages once with pd.cut().
# Define the bin edges; each label is derived from a pair of neighbouring edges so the
# two lists cannot drift apart.
bins = [20, 30, 40, 50, 60, 70, 80]
labels = [f'{low}-{high}' for low, high in zip(bins[:-1], bins[1:])]

# Create the age group column. right=True makes every bin right-inclusive, e.g. (20, 30];
# ages outside the edges (20 and below) get NaN as their group.
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)

# Count occurrences per age group, ordered by bin rather than by frequency
age_counts = df['AgeGroup'].value_counts().sort_index()

print("Ages and counts:")
print(age_counts)

# we will learn pd.cut() in detail in the next tutorial

Ages and counts:
AgeGroup
20-30    407
30-40    155
40-50     86
50-60     42
60-70     17
70-80      5
Name: count, dtype: int64


In [90]:
# Show the first rows again to confirm the new AgeGroup column
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S,20-30
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,30-40
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S,20-30
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,30-40
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S,30-40
