### Table of Contents



#### 1. [The Basics](#content1)

#### 2. [Creating DataFrame](#content2)

#### 3. [Treating null values](#content3)

#### 4. [Modify/Add new column(s).](#content4)

#### 5. [Deleting columns](#content5)

#### 6. [Renaming columns](#content6)

#### 7.i. [Slicing DataFrame](#content7)

#### 7.ii. [Slicing using iloc and loc](#content8)

#### 8. [Adding a row](#content9)

#### 9. [Dropping row(s)](#content10)

#### 10. [Sorting](#content11)

#### 11. [Joins](#content12)

#### 12. [Groupby](#content13)

In [178]:
# import libraries
import pandas as pd
import numpy as np

In [179]:
# importing the data
df = pd.read_csv("titanic.csv")


<a id="content1"></a>
## 1. The Basics

In [180]:
# see the first 5 rows
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [181]:
# last 5 rows
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [182]:
# shape of the dataframe
# (n_samples, x features)
df.shape

(891, 12)

In [183]:
# list all the columns
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [184]:
# rows index
df.index

RangeIndex(start=0, stop=891, step=1)

In [185]:
# value with their counts in a particular columns
df.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [186]:
# General description of the dataset
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


<a id='content2'></a>
## 2. Creating DataFrame

In [187]:
# create an empty data frame
df_empty = pd.DataFrame()
df_empty.head()

In [188]:
# From dict
student_info = {'Name':['Nishant','Sandeep','Ranjeev','Baaje','Rahmat'],
                'Age':[22,23,24,45,25],
                'Subject':['Data science','AI','CITS','civil','CSE']}

df_student = pd.DataFrame(student_info).reset_index(drop=True)
df_student.head()

Unnamed: 0,Name,Age,Subject
0,Nishant,22,Data science
1,Sandeep,23,AI
2,Ranjeev,24,CITS
3,Baaje,45,civil
4,Rahmat,25,CSE


<a id='content3'></a>
## 3. Treating null values

In [189]:
# check if the dataframe has null value or not
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [190]:
## On a particular columns
df.Age.isna().sum()

177

Null value imputation

In [191]:
df["Age"].fillna(df["Age"].mean(), inplace=True)
df["Age"].isna().sum()

0

<a id="content4"></a>
## 4. Modify/Add new columns

In [192]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [193]:
## Male:0, female: 1

df["Sex"] = df["Sex"].map({"male":0,"female":1})
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [194]:
### Finding First Name and Last name from Name columns

df["last_name"] = df["Name"].apply(lambda x: x.split(',')[0])
df["first_name"] = df["Name"].apply(lambda x: ' '.join(x.split(',')[1:]))


In [195]:
# sets to 1 for men in 3rd class
df['Third&Men'] = df.apply(lambda row: int(row['Pclass']==3 and row['Sex']=='0'), axis=1)

In [196]:
def findAgeGroup(age):
    if age < 18:
        return 1
    elif age >=18 and age<=40:
        return 2
    elif age >=40 and age <60:
        return 3
    else:
        return 4
df['Age_group'] = df['Age'].apply(lambda x: findAgeGroup(x))


In [197]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,first_name,Third&Men,Age_group
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr. Owen Harris,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer),0,2
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss. Laina,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs. Jacques Heath (Lily May Peel),0,2
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,Allen,Mr. William Henry,0,2


<a id='content5'></a>
## 5. Deleting columns

In [198]:
df = df.drop(['PassengerId'], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,first_name,Third&Men,Age_group
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr. Owen Harris,0,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer),0,2
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss. Laina,0,2
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs. Jacques Heath (Lily May Peel),0,2
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,Allen,Mr. William Henry,0,2


<a id='content6'></a>
## 6.Renaming columns

In [199]:
df = df.rename({'Sex':'Gender','Name':'Full Name', 'last_name':'SurName','first_name':'Name'})

In [200]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,first_name,Third&Men,Age_group
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr. Owen Harris,0,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer),0,2
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss. Laina,0,2
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,Futrelle,Mrs. Jacques Heath (Lily May Peel),0,2
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,Allen,Mr. William Henry,0,2


<a id='content7'></a>
## 7. Slicing DataFrame

In [201]:
# all rows with pclass 3

df_third_class = df[df['Pclass']==3].reset_index(drop=True)


In [203]:
# Females with age > 60
df_aged = df[(df['Sex']=='1') & (df['Age'] > 60)].reset_index(drop=True)
df_aged

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,first_name,Third&Men,Age_group


In [204]:
# selecting some columns

df1 = df[['Age','Sex','Ticket']]

In [207]:
# select numerial columns only
numerics = ['int16','int32','int64','float16','float32','float64']
df_num = df.select_dtypes(numerics)
df_num.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Third&Men,Age_group
0,0,3,0,22.0,1,0,7.25,0,2
1,1,1,1,38.0,1,0,71.2833,0,2
2,1,3,1,26.0,0,0,7.925,0,2
3,1,1,1,35.0,1,0,53.1,0,2
4,0,3,0,35.0,0,0,8.05,0,2


In [210]:
# categorical
df_cat = df.select_dtypes('object')
df_cat.head()

Unnamed: 0,Name,Ticket,Cabin,Embarked,last_name,first_name
0,"Braund, Mr. Owen Harris",A/5 21171,,S,Braund,Mr. Owen Harris
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599,C85,C,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,"Heikkinen, Miss. Laina",STON/O2. 3101282,,S,Heikkinen,Miss. Laina
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803,C123,S,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,"Allen, Mr. William Henry",373450,,S,Allen,Mr. William Henry


<a id="content8"></a>
## 7.ii Slicing using iloc and loc

In [215]:
# first 100 rows and all cols

df_hund = df.iloc[:100,:]
df_hund.shape

(100, 15)

In [216]:
# first 250 rows with a subset of columns
df_sub = df.iloc[:250,1:8]

In [218]:
df_sub.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket
0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599
2,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803
4,3,"Allen, Mr. William Henry",0,35.0,0,0,373450


In [221]:
df_sub4 = df[(df["Sex"]) & (df["Age"]>50)]
df_sub4

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,last_name,first_name,Third&Men,Age_group
11,1,1,"Bonnell, Miss. Elizabeth",1,58.0,0,0,113783,26.55,C103,S,Bonnell,Miss. Elizabeth,0,3
15,1,2,"Hewlett, Mrs. (Mary D Kingcome)",1,55.0,0,0,248706,16.0,,S,Hewlett,Mrs. (Mary D Kingcome),0,3
195,1,1,"Lurette, Miss. Elise",1,58.0,0,0,PC 17569,146.5208,B80,C,Lurette,Miss. Elise,0,3
268,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",1,58.0,0,1,PC 17582,153.4625,C125,S,Graham,Mrs. William Thompson (Edith Junkins),0,3
275,1,1,"Andrews, Miss. Kornelia Theodosia",1,63.0,1,0,13502,77.9583,D7,S,Andrews,Miss. Kornelia Theodosia,0,4
366,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",1,60.0,1,0,110813,75.25,D37,C,Warren,Mrs. Frank Manley (Anna Sophia Atkinson),0,4
483,1,3,"Turkula, Mrs. (Hedwig)",1,63.0,0,0,4134,9.5875,,S,Turkula,Mrs. (Hedwig),0,4
496,1,1,"Eustis, Miss. Elizabeth Mussey",1,54.0,1,0,36947,78.2667,D20,C,Eustis,Miss. Elizabeth Mussey,0,3
513,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",1,54.0,1,0,PC 17603,59.4,,C,Rothschild,Mrs. Martin (Elizabeth L. Barrett),0,3
571,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",1,53.0,2,0,11769,51.4792,C101,S,Appleton,Mrs. Edward Dale (Charlotte Lamson),0,3
