**DATA FRAME**

A DataFrame is a two-dimensional labeled table made up of one or more Series (its columns) that share the same index.

In [10]:
import pandas as pd

In [31]:
# Illustrative values only, not actual population figures
dict_pop = dict(
    Indonesia=1500,
    English=1000,
    Japan=1100,
    China=1900,
    Korea=1300,
)

In [32]:
# Illustrative values only, not actual area figures
dict_area = dict(
    Indonesia=5000,
    English=3100,
    Japan=3500,
    China=4500,
    Korea=4200,
)

In [34]:
# Build a DataFrame from the two dictionaries: each dictionary becomes one
# column, and the dictionary keys (country names) become the row index.
region = pd.DataFrame(
    data={
        'pop': dict_pop,
        'Area size': dict_area,
    }
)

In [35]:
# Display the DataFrame; the dictionary keys appear as the row index
region

Unnamed: 0,pop,Area size
Indonesia,1500,5000
English,1000,3100
Japan,1100,3500
China,1900,4500
Korea,1300,4200


In [36]:
# Select a single column with bracket notation; the result is a Series
region['pop']

Indonesia    1500
English      1000
Japan        1100
China        1900
Korea        1300
Name: pop, dtype: int64

In [38]:
# Look up a single value by row label and column label in one .loc call
# (preferred over chained indexing such as region['Area size']['Indonesia'])
region.loc['Indonesia', 'Area size']

5000

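Be careful with attribute-style access: `pop` is also the name of a built-in DataFrame method, so, as the next cell shows, `region.pop` returns that method instead of the 'pop' column.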

In [39]:
# Attribute access does NOT return the 'pop' column here: DataFrame already has
# a method named pop, so this expression evaluates to that bound method
region.pop

<bound method DataFrame.pop of             pop  Area size
Indonesia  1500       5000
English    1000       3100
Japan      1100       3500
China      1900       4500
Korea      1300       4200>

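Hence it is safer to access columns with bracket syntax, e.g. region['pop'], and to choose a column name that does not clash with a DataFrame method, such as 'Population' (accessed as region['Population']).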

In [45]:
# Rebuild the DataFrame with the population column named 'Population', so the
# column name no longer clashes with the DataFrame.pop method.
region = pd.DataFrame(
    data={
        'Population': dict_pop,
        'Area size': dict_area,
    }
)

In [46]:
# Display the rebuilt DataFrame, now with a 'Population' column
region

Unnamed: 0,Population,Area size
Indonesia,1500,5000
English,1000,3100
Japan,1100,3500
China,1900,4500
Korea,1300,4200


In [47]:
# Select the 'Population' column with bracket notation; the result is a Series
region['Population']

Indonesia    1500
English      1000
Japan        1100
China        1900
Korea        1300
Name: Population, dtype: int64

In [50]:
# Explicit (label-based) index: take rows 'Indonesia' through 'Japan' of the
# Population column in a single .loc call instead of chained indexing.
# With label slices the end label ('Japan') is included in the result.
region.loc['Indonesia':'Japan', 'Population']

Indonesia    1500
English      1000
Japan        1100
Name: Population, dtype: int64

In [51]:
region['Population'].iloc[0:3]  # Implicit (positional) index: positions 0, 1, 2 (the stop position 3 is excluded)

Indonesia    1500
English      1000
Japan        1100
Name: Population, dtype: int64

In [52]:
# Load the CSV data into a DataFrame
# (relative path: Titanic.csv must be in the notebook's working directory)

df = pd.read_csv('Titanic.csv')

In [53]:
# View the first rows (head() shows 5 rows when no argument is given)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [54]:
# Optionally, specify the number of rows to display

df.head(8)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


In [55]:
# Print a summary of the data: index range, column names, non-null counts,
# dtypes and memory usage

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [56]:
# Count the non-null values in each column
# (notnull() gives a True/False table; sum() counts the True values per column)

df.notnull().sum()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [57]:
# Count the missing (NaN) values in each column

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [60]:
# View the last rows (tail() shows 5 rows when no argument is given)

df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [61]:
# View the number of rows and columns as a (rows, columns) tuple

df.shape

(891, 12)

In [62]:
# View the column labels

df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [63]:
# View the row index

df.index

RangeIndex(start=0, stop=891, step=1)

In [64]:
# Get descriptive statistics (count, mean, std, min, quartiles, max)
# for the numerical columns

df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [68]:
# Display the mean (average) of the Age column

df['Age'].mean()

29.69911764705882

In [69]:
# Display the median of the Age column

df['Age'].median()

28.0

In [70]:
# Display the mode (most frequent value) of the Age column.
# mode() returns a Series because several values can tie for most frequent;
# [0] picks the first entry of that Series.

df['Age'].mode()[0]

24.0

In [71]:
# Display the minimum value of the Age column

df['Age'].min()

0.42

In [72]:
# Display the maximum value of the Age column

df['Age'].max()

80.0

In [73]:
# View the mean of the Age column (repeats the earlier mean calculation)

df['Age'].mean()

29.69911764705882

In [74]:
# View the rows where Age is missing (NaN):
# df['Age'].isnull() is a boolean mask, and df[mask] keeps the rows where it is True

df[df['Age'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [77]:
# Masking: build a boolean mask that is True where Age is NaN.
# The surrounding [ ] only wraps the mask in a one-element list for display.
# Bracket access df['Age'] is used so the column lookup can never be confused
# with a DataFrame attribute or method.

[df['Age'].isnull()]

[0      False
 1      False
 2      False
 3      False
 4      False
        ...  
 886    False
 887    False
 888     True
 889    False
 890    False
 Name: Age, Length: 891, dtype: bool]

In [78]:
# Then pass the mask to df[...] to get a DataFrame that contains only the
# rows where Age is NaN

df[df['Age'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [79]:
# View the unique values in the Sex column

df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [80]:
# Count the number of unique values in the Sex column

df['Sex'].nunique()

2