# Import Library

In [1]:
import pandas as pd
import numpy as np

Pandas has two objects, namely series and dataframe

# Object Series

- Object series has one data dimension.
- It doesn't have a column name because it only has one column.
- And it has an index.

In [2]:
data = [0.25, 0.50, 0.75, 1]

In [3]:
print(data)

[0.25, 0.5, 0.75, 1]


# Converting data into series

In [4]:
data = pd.Series(data)
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

# Convert from series to array

In [5]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

# Displays Index

The index is in the form of a range, where the start point is inclusive of the range and the stop point is exclusive of the range.

In [6]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
list(range(0, 10, 2))

[0, 2, 4, 6, 8]

# Calling Data

In [8]:
data[2]

0.75

- The implicit index is the default index.
- We can define the index ny, this is called an explicit index, namely a defined index

In [9]:
data = pd.Series([0.25, 0.50, 0.75, 1], index=['a', 'b', 'c', 'd'])

In [10]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [11]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [12]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

# Calling explicit and implicit index data

In [13]:
# index eksplicit
data['a']

0.25

- This is data selection.
- Even though we have created an explicit index, we can still call the implicit index

In [14]:
# index implicit
data[3]

1.0

When the implicit index and explicit index are the same, when calling the data it will only rely on the explicit index

In [15]:
data_2 = pd.Series([0.25, 0.50, 0.75, 1], index=[2, 5, 3, 7])

In [16]:
data_2[2]

0.25

In [None]:
data_2[0] # error

We will try to do data slicing

In [17]:
data = pd.Series([0.25, 0.50, 0.75, 1], index=['a', 'b', 'c', 'd'])

In [18]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

for example, we will call from data b to data c

In [19]:
data['b' : 'c'] # index eksplicit

b    0.50
c    0.75
dtype: float64

but if we slice the implicit index, only the starting point will appear because the implicit index is a range

In [20]:
data[1:2] # index implicit

b    0.5
dtype: float64

# loc dan iloc

The main difference between loc and iloc is in the way they access the data. loc uses explicit labels/indices, while iloc uses implicit positions/indices.

When we access an index, what appears is the explicit index

In [21]:
data_2 = pd.Series([0.25, 0.50, 0.75, 1], index=[2, 5, 3, 7])

In [22]:
data_2[2] # index eksplicit : slicing

0.25

When we call the explicit index from index 2 to index 3. The value that appears is precisely from the implicit index

In [23]:
data_2[2:3] # index implicit : slicing

3    0.75
dtype: float64

- When the explicit index and implicit index are the same, inconsistencies will occur as in the case above
- To overcome this inconsistency, we will use the loc and iloc rules
- loc is to call its explicit index
- iloc is for calling its implicit index

In [24]:
# loc
data_2.loc[3] # selecting index eksplicit

0.75

In [25]:
data_2.loc[2:3] # slicing index eksplicit

2    0.25
5    0.50
3    0.75
dtype: float64

In [26]:
# iloc
data_2.iloc[3] # selecting index implicit

1.0

In [27]:
data_2.iloc[2:3] # slicing index implicit

3    0.75
dtype: float64

In [28]:
dict_populasi = {'Jakarta':750, 
                 'Bogor':490,
                 'Depok':350,
                 'Tanggerang':270,
                 'Bekasi':670}

In [29]:
dict_populasi

{'Jakarta': 750, 'Bogor': 490, 'Depok': 350, 'Tanggerang': 270, 'Bekasi': 670}

In [30]:
# dictionary to series transformation
populasi = pd.Series(dict_populasi)

In [31]:
populasi

Jakarta       750
Bogor         490
Depok         350
Tanggerang    270
Bekasi        670
dtype: int64

In [32]:
populasi.loc['Depok']

350

In [33]:
populasi.iloc[2]

350

In [34]:
dict_luas = {'Jakarta':737, 
                 'Bogor':325,
                 'Depok':247,
                 'Tanggerang':302,
                 'Bekasi':355}

In [35]:
luas = pd.Series(dict_luas)

In [36]:
luas

Jakarta       737
Bogor         325
Depok         247
Tanggerang    302
Bekasi        355
dtype: int64

# DataFrame

DataFrame is a collection of series with at least one series

In [37]:
daerah = pd.DataFrame({'pop':populasi, 'luas daerah':luas})

In [38]:
daerah

Unnamed: 0,pop,luas daerah
Jakarta,750,737
Bogor,490,325
Depok,350,247
Tanggerang,270,302
Bekasi,670,355


In [39]:
daerah['luas daerah']

Jakarta       737
Bogor         325
Depok         247
Tanggerang    302
Bekasi        355
Name: luas daerah, dtype: int64

In [40]:
daerah['luas daerah']['Jakarta']

737

When calling data with the regional.pop syntax it will appear as below

In [41]:
daerah.pop

<bound method DataFrame.pop of             pop  luas daerah
Jakarta     750          737
Bogor       490          325
Depok       350          247
Tanggerang  270          302
Bekasi      670          355>

because pop is the same as the function name in the DataFrame

So it is safer to call data with the syntax area ['populasi']

In [42]:
daerah['pop']

Jakarta       750
Bogor         490
Depok         350
Tanggerang    270
Bekasi        670
Name: pop, dtype: int64

Renamed column pop to populasi

In [43]:
daerah = pd.DataFrame({'populasi':populasi, 'luas':luas})

In [44]:
daerah

Unnamed: 0,populasi,luas
Jakarta,750,737
Bogor,490,325
Depok,350,247
Tanggerang,270,302
Bekasi,670,355


In [45]:
daerah['populasi']

Jakarta       750
Bogor         490
Depok         350
Tanggerang    270
Bekasi        670
Name: populasi, dtype: int64

In [46]:
daerah['populasi']['Jakarta':'Depok'] # indeks eksplisit

Jakarta    750
Bogor      490
Depok      350
Name: populasi, dtype: int64

In [47]:
daerah['populasi'].iloc[0:3] # indeks implisit

Jakarta    750
Bogor      490
Depok      350
Name: populasi, dtype: int64

# Load Data

In [48]:
# Load data Titanic
df = pd.read_csv('Titanic.csv')

# Display Data

In [49]:
# Looking at the top data
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [50]:
# Look at the data from the bottom
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


# Viewing Data Info

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Viewing Non Null Counts

In [52]:
# View the number of non-nulls in the data
# Not Null is data that is not empty
df.notnull().sum()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

# Looking at the Number of NaN

In [53]:
# View the number of NaN from the data
# Missing value
# NaN is non-empty data but unknown data
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# View the amount of data

In [54]:
# View the amount of data
# Not recommended because it is not clear what you want to see the amount of
df.sum()

  df.sum()


PassengerId                                               397386
Survived                                                     342
Pclass                                                      2057
Name           Braund, Mr. Owen HarrisCumings, Mrs. John Brad...
Sex            malefemalefemalefemalemalemalemalemalefemalefe...
Age                                                     21205.17
SibSp                                                        466
Parch                                                        340
Ticket         A/5 21171PC 17599STON/O2. 31012821138033734503...
Fare                                                  28693.9493
dtype: object

# View the Number of Rows and Columns

In [55]:
df.shape

(891, 12)

# Look at the column

In [56]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# View Index

In [57]:
df.index

RangeIndex(start=0, stop=891, step=1)

# Describe

In [58]:
# Displays information from columns in the form of numbers
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In the Fare column:

1. Count(Amount of Data) = The total number of data in the Fare column is 891, which means that all 891 passengers have information about ticket prices.
2. Mean(average) = The average ticket price is around 32.20, meaning we take all the passenger ticket prices and add them up and then divide them by the number of passengers, namely 891, we will get this average.
3. The Std on ticket prices is around 49.69, indicating that there is significant variation in ticket prices with some passengers paying far above or below average ticket prices.
4. The lowest min ticket price is 0. Indicates that some passengers may have free tickets or there is missing data.
5. The max highest ticket price is around 512.33 which represents a top class ticket
6. 25%(first quartile) is a value that divides the lowest 25% of ticket prices. The first quartile of ticket prices is around 7.91. This means 25% of passengers paid less than 7.91 for their tickets.
7. 50% (Median / Second Quartile) is the middle value of all ticket prices in the dataset. The median ticket price is around 14.45. This means 50% of passengers pay more than this and the other 50% pay less than this.
8. 75% (Third Quartile) is a value that divides the highest 25% of ticket price data. The third quartile of ticket prices is around 31.00. This means 25% of passengers paid more than 31.00 for their tickets.

This information provides an idea of the variation in ticket prices among Titanic passengers. Can use this information to analyze ticket price distributions, identify passengers with extreme ticket prices, or better understand how ticket costs relate to ticket class or other factors.

# Column Average

In [59]:
df['Age'].mean()

29.69911764705882

# Column Median

In [60]:
df['Age'].median()

28.0

# Column Mode

In [61]:
df['Age'].mode()[0]

24.0

# Minimum column

In [62]:
df['Age'].min()

0.42

# Create a DataFrame from a Series

In [63]:
nilai_mtk = {'Ninys':100, 
             'Cristin':90,
             'Elsha':92,
             'Tria':85}
mtk = pd.Series(nilai_mtk)

In [64]:
nilai_indo = {'Ninys':90, 
             'Cristin':89,
             'Elsha':90,
             'Tria':88}
indo = pd.Series(nilai_indo)

In [65]:
nilai_ipa = {'Ninys':95, 
             'Cristin':92,
             'Elsha':94,
             'Tria':90}
ipa = pd.Series(nilai_ipa)

In [66]:
df1 = pd.DataFrame({'Match':mtk, 'Ind':indo, 'Sains':ipa})
df1

Unnamed: 0,Match,Ind,Sains
Ninys,100,90,95
Cristin,90,89,92
Elsha,92,90,94
Tria,85,88,90


In [67]:
df1['Match']['Ninys':'Elsha']

Ninys      100
Cristin     90
Elsha       92
Name: Match, dtype: int64

In [68]:
df1['Ind'].iloc[1:3]

Cristin    89
Elsha      90
Name: Ind, dtype: int64

# Change column names in a DataFrame

In [69]:
df1.rename(columns={'Match':'Matematika'}, inplace=True)
df1

Unnamed: 0,Matematika,Ind,Sains
Ninys,100,90,95
Cristin,90,89,92
Elsha,92,90,94
Tria,85,88,90


In [70]:
df1.columns = ['Matematika', 'Indonesia', 'Sains']
df1

Unnamed: 0,Matematika,Indonesia,Sains
Ninys,100,90,95
Cristin,90,89,92
Elsha,92,90,94
Tria,85,88,90


In [71]:
df2 = pd.DataFrame({'Matematika':mtk, 'Indonesia':indo, 'Ipa':ipa})
df2

Unnamed: 0,Matematika,Indonesia,Ipa
Ninys,100,90,95
Cristin,90,89,92
Elsha,92,90,94
Tria,85,88,90


In [72]:
df2['Indonesia'].iloc[1:3]

Cristin    89
Elsha      90
Name: Indonesia, dtype: int64

In [73]:
df2['Matematika']

Ninys      100
Cristin     90
Elsha       92
Tria        85
Name: Matematika, dtype: int64

In [74]:
df2['Ipa']['Ninys':'Elsha']

Ninys      95
Cristin    92
Elsha      94
Name: Ipa, dtype: int64