In [1]:
import numpy as np
import pandas as pd

## Basic data structures in pandas

- Series: a one-dimensional labeled array holding data of any type,
such as integers, strings, Python objects, etc.

- DataFrame: a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns.

In [2]:
# A Series built from a plain list gets a default integer index; the NaN
# entry makes the resulting dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Build a daily DatetimeIndex of six consecutive dates starting 2013-01-01.
# It serves as the row index of the DataFrame created in the next cell.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Create a DataFrame from a 6x4 NumPy array of standard-normal draws, using
# `dates` as the row index and A-D as the column labels.
# NOTE(review): no random seed is set in the visible cells, so these values
# differ on every fresh run.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.564024,0.592217,-0.274493,1.342779
2013-01-02,-1.155801,1.12379,0.608508,-0.623334
2013-01-03,-1.27356,-0.378583,1.6923,-1.163162
2013-01-04,-0.234764,1.042969,-0.159647,-0.438775
2013-01-05,0.060803,-0.207916,1.111743,-0.966972
2013-01-06,-1.008815,0.777257,0.470818,-1.840966


In [5]:
# Build a DataFrame from a dict: keys become the column labels and values
# become the column data. The scalar entries ("A", "B", "F") are broadcast to
# the length of the array-like columns (four rows).
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
# Each column of the resulting DataFrame keeps its own dtype:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Viewing data

Use DataFrame.head() and DataFrame.tail() to view the top and bottom rows of the frame respectively:

In [7]:
# Called without an argument, head() shows the first five rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.564024,0.592217,-0.274493,1.342779
2013-01-02,-1.155801,1.12379,0.608508,-0.623334
2013-01-03,-1.27356,-0.378583,1.6923,-1.163162
2013-01-04,-0.234764,1.042969,-0.159647,-0.438775
2013-01-05,0.060803,-0.207916,1.111743,-0.966972


In [8]:
# Show only the last three rows of the frame.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.234764,1.042969,-0.159647,-0.438775
2013-01-05,0.060803,-0.207916,1.111743,-0.966972
2013-01-06,-1.008815,0.777257,0.470818,-1.840966


In [9]:
# Display the row labels via DataFrame.index (the column labels are shown
# with DataFrame.columns):
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# Display the column labels via DataFrame.columns:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# DataFrame.to_numpy() returns a NumPy representation of the underlying data;
# the index and column labels are not part of the result:
df.to_numpy()

array([[-0.56402419,  0.5922167 , -0.27449301,  1.34277912],
       [-1.15580079,  1.1237899 ,  0.60850818, -0.62333362],
       [-1.27355967, -0.37858289,  1.69230033, -1.16316182],
       [-0.23476372,  1.04296907, -0.1596475 , -0.43877506],
       [ 0.06080284, -0.20791558,  1.1117428 , -0.96697247],
       [-1.0088147 ,  0.77725691,  0.47081753, -1.84096566]])

# Catatan
Array NumPy memiliki satu dtype untuk seluruh array, sedangkan DataFrame pandas memiliki satu dtype per kolom. Saat Anda memanggil DataFrame.to_numpy(), pandas akan menemukan dtype NumPy yang dapat menampung semua dtype di DataFrame. Jika tipe data yang umum adalah object, DataFrame.to_numpy() akan memerlukan penyalinan data.

In [12]:
# Example: df2 holds a different dtype in each of its columns.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [13]:
# With mixed column dtypes the common dtype is object, so this call has to
# copy the data:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [14]:
# describe() shows a quick statistical summary of your data:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.696027,0.491622,0.574871,-0.615072
std,0.537727,0.639052,0.749637,1.076232
min,-1.27356,-0.378583,-0.274493,-1.840966
25%,-1.119054,-0.007883,-0.002031,-1.114114
50%,-0.786419,0.684737,0.539663,-0.795153
75%,-0.317079,0.976541,0.985934,-0.484915
max,0.060803,1.12379,1.6923,1.342779


In [15]:
# Transpose your data (rows become columns and vice versa):
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.564024,-1.155801,-1.27356,-0.234764,0.060803,-1.008815
B,0.592217,1.12379,-0.378583,1.042969,-0.207916,0.777257
C,-0.274493,0.608508,1.6923,-0.159647,1.111743,0.470818
D,1.342779,-0.623334,-1.163162,-0.438775,-0.966972,-1.840966


In [16]:
# DataFrame.sort_index() sorts by an axis; here the column labels are put in
# descending order (D, C, B, A):
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.342779,-0.274493,0.592217,-0.564024
2013-01-02,-0.623334,0.608508,1.12379,-1.155801
2013-01-03,-1.163162,1.6923,-0.378583,-1.27356
2013-01-04,-0.438775,-0.159647,1.042969,-0.234764
2013-01-05,-0.966972,1.111743,-0.207916,0.060803
2013-01-06,-1.840966,0.470818,0.777257,-1.008815


In [17]:
# DataFrame.sort_values() orders the rows by the values of a column
# (ascending by default); here column "B":
df.sort_values("B")

Unnamed: 0,A,B,C,D
2013-01-03,-1.27356,-0.378583,1.6923,-1.163162
2013-01-05,0.060803,-0.207916,1.111743,-0.966972
2013-01-01,-0.564024,0.592217,-0.274493,1.342779
2013-01-06,-1.008815,0.777257,0.470818,-1.840966
2013-01-04,-0.234764,1.042969,-0.159647,-0.438775
2013-01-02,-1.155801,1.12379,0.608508,-0.623334


# Selection
Meskipun ekspresi standar Python/NumPy untuk pemilihan dan pengaturan bersifat intuitif dan berguna untuk pekerjaan interaktif, untuk kode produksi kami merekomendasikan metode akses data pandas yang dioptimalkan: DataFrame.at(), DataFrame.iat(), DataFrame.loc(), dan DataFrame.iloc().

In [18]:
# Getitem ([])
# For a DataFrame, passing a single label selects a column and yields a
# Series, equivalent to df.A:
df["A"]

2013-01-01   -0.564024
2013-01-02   -1.155801
2013-01-03   -1.273560
2013-01-04   -0.234764
2013-01-05    0.060803
2013-01-06   -1.008815
Freq: D, Name: A, dtype: float64

In [19]:
# For a DataFrame, passing a slice selects matching rows; an integer slice is
# positional, so this returns the first three rows:
df[:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.564024,0.592217,-0.274493,1.342779
2013-01-02,-1.155801,1.12379,0.608508,-0.623334
2013-01-03,-1.27356,-0.378583,1.6923,-1.163162


In [20]:
# Another example: slicing rows by date labels (both endpoints are included):
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-1.155801,1.12379,0.608508,-0.623334
2013-01-03,-1.27356,-0.378583,1.6923,-1.163162
2013-01-04,-0.234764,1.042969,-0.159647,-0.438775


# Selection by label

See more in Selection by Label using DataFrame.loc() or DataFrame.at().

Selecting a row matching a label:

In [21]:
# Select the row labelled with the first date; the result is a Series indexed
# by the column labels.
df.loc[dates[0]]

A   -0.564024
B    0.592217
C   -0.274493
D    1.342779
Name: 2013-01-01 00:00:00, dtype: float64

In [22]:
# Select all rows (:) together with a list of column labels:
df.loc[:,["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.564024,0.592217
2013-01-02,-1.155801,1.12379
2013-01-03,-1.27356,-0.378583
2013-01-04,-0.234764,1.042969
2013-01-05,0.060803,-0.207916
2013-01-06,-1.008815,0.777257


In [23]:
# For label slicing, both endpoints are included:
df.loc["20130102":"20130104", ["A","B"]]

Unnamed: 0,A,B
2013-01-02,-1.155801,1.12379
2013-01-03,-1.27356,-0.378583
2013-01-04,-0.234764,1.042969


In [24]:
# A single row label combined with a list of column labels returns a Series
# (the row dimension is reduced):
df.loc[dates[0],["A"]]

A   -0.564024
Name: 2013-01-01 00:00:00, dtype: float64

## Selection by position
See more in Selection by Position using DataFrame.iloc() or DataFrame.iat().

Select via the position of the passed integers:

In [25]:
# Integer position 3 is the fourth row (positions are zero-based).
df.iloc[3]

A   -0.234764
B    1.042969
C   -0.159647
D   -0.438775
Name: 2013-01-04 00:00:00, dtype: float64

In [26]:
# Integer slices act similarly to NumPy/Python slices (the end position is
# excluded): rows 3-4 and the first two columns.
df.iloc[3:5, :2]

Unnamed: 0,A,B
2013-01-04,-0.234764,1.042969
2013-01-05,0.060803,-0.207916


In [27]:
# Lists of integer positions select arbitrary rows and columns
# (rows 1, 2 and 4; columns 0 and 2):
df.iloc[[1,2,4],[0, 2]]

Unnamed: 0,A,C
2013-01-02,-1.155801,0.608508
2013-01-03,-1.27356,1.6923
2013-01-05,0.060803,1.111743


In [28]:
# For slicing rows explicitly (the bare ":" keeps every column):
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-1.155801,1.12379,0.608508,-0.623334
2013-01-03,-1.27356,-0.378583,1.6923,-1.163162


In [29]:
# For slicing columns explicitly (the bare ":" keeps every row):
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.592217,-0.274493
2013-01-02,1.12379,0.608508
2013-01-03,-0.378583,1.6923
2013-01-04,1.042969,-0.159647
2013-01-05,-0.207916,1.111743
2013-01-06,0.777257,0.470818


In [30]:
# For fast access to a single scalar by integer position (returns the same
# value as df.iloc[1, 1]):
df.iat[1,1]

1.1237899024895877

## Boolean indexing

Select rows where df.A is greater than 0.

In [32]:
# A boolean Series used as the row indexer keeps only the rows where it is
# True, i.e. where column "A" is positive.
df.loc[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-05,0.060803,-0.207916,1.111743,-0.966972


In [33]:
# Selecting values from a DataFrame where a boolean condition is met; entries
# that fail the condition become NaN and the shape is preserved:
df.where(df > 0)

Unnamed: 0,A,B,C,D
2013-01-01,,0.592217,,1.342779
2013-01-02,,1.12379,0.608508,
2013-01-03,,,1.6923,
2013-01-04,,1.042969,,
2013-01-05,0.060803,,1.111743,
2013-01-06,,0.777257,0.470818,


In [34]:
# Using the isin() method for filtering: start from an independent copy so
# that adding a column does not modify df.
# NOTE(review): this rebinds the name df2, replacing the dict-built frame
# created earlier in the notebook.
df2 = df.copy(deep=True)

In [36]:
# Add a string column "E" to the copy so there is something to filter on.
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.564024,0.592217,-0.274493,1.342779,one
2013-01-02,-1.155801,1.12379,0.608508,-0.623334,one
2013-01-03,-1.27356,-0.378583,1.6923,-1.163162,two
2013-01-04,-0.234764,1.042969,-0.159647,-0.438775,three
2013-01-05,0.060803,-0.207916,1.111743,-0.966972,four
2013-01-06,-1.008815,0.777257,0.470818,-1.840966,three


In [38]:
# Keep only the rows whose "E" value is one of the listed labels.
df2.loc[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.27356,-0.378583,1.6923,-1.163162,two
2013-01-05,0.060803,-0.207916,1.111743,-0.966972,four


## Setting
Setting a new column automatically aligns the data by the indexes:

In [41]:
# A Series indexed by six daily dates starting 2013-01-02, i.e. one day later
# than the `dates` index created at the top of the notebook.
s1 = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range(start="20130102", periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [42]:
# Assigning a Series as a new column aligns on the index: dates missing from
# s1 get NaN, and s1 labels that are not in df's index are dropped.
df["F"] = s1

In [45]:
# Setting a single value by label (row = first date, column = "A"):
df.at[dates[0], "A"] = 0

In [46]:
# Setting a single value by position (row 0, column 1, i.e. column "B"):
df.iat[0, 1] = 0

In [47]:
# Setting by assigning with a NumPy array: overwrite every row of column "D"
# with the constant 5.
df.loc[:, "D"] = np.full(len(df), 5)

In [48]:
# The result of the prior setting operations (new column F, zeroed A/B in the
# first row, constant column D):
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.274493,5.0,
2013-01-02,-1.155801,1.12379,0.608508,5.0,1.0
2013-01-03,-1.27356,-0.378583,1.6923,5.0,2.0
2013-01-04,-0.234764,1.042969,-0.159647,5.0,3.0
2013-01-05,0.060803,-0.207916,1.111743,5.0,4.0
2013-01-06,-1.008815,0.777257,0.470818,5.0,5.0


In [50]:
# A 'where' operation with setting: take an independent copy first so that df
# itself stays unchanged.
df2 = df.copy(deep=True)

In [51]:
# Wherever a value is positive, replace it with its negation, so that every
# non-missing entry ends up <= 0.
df2[df2 > 0] = -df2

In [52]:
# Show the frame after the conditional assignment.
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.274493,-5.0,
2013-01-02,-1.155801,-1.12379,-0.608508,-5.0,-1.0
2013-01-03,-1.27356,-0.378583,-1.6923,-5.0,-2.0
2013-01-04,-0.234764,-1.042969,-0.159647,-5.0,-3.0
2013-01-05,-0.060803,-0.207916,-1.111743,-5.0,-4.0
2013-01-06,-1.008815,-0.777257,-0.470818,-5.0,-5.0


## Missing data
For NumPy data types, np.nan represents missing data. It is by default not included in computations. See the Missing Data section.

Reindexing allows you to change/add/delete the index on a specified axis. This returns a copy of the data:

In [53]:
# Reindex to the first four dates and append a new column "E"; the new column
# has no data yet, so it is filled with NaN.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])

In [54]:
# Set "E" to 1 for the first two dates (label slices include both endpoints);
# the remaining rows of "E" stay missing.
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.274493,5.0,,1.0
2013-01-02,-1.155801,1.12379,0.608508,5.0,1.0,1.0
2013-01-03,-1.27356,-0.378583,1.6923,5.0,2.0,
2013-01-04,-0.234764,1.042969,-0.159647,5.0,3.0,


In [55]:
# DataFrame.dropna() drops every row that contains at least one missing value:
df1.dropna(axis="index", how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-02,-1.155801,1.12379,0.608508,5.0,1.0,1.0


In [56]:
# DataFrame.fillna() replaces missing values with the given value (here 5):
df1.fillna(5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.274493,5.0,5.0,1.0
2013-01-02,-1.155801,1.12379,0.608508,5.0,1.0,1.0
2013-01-03,-1.27356,-0.378583,1.6923,5.0,2.0,5.0
2013-01-04,-0.234764,1.042969,-0.159647,5.0,3.0,5.0


In [57]:
# isna() returns a boolean mask that is True wherever a value is NaN:
df1.isna()

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


## Operations
See the Basic section on Binary Ops.

Stats
Operations in general exclude missing data.

Calculate the mean value for each column:

In [58]:
# Reduce along the index so that one mean per column is returned.
df.mean(axis="index")

A   -0.602023
B    0.392920
C    0.574871
D    5.000000
F    3.000000
dtype: float64

In [59]:
# Calculate the mean value for each row by reducing across the columns:
df.mean(axis="columns")

2013-01-01    1.181377
2013-01-02    1.315299
2013-01-03    1.408032
2013-01-04    1.729712
2013-01-05    1.992926
2013-01-06    2.047852
Freq: D, dtype: float64

Operating with another Series or DataFrame with a different index or column will align the result with the union of the index or column labels. In addition, pandas automatically broadcasts along the specified dimension and will fill unaligned labels with np.nan.

In [62]:
# A Series on the same date index as df, shifted down by two positions so the
# first two entries become NaN and the last two original values fall off.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8], index=dates).shift(periods=2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [63]:
# Subtract s from every column, matching on the row index; rows where s is
# NaN yield NaN in every column.
df.sub(s, axis=0)

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-2.27356,-1.378583,0.6923,4.0,1.0
2013-01-04,-3.234764,-1.957031,-3.159647,2.0,0.0
2013-01-05,-4.939197,-5.207916,-3.888257,0.0,-1.0
2013-01-06,,,,,


## User defined functions
DataFrame.agg() and DataFrame.transform() apply a user defined function that reduces or broadcasts its result respectively.

In [65]:
# Reduce each column to a single number: its mean scaled by 5.6.
df.agg(lambda column: np.mean(column) * 5.6)

A    -3.371327
B     2.200350
C     3.219280
D    28.000000
F    16.800000
dtype: float64

In [66]:
# Apply an element-wise scaling; the result keeps the shape of df.
df.transform(lambda column: column * 101.2)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-27.778693,506.0,
2013-01-02,-116.96704,113.727538,61.581028,506.0,101.2
2013-01-03,-128.884239,-38.312588,171.260793,506.0,202.4
2013-01-04,-23.758088,105.54847,-16.156327,506.0,303.6
2013-01-05,6.153247,-21.041057,112.508372,506.0,404.8
2013-01-06,-102.092048,78.6584,47.646734,506.0,506.0


## Value Counts
See more at Histogramming and Discretization.

In [67]:
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
# NOTE(review): no random seed is set in the visible cells, so the values
# differ on every fresh run.
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    2
1    1
2    5
3    3
4    5
5    5
6    4
7    4
8    3
9    3
dtype: int32

In [68]:
# Count how often each distinct value occurs; the most frequent come first.
s.value_counts()

5    3
3    3
4    2
2    1
1    1
Name: count, dtype: int64

## String Methods
Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. See more at Vectorized String Methods.

In [69]:
# A Series of strings with one missing entry, used to demonstrate the
# vectorized .str accessor.
s = pd.Series(
    data=["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"],
)

In [70]:
# Lower-case every string element; the missing entry stays NaN.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# Merge
### Concat
pandas provides various facilities for easily combining together Series and DataFrame objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations.

See the Merging section.

Concatenating pandas objects together row-wise with concat():

In [71]:
# A fresh 10x4 frame of standard-normal draws with default integer row and
# column labels. This rebinds the name df used in the earlier sections.
# NOTE(review): no random seed is set in the visible cells, so the values
# differ on every fresh run.
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.603041,-0.767368,0.9031,-0.014471
1,-1.994398,0.263681,-0.109741,0.270313
2,1.023035,0.83014,0.370571,0.086388
3,-1.886176,0.216123,-1.301609,-0.254191
4,1.15134,0.707072,-0.824122,1.3027
5,0.062896,-0.496503,0.347349,0.046403
6,0.900474,1.421335,0.854453,-0.708506
7,-0.46437,0.09225,-0.881382,-1.071814
8,-0.99153,-0.125384,0.901953,-1.268564
9,-0.286462,-0.368881,1.416508,1.060081


In [76]:
# Split the frame into three consecutive row chunks (rows 0-2, 3-6 and 7-9),
# then stack them back together row-wise, reproducing the original frame.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]

pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.603041,-0.767368,0.9031,-0.014471
1,-1.994398,0.263681,-0.109741,0.270313
2,1.023035,0.83014,0.370571,0.086388
3,-1.886176,0.216123,-1.301609,-0.254191
4,1.15134,0.707072,-0.824122,1.3027
5,0.062896,-0.496503,0.347349,0.046403
6,0.900474,1.421335,0.854453,-0.708506
7,-0.46437,0.09225,-0.881382,-1.071814
8,-0.99153,-0.125384,0.901953,-1.268564
9,-0.286462,-0.368881,1.416508,1.060081


# Note

Adding a column to a DataFrame is relatively fast. However, adding a row requires a copy, and may be expensive. We recommend passing a pre-built list of records to the DataFrame constructor instead of building a DataFrame by iteratively appending records to it.