In [1]:
import numpy as np
import pandas as pd

## Basic data structures in pandas

- Series: a one-dimensional labeled array holding data of any type,
such as integers, strings, Python objects, etc.

- DataFrame: a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns.

In [2]:
# A Series built from a plain list gets a default integer index;
# the np.nan entry stands for a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Build a datetime index of six consecutive dates with date_range();
# it serves as the row index of the DataFrame created in the next cell.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Create a DataFrame from a 6x4 NumPy array of standard-normal samples, using
# `dates` (the DatetimeIndex built in the previous cell) as the row index and
# the letters A-D as column labels.
# The samples come from a seeded Generator so that Restart & Run All yields
# the same numbers on every run; the unseeded global np.random state produced
# different values each time the cell was executed.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.545794,0.517737,-0.310385,-0.378463
2013-01-02,-0.726781,0.466654,-0.750876,0.570763
2013-01-03,0.192811,-1.622321,0.975008,0.671184
2013-01-04,-1.004398,-0.776126,-1.707817,-1.162016
2013-01-05,-0.586459,-0.177316,-0.951355,-0.500664
2013-01-06,1.234784,-0.127779,-0.817377,-1.390276


In [5]:
# Build a DataFrame from a mapping of column label -> column values.
# Scalars (A, B, F) are broadcast to the length of the array-like columns,
# and every column keeps its own dtype.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
# The columns of the resulting DataFrame have different dtypes:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Viewing data

Use DataFrame.head() and DataFrame.tail() to view the top and bottom rows of the frame respectively:

In [7]:
# Called without an argument, head() shows the first five rows:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.545794,0.517737,-0.310385,-0.378463
2013-01-02,-0.726781,0.466654,-0.750876,0.570763
2013-01-03,0.192811,-1.622321,0.975008,0.671184
2013-01-04,-1.004398,-0.776126,-1.707817,-1.162016
2013-01-05,-0.586459,-0.177316,-0.951355,-0.500664


In [8]:
# tail(3) shows the last three rows:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.004398,-0.776126,-1.707817,-1.162016
2013-01-05,-0.586459,-0.177316,-0.951355,-0.500664
2013-01-06,1.234784,-0.127779,-0.817377,-1.390276


In [9]:
# Display the row labels with DataFrame.index:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# Display the column labels with DataFrame.columns:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# DataFrame.to_numpy() returns a NumPy representation of the underlying data,
# without the index or column labels:
df.to_numpy()

array([[ 0.54579437,  0.51773736, -0.31038481, -0.37846303],
       [-0.72678083,  0.46665385, -0.75087636,  0.57076253],
       [ 0.19281119, -1.62232078,  0.97500786,  0.67118372],
       [-1.00439762, -0.77612555, -1.70781679, -1.16201639],
       [-0.58645887, -0.17731554, -0.95135492, -0.50066438],
       [ 1.23478402, -0.12777878, -0.81737665, -1.39027562]])

# Catatan
Array NumPy memiliki satu dtype untuk seluruh array, sedangkan pandas DataFrame memiliki satu dtype per kolom. Saat Anda memanggil DataFrame.to_numpy(), pandas akan menemukan dtype NumPy yang dapat menampung semua dtype di DataFrame. Jika tipe data yang umum adalah object, DataFrame.to_numpy() akan memerlukan penyalinan data.

In [12]:
# Example: df2 has a different dtype in each column
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [13]:
# The mixed column dtypes fall back to a common object dtype,
# so this call has to copy the data:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [14]:
# describe() shows a quick statistical summary of your data:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.057375,-0.286525,-0.5938,-0.364912
std,0.862666,0.809358,0.892751,0.854908
min,-1.004398,-1.622321,-1.707817,-1.390276
25%,-0.6917,-0.626423,-0.91786,-0.996678
50%,-0.196824,-0.152547,-0.784127,-0.439564
75%,0.457549,0.318046,-0.420508,0.333456
max,1.234784,0.517737,0.975008,0.671184


In [15]:
# Transpose your data (rows become columns and vice versa):
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.545794,-0.726781,0.192811,-1.004398,-0.586459,1.234784
B,0.517737,0.466654,-1.622321,-0.776126,-0.177316,-0.127779
C,-0.310385,-0.750876,0.975008,-1.707817,-0.951355,-0.817377
D,-0.378463,0.570763,0.671184,-1.162016,-0.500664,-1.390276


In [16]:
# DataFrame.sort_index() sorts by an axis; axis=1 with ascending=False
# orders the column labels in descending order:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.378463,-0.310385,0.517737,0.545794
2013-01-02,0.570763,-0.750876,0.466654,-0.726781
2013-01-03,0.671184,0.975008,-1.622321,0.192811
2013-01-04,-1.162016,-1.707817,-0.776126,-1.004398
2013-01-05,-0.500664,-0.951355,-0.177316,-0.586459
2013-01-06,-1.390276,-0.817377,-0.127779,1.234784


In [17]:
# DataFrame.sort_values() sorts the rows by the values of a column
# (here column "B", in ascending order):
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-03,0.192811,-1.622321,0.975008,0.671184
2013-01-04,-1.004398,-0.776126,-1.707817,-1.162016
2013-01-05,-0.586459,-0.177316,-0.951355,-0.500664
2013-01-06,1.234784,-0.127779,-0.817377,-1.390276
2013-01-02,-0.726781,0.466654,-0.750876,0.570763
2013-01-01,0.545794,0.517737,-0.310385,-0.378463


# Selection
Meskipun ekspresi standar Python/NumPy untuk pemilihan dan pengaturan bersifat intuitif dan berguna untuk pekerjaan interaktif, untuk kode produksi kami merekomendasikan metode akses data pandas yang dioptimalkan: DataFrame.at(), DataFrame.iat(), DataFrame.loc(), dan DataFrame.iloc().

In [18]:
# Getitem ([])
# For a DataFrame, passing a single label selects a column and yields a
# Series equivalent to df.A:
df["A"]

2013-01-01    0.545794
2013-01-02   -0.726781
2013-01-03    0.192811
2013-01-04   -1.004398
2013-01-05   -0.586459
2013-01-06    1.234784
Freq: D, Name: A, dtype: float64

In [19]:
# For a DataFrame, passing a slice (:) selects matching rows;
# an integer slice picks rows by position and excludes the end position:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.545794,0.517737,-0.310385,-0.378463
2013-01-02,-0.726781,0.466654,-0.750876,0.570763
2013-01-03,0.192811,-1.622321,0.975008,0.671184


In [20]:
# Another example: slicing rows by date-string labels
# (both endpoints are included in the result):
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,-0.726781,0.466654,-0.750876,0.570763
2013-01-03,0.192811,-1.622321,0.975008,0.671184
2013-01-04,-1.004398,-0.776126,-1.707817,-1.162016


## Selection by label

See more in Selection by Label using DataFrame.loc() or DataFrame.at().

Selecting a row matching a label:

In [21]:
# Select the row whose index label is dates[0]; the result is a Series
# indexed by the column labels:
df.loc[dates[0]]

A    0.545794
B    0.517737
C   -0.310385
D   -0.378463
Name: 2013-01-01 00:00:00, dtype: float64

In [22]:
# Select all rows (:) for the column labels "A" and "B":
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,0.545794,0.517737
2013-01-02,-0.726781,0.466654
2013-01-03,0.192811,-1.622321
2013-01-04,-1.004398,-0.776126
2013-01-05,-0.586459,-0.177316
2013-01-06,1.234784,-0.127779


In [23]:
# For label slicing, both endpoints are included:
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,-0.726781,0.466654
2013-01-03,0.192811,-1.622321
2013-01-04,-1.004398,-0.776126


In [24]:
# A single row label combined with a list of column labels returns a Series
# holding just the listed columns of that row:
df.loc[dates[0], ["A"]]

A    0.545794
Name: 2013-01-01 00:00:00, dtype: float64

## Selection by position
See more in Selection by Position using DataFrame.iloc() or DataFrame.iat().

Select via the position of the passed integers:

In [25]:
# A single integer selects the row at that position (here the fourth row)
# and returns it as a Series:
df.iloc[3]

A   -1.004398
B   -0.776126
C   -1.707817
D   -1.162016
Name: 2013-01-04 00:00:00, dtype: float64

In [29]:
# Integer slices act similar to NumPy/Python (end positions are excluded):
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.004398,-0.776126
2013-01-05,-0.586459,-0.177316


In [30]:
# Select by explicit lists of integer positions (rows first, then columns):
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.726781,-0.750876
2013-01-03,0.192811,0.975008
2013-01-05,-0.586459,-0.951355


In [34]:
# For slicing rows explicitly (the bare ":" keeps every column):
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.726781,0.466654,-0.750876,0.570763
2013-01-03,0.192811,-1.622321,0.975008,0.671184


In [33]:
# For slicing columns explicitly (the bare ":" keeps every row):
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.517737,-0.310385
2013-01-02,0.466654,-0.750876
2013-01-03,-1.622321,0.975008
2013-01-04,-0.776126,-1.707817
2013-01-05,-0.177316,-0.951355
2013-01-06,-0.127779,-0.817377


In [36]:
# For getting fast access to a single scalar by integer position
# (equivalent to df.iloc[1, 1]):
df.iat[1, 1]

0.4666538504696217

## Boolean indexing

Select rows where df.A is greater than 0.