## 2.1 DataFrame 创建

In [1]:
import numpy as np
import pandas as pd

#### 1. 通过 dict 创建 DataFrame

##### 1.1 通过 dict of list(dict 中的 value 为 list) 创建 DataFrame，dict 的 key 作为 columns，value 作为 values

In [5]:
# Build a DataFrame from a dict of equal-length lists: every key becomes a
# column label and the matching list supplies that column's values.
data = dict(
    name=["Alice", "Bob", "Charlie"],
    age=[25, 30, 35],
    city=["New York", "Los Angeles", "Chicago"],
)
# No index is passed here, so pandas falls back to the default 0..n-1 labels.
df_1 = pd.DataFrame(data=data)
df_1

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


##### 1.2 通过 dict of list 创建 DataFrame，指定 index

In [4]:
# Same dict-of-lists input as before, but this time the row labels are
# supplied explicitly instead of relying on the default integer index.
data = dict(
    name=["Alice", "Bob", "Charlie"],
    age=[25, 30, 35],
    city=["New York", "Los Angeles", "Chicago"],
)
df_2 = pd.DataFrame(
    data=data,
    index=["S001", "S002", "S003"],  # explicit row labels
)
df_2

Unnamed: 0,name,age,city
S001,Alice,25,New York
S002,Bob,30,Los Angeles
S003,Charlie,35,Chicago


##### 1.3 通过 dict of Series 创建 DataFrame，Series 的 index 作为 DataFrame 的 index(自动对齐索引), columns 由 dict 的 key 组成

In [6]:
# Three Series that share the same row labels; one per future column.
s_1 = pd.Series(
    data=["Alice", "Bob", "Charlie"],
    index=["S001", "S002", "S003"],
)
s_2 = pd.Series(
    data=[25, 30, 35],
    index=["S001", "S002", "S003"],
)
s_3 = pd.Series(
    data=["New York", "Los Angeles", "Chicago"],
    index=["S001", "S002", "S003"],
)
# A dict of Series: the keys become the column labels and the Series are
# aligned on their index labels to form the rows.
df_3 = pd.DataFrame(dict(name=s_1, age=s_2, city=s_3))
df_3

Unnamed: 0,name,age,city
S001,Alice,25,New York
S002,Bob,30,Los Angeles
S003,Charlie,35,Chicago


#### 2. 通过 list(二维列表) 创建 DataFrame，list 本身不带 index 和 columns 信息，需要手动指定(不指定时默认为 0...n-1)

In [7]:
# Each inner list is one row; a plain nested list carries no labels, so both
# the column names and the row labels are passed to the constructor.
list_1 = [
    ["Alice", 25, "New York"],
    ["Bob", 30, "Los Angeles"],
    ["Charlie", 35, "Chicago"],
]
df_4 = pd.DataFrame(
    data=list_1,
    index=["S001", "S002", "S003"],
    columns=["name", "age", "city"],
)
df_4

Unnamed: 0,name,age,city
S001,Alice,25,New York
S002,Bob,30,Los Angeles
S003,Charlie,35,Chicago


#### 3. 通过 list of dict(list 中的每个元素为一个 dict) 创建 DataFrame，list 本身不带 index 信息(不指定时默认为 0...n-1)，dict 中的 key 作为 columns，value 作为 values

In [8]:
# Each dict is one record (row); its keys become the column labels.
list_2 = [
    dict(name="Alice", age=25, city="New York"),
    dict(name="Bob", age=30, city="Los Angeles"),
    dict(name="Charlie", age=35, city="Chicago"),
]
# The records carry no row labels, so they are supplied via `index`.
df_5 = pd.DataFrame(
    data=list_2,
    index=["S001", "S002", "S003"],
)
df_5

Unnamed: 0,name,age,city
S001,Alice,25,New York
S002,Bob,30,Los Angeles
S003,Charlie,35,Chicago


#### 4. 通过 ndarray 创建 DataFrame，适合数值型数据，需要手动添加 columns 和 index

In [9]:
# Draw a 3x3 array of uniform floats in [0, 1) from a *seeded* generator so
# that re-running the notebook reproduces exactly the same table.
arr_1 = np.random.default_rng(42).random((3, 3))
# A bare ndarray has no labels, so columns and index are passed explicitly.
df_6 = pd.DataFrame(
    arr_1,
    columns=["A", "B", "C"],
    index=["S001", "S002", "S003"],
)
df_6

Unnamed: 0,A,B,C
S001,0.343424,0.685204,0.28889
S002,0.082865,0.952777,0.113709
S003,0.208071,0.521991,0.832192


#### 5. 通过文件读取的方式创建 DataFrame，常见的文件格式有 CSV, Excel, JSON 等

In [12]:
# Load the CSV file into a DataFrame; the path is relative to the notebook's
# working directory.
df_7 = pd.read_csv(
    filepath_or_buffer="../../02 Programming for Data Science/LO3/autompg.csv"
)
# Preview the first five rows.
df_7.head(5)

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


#### 6. 创建完成之后查看 DataFrame 的基本信息

In [15]:
# Quick inspection of the loaded DataFrame.
print(df_7.head())     # first 5 rows
print(df_7.tail())     # last 5 rows
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_7.info()
print(df_7.shape)      # (number of rows, number of columns)
print(df_7.columns)    # column labels
print(df_7.index)      # row labels

    mpg  cyl  displ   hp  weight  accel  yr  origin                       name
0  18.0    8  307.0  130    3504   12.0  70       1  chevrolet chevelle malibu
1  15.0    8  350.0  165    3693   11.5  70       1          buick skylark 320
2  18.0    8  318.0  150    3436   11.0  70       1         plymouth satellite
3  16.0    8  304.0  150    3433   12.0  70       1              amc rebel sst
4  17.0    8  302.0  140    3449   10.5  70       1                ford torino
      mpg  cyl  displ  hp  weight  accel  yr  origin             name
387  27.0    4  140.0  86    2790   15.6  82       1  ford mustang gl
388  44.0    4   97.0  52    2130   24.6  82       2        vw pickup
389  32.0    4  135.0  84    2295   11.6  82       1    dodge rampage
390  28.0    4  120.0  79    2625   18.6  82       1      ford ranger
391  31.0    4  119.0  82    2720   19.4  82       1       chevy s-10
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #