In [6]:
import pandas as pd

# Load the property listing CSV and preview its first rows.
csv_path = "/Users/stu/Desktop/Python Exercise/data/property-data.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,--
2,100003000.0,,LEXINGTON,N,,1.0,850
3,100004000.0,201.0,BERKELEY,12,1.0,,700
4,,203.0,BERKELEY,Y,3.0,2.0,1600


上表包含了四种空数据：
* n/a
* NA
* `--`
* na

```
DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
参数说明：

axis：默认为 0，表示逢空值剔除整行；如果设置参数 axis=1，表示逢空值去掉整列。
how：默认为 'any' 如果一行（或一列）里任何一个数据有出现 NA 就去掉整行，如果设置 how='all' 一行（或列）都是 NA 才去掉这整行。
thresh：设置一行（或一列）至少需要有多少个非空值才会被保留下来。
subset：设置想要检查的列。如果是多个列，可以使用列名的 list 作为参数。
inplace：如果设置 True，将计算得到的值直接覆盖之前的值并返回 None，修改的是源数据。
```

In [7]:
# Show the NUM_BEDROOMS column, then the boolean mask of entries that
# pandas reports as null.
bedrooms = df["NUM_BEDROOMS"]
print(bedrooms)
print(bedrooms.isnull())

0      3
1      3
2    NaN
3      1
4      3
5    NaN
6      2
7      1
8     na
Name: NUM_BEDROOMS, dtype: object
0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8    False
Name: NUM_BEDROOMS, dtype: bool


# 读取时设置缺失值

In [8]:
# Extra strings that should be parsed as missing values when reading the CSV.
missing_values = ["n/a", "na", "--"]
df = pd.read_csv(
    "/Users/stu/Desktop/Python Exercise/data/property-data.csv",
    na_values=missing_values,
)

In [9]:
# Show the NUM_BEDROOMS column again, followed by its null mask, after the
# frame was re-read with the custom na_values list.
bedrooms = df["NUM_BEDROOMS"]
print(bedrooms)
print(bedrooms.isnull())

0    3.0
1    3.0
2    NaN
3    1.0
4    3.0
5    NaN
6    2.0
7    1.0
8    NaN
Name: NUM_BEDROOMS, dtype: float64
0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8     True
Name: NUM_BEDROOMS, dtype: bool


## 删除 dropna

In [13]:
import pandas as pd

# Re-read the CSV with pandas' default missing-value parsing.
df = pd.read_csv("/Users/stu/Desktop/Python Exercise/data/property-data.csv")

# Keep only the rows in which no column is NaN (axis=0 / how="any" are the
# defaults, spelled out here for clarity). The original frame is untouched.
df_new = df.dropna(axis=0, how="any")
df_new

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
8,100009000.0,215.0,TREMONT,Y,na,2.0,1800


In [12]:
# Print a summary of df_new: index range, per-column non-null counts and dtypes.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PID           1 non-null      float64
 1   ST_NUM        1 non-null      float64
 2   ST_NAME       1 non-null      object 
 3   OWN_OCCUPIED  1 non-null      object 
 4   NUM_BEDROOMS  1 non-null      float64
 5   NUM_BATH      1 non-null      object 
 6   SQ_FT         1 non-null      float64
dtypes: float64(4), object(3)
memory usage: 64.0+ bytes


In [20]:
# Render df_new as one plain-text table.
df_string = df_new.to_string()
# The result is already a plain Python string (str),
# not a DataFrame.

print(df_string)
type(df_string)

           PID  ST_NUM    ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0     PUTNAM            Y            3        1  1000
1  100002000.0   197.0  LEXINGTON            N            3      1.5    --
8  100009000.0   215.0    TREMONT            Y           na        2  1800


str

In [21]:
# Display the current contents of df.
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [22]:
# Drop only the rows whose ST_NUM is NaN; NaN values in other columns are
# not considered because subset restricts the check to ST_NUM.
df_ST_NUM = df.dropna(subset = ["ST_NUM"])
df_ST_NUM

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1.0,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2.0,1600
5,100006000.0,207.0,BERKELEY,Y,,1.0,800
7,100008000.0,213.0,TREMONT,Y,1,1.0,
8,100009000.0,215.0,TREMONT,Y,na,2.0,1800


# Fillna

In [27]:
import pandas as pd

# Read the CSV and fill every missing cell in the frame with 12345.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html?highlight=fillna#pandas.DataFrame.fillna
df = pd.read_csv('/Users/stu/Desktop/Python Exercise/data/property-data.csv').fillna(12345)
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,12345.0,LEXINGTON,N,12345,1,850
3,100004000.0,201.0,BERKELEY,12,1,12345,700
4,12345.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,12345,1,800
6,100007000.0,12345.0,WASHINGTON,12345,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,12345
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [28]:
# Replace the missing values of the PID column (and only that column) with 12345.
import pandas as pd
df = pd.read_csv('/Users/stu/Desktop/Python Exercise/data/property-data.csv')
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# df["PID"] modifies an intermediate Series: pandas warns about this chained
# in-place update, and with Copy-on-Write enabled df is left unchanged.
df["PID"] = df["PID"].fillna(12345)
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,12345.0,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


### Pandas使用 mean()、median() 和 mode() 方法计算列的均值（所有值加起来的平均值）、中位数值（排序后排在中间的数）和众数（出现频率最高的数）。

In [32]:
# Replace the missing values of the ST_NUM column with the column's mode
# (its most frequent value).
import pandas as pd
df = pd.read_csv('/Users/stu/Desktop/Python Exercise/data/property-data.csv')
print(df)
# Series.mode() returns a Series because several values can tie for most
# frequent; take the first one as a scalar. Passing the whole Series to
# fillna would align it on index labels and fill each gap with whichever
# mode happens to share that row label, not with "the" mode.
zhongshu = df["ST_NUM"].mode().iloc[0]

# Assign the result back instead of using inplace=True on a column selection,
# which acts on an intermediate Series and does not update df under
# pandas Copy-on-Write.
df["ST_NUM"] = df["ST_NUM"].fillna(zhongshu)
print(df)

           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000
1  100002000.0   197.0   LEXINGTON            N            3      1.5    --
2  100003000.0     NaN   LEXINGTON            N          NaN        1   850
3  100004000.0   201.0    BERKELEY           12            1      NaN   700
4          NaN   203.0    BERKELEY            Y            3        2  1600
5  100006000.0   207.0    BERKELEY            Y          NaN        1   800
6  100007000.0     NaN  WASHINGTON          NaN            2   HURLEY   950
7  100008000.0   213.0     TREMONT            Y            1        1   NaN
8  100009000.0   215.0     TREMONT            Y           na        2  1800
           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000
1  100002000.0   197.0   LEXINGTON            N            3      1.5    --
2  100003000

# 清洗错误格式
1. 日期类型

In [35]:
import pandas as pd

# The third date is deliberately written in a different layout (no separators).
data = {
    "Date": ["2020/12/01", "2020/12/02", "20201226"],
    "duration": [50, 40, 45],
}

# Build the frame and label its rows in a single step.
df = pd.DataFrame(data, index=["day1", "day2", "day3"])
df

Unnamed: 0,Date,duration
day1,2020/12/01,50
day2,2020/12/02,40
day3,20201226,45


In [38]:
# Convert the Date column to datetimes, parsing each entry on its own so that
# rows written in different layouts ('2020/12/01' vs '20201226') are all
# accepted. A single vectorized pd.to_datetime(df["Date"]) call infers one
# format from the first value on pandas >= 2.0 and raises ValueError for the
# entry that does not match it.
df["Date"] = df["Date"].apply(pd.to_datetime)
df

Unnamed: 0,Date,duration
day1,2020-12-01,50
day2,2020-12-02,40
day3,2020-12-26,45


In [43]:
import pandas as pd

# The last age (12345) is an obviously invalid entry.
person = {
    "name": ["Google", "Runoob", "Taobao"],
    "age": [50, 40, 12345],
}

df = pd.DataFrame(person)

# Overwrite the one bad cell, addressed by row label and column name.
bad_row, corrected_age = 2, 30
df.loc[bad_row, "age"] = corrected_age
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,40
2,Taobao,30


In [50]:
import pandas as pd

# The last age (12345) is an obviously invalid entry.
person = dict(
    name=["Google", "Runoob", "Taobao"],
    age=[50, 40, 12345],
)

df = pd.DataFrame(person)

# Inspect the row labels, then the column labels, of the new frame.
print(df.index)
print(df.columns)

RangeIndex(start=0, stop=3, step=1)
Index(['name', 'age'], dtype='object')


In [48]:
# Cap every age that is 50 or above at 50, using one boolean-mask assignment
# instead of visiting the rows one by one.
at_or_over_cap = df["age"] >= 50
df.loc[at_or_over_cap, "age"] = 50
print(df.to_string())

     name  age
0  Google   50
1  Runoob   40
2  Taobao   50


In [52]:
import pandas as pd

# The last age (12345) is an obviously invalid entry.
person = {
    "name": ["Google", "Runoob", "Taobao"],
    "age": [50, 40, 12345],
}

df = pd.DataFrame(person)
print(df.index)
print(df.columns)

# Collect the labels of all rows whose age is 120 or more and remove them in
# a single drop call.
invalid_rows = df.index[df["age"] >= 120]
df = df.drop(index=invalid_rows)
print(df.to_string())

RangeIndex(start=0, stop=3, step=1)
Index(['name', 'age'], dtype='object')
     name  age
0  Google   50
1  Runoob   40


# 清洗重复数据

* 如果我们要清洗重复数据，可以使用 duplicated() 和 drop_duplicates() 方法。

* 如果对应的数据是重复的，duplicated() 会返回 True，否则返回 False。

In [53]:
import pandas as pd

# The second and third records are identical.
person = {
    "name": ["Google", "Runoob", "Runoob", "Taobao"],
    "age": [50, 40, 40, 23],
}
df = pd.DataFrame(person)

# Boolean mask that is True for each row repeating an earlier row
# (keep="first" is the default, written out for clarity).
df.duplicated(keep="first")

0    False
1    False
2     True
3    False
dtype: bool

In [54]:
import pandas as pd

# The second and third records are identical.
person = {
    "name": ["Google", "Runoob", "Runoob", "Taobao"],
    "age": [50, 40, 40, 23],
}

# Build the frame and discard repeated rows, keeping each first occurrence;
# the surviving rows keep their original index labels.
df = pd.DataFrame(person).drop_duplicates()
df

Unnamed: 0,name,age
0,Google,50
1,Runoob,40
3,Taobao,23
