In [9]:
import pandas as pd
import glob

# 指定列名和需要转换为字符串的列
columns_to_str = ["State of Residence Code", "Year Code", "Month Code", "Infant Birth Weight 12 Code"]

# 读取文件并强制指定列为字符串
file_paths = glob.glob('USA/*.txt')  # 替换为实际的文件路径
df_list = [pd.read_csv(file, sep='\t', dtype={col: str for col in columns_to_str}, header=0) for file in file_paths]
df = pd.concat(df_list, ignore_index=True)
# 显示前几行数据确认读取结果
print(df.tail())


      出生 6 月  Notes State of Residence State of Residence Code    Year  \
27429    NaN    NaN            Wyoming                      56  2020.0   
27430    NaN    NaN            Wyoming                      56  2020.0   
27431    NaN    NaN            Wyoming                      56  2020.0   
27432    NaN    NaN            Wyoming                      56  2020.0   
27433    NaN    NaN            Wyoming                      56  2020.0   

      Year Code     Month Month Code Infant Birth Weight 12  \
27429      2020  December         12      2000 - 2499 grams   
27430      2020  December         12      2500 - 2999 grams   
27431      2020  December         12      3000 - 3499 grams   
27432      2020  December         12      3500 - 3999 grams   
27433      2020  December         12      4000 - 4499 grams   

      Infant Birth Weight 12 Code  Births  
27429                          05    20.0  
27430                          06   115.0  
27431                          07   222.0  


In [None]:
# 筛选临海23洲
states = df["State of Residence"].unique().tolist()


selected_states = [
    "Alabama", "California", "Connecticut", "Delaware", 
    "Florida", "Georgia", "Hawaii", "Louisiana", "Maine", 
    "Maryland", "Massachusetts", "Mississippi", "New Hampshire", 
    "New Jersey", "New York", "North Carolina", "Oregon", 
    "Rhode Island", "South Carolina", "Virginia", 
    "Washington"
]
print(len(selected_states))

# selected_states = [
#     "Alabama"
# ]

df = df[df["State of Residence"].isin(selected_states)]
df.tail()

52


Unnamed: 0,出生 6 月,Notes,State of Residence,State of Residence Code,Year,Year Code,Month,Month Code,Infant Birth Weight 12,Infant Birth Weight 12 Code,Births
27156,,,Washington,53,2020.0,2020,December,12,3000 - 3499 grams,7,2429.0
27157,,,Washington,53,2020.0,2020,December,12,3500 - 3999 grams,8,1855.0
27158,,,Washington,53,2020.0,2020,December,12,4000 - 4499 grams,9,552.0
27159,,,Washington,53,2020.0,2020,December,12,4500 - 4999 grams,10,70.0
27160,,,Washington,53,2020.0,2020,December,12,Unknown or Not Stated,12,13.0


In [11]:
# 计算Ratio
df[0:10]
# 定义 weight < 2500 grams 的条件
weight_threshold = [
    "499 grams or less",
   	"500 - 999 grams",
   	"1000 - 1499 grams",
   	"1500 - 1999 grams",
    "2000 - 2499 grams"
]

# 创建一列标记 weight < 2500 grams 的行
df["Weight < 2500"] = df["Infant Birth Weight 12"].isin(weight_threshold)

# 按 State, Year, Month 分组，计算比例
result = (
    df.groupby(["State of Residence", "Year", "Month Code"])
    .apply(lambda group: group.loc[group["Weight < 2500"], "Births"].sum() / group["Births"].sum())
    .reset_index(name="LBW_ratio")
)
df_ratio = result.rename(columns={
	"State of Residence":"state",
	"Year":"birth_year",
	"Month Code":"birth_month"
})
print(df_ratio)

           state  birth_year birth_month  LBW_ratio
0        Alabama      2016.0           1   0.109189
1        Alabama      2016.0          10   0.102740
2        Alabama      2016.0          11   0.107755
3        Alabama      2016.0          12   0.106042
4        Alabama      2016.0           2   0.088479
...          ...         ...         ...        ...
1375  Washington      2020.0           5   0.065539
1376  Washington      2020.0           6   0.066526
1377  Washington      2020.0           7   0.067721
1378  Washington      2020.0           8   0.064163
1379  Washington      2020.0           9   0.061260

[1380 rows x 4 columns]


  .apply(lambda group: group.loc[group["Weight < 2500"], "Births"].sum() / group["Births"].sum())


In [12]:
# 连接州经纬度
dfLL = pd.read_csv("USA/US_States_Coordinates.csv")
dfLL["longitude"] = -dfLL["longitude"]
dfLL["federal district"] = dfLL["federal district"].str.lstrip()
# print(dfLL)
# states = dfLL["federal district"].unique().tolist()
# print(states)
df3 = pd.merge(df_ratio, dfLL, left_on="state", right_on="federal district", how="inner")
df3.drop(columns=["federal district"], inplace=True)
df3.to_csv("USA_LBW.csv")