In [1]:
import pandas as pd
import re

In [2]:
# Load the raw export; the file is read as Latin-1 rather than the UTF-8 default.
data = pd.read_csv("Data.csv", encoding="latin1")

# Strip surrounding whitespace from the header names so that later
# lookups by column name match exactly.
data.columns = data.columns.str.strip()

# Parse both date columns into pandas datetime values.
for date_col in ("Intake Details_Submission Date", "Assessment Date & Time"):
    data[date_col] = pd.to_datetime(data[date_col])


In [3]:
def clean_text(text):
    """Normalise a single cell value.

    Strings have every "?" character removed and are converted to lower
    case. No other punctuation is touched. Non-string values (numbers,
    NaN, None, timestamps, ...) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return text.replace("?", "").lower()

# Apply clean_text to every column whose dtype is object; other columns
# (numeric, datetime) are left as they are.
object_cols = [col for col in data.columns if data[col].dtype == 'object']
for col in object_cols:
    data[col] = data[col].apply(clean_text)

data

Unnamed: 0,Contact ID,Intake Details_Submission Date,Type of Assistance Applied,Care Team,Gender,Age,Race,Occupation,Housing Type,Home Ownership,...,Copayment,Case Profile,No. of HH,Remarks (No. of HH),Before Primary,Primary (7-12),Secondary (13-17),Tertiary (18-21),Adult (22-64),Elderly (65 and above)
0,883,2020-03-06,education fees assistance,central 1 1,female,13.0,chinese,student,hdb 3 room,lodging,...,,,,,,,,,,
1,910,2021-06-03,interim dialysis assistance,central 1 1,male,57.0,malay,unemployed,hdb 3 room,lodging,...,$-,,,,,,,,,
2,914,2020-12-09,medical consumables assistance,central 1 1,male,32.0,chinese,unemployed,hdb 5 room,"owned, not fully paid",...,,,,,,,,,,
3,917,2020-09-08,household living assistance,central 1 1,male,69.0,indian,,hdb 1 room,"owned, fully paid",...,,,,,,,,,,
4,925,2021-03-04,household living assistance,central 1 1,male,70.0,chinese,,rent a room,rented,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1441,12722,2025-02-05,education fees assistance,south 1 1,male,14.0,chinese,student,hdb 3 room,other,...,,"single parent , schooling children",3.0,,,,1.0,,1.0,1.0
1442,12723,2025-02-05,household living assistance,central 1 1,female,37.0,malay,part-timer art teacher,hdb 4 room,"owned, not fully paid",...,,"renal patient , schooling children , disabilit...",4.0,,1.0,1.0,,,2.0,
1443,12744,2025-02-18,education fees assistance,south 3 3,female,19.0,malay,student,hdb 4 room,"owned, not fully paid",...,,schooling children,7.0,,1.0,1.0,,2.0,3.0,
1444,12755,2025-02-25,medical transport assistance,east 4 4,male,67.0,chinese,retiree,nursing home,other,...,,"living alone elderly , renal patient",1.0,,,,,,,1.0


In [4]:
# Count the rows per distinct value of "Type of Assistance Applied"
# (raw labels, so combined entries such as "a , b" are counted separately).
assistance_counts = data["Type of Assistance Applied"].value_counts()

# Show the counts.
print(assistance_counts)

Type of Assistance Applied
interim dialysis assistance                                                                     576
medical consumables assistance                                                                  343
household living assistance                                                                     209
medical transport assistance                                                                    203
education fees assistance                                                                        30
hiv medication fees                                                                              19
day care fees assistance                                                                         18
hiv medication fees , merit                                                                      10
antenatal check fees assistance                                                                  10
befriending service                                                      

In [5]:
# Number of assessments per calendar year, in chronological order.
# The years print as floats (e.g. 2020.0) - presumably because some
# assessment dates are missing; value_counts() skips those rows.
assessment_years = data["Assessment Date & Time"].dt.year
year_counts = assessment_years.value_counts().sort_index()

print(year_counts)

Assessment Date & Time
2002.0      1
2015.0      2
2017.0      3
2018.0      1
2019.0      2
2020.0     44
2021.0     77
2022.0    247
2023.0    443
2024.0    442
2025.0     62
Name: count, dtype: int64


In [6]:
def clean_dollar(text):
    """Reduce a currency string to a numeric string for ``pd.to_numeric``.

    Everything except digits and the decimal point is removed (currency
    symbols, thousands separators, spaces, ...). Unlike a plain character
    filter, the sign of the amount is preserved: a minus sign placed before
    the first digit ("-$50", "$-50") or accounting parentheses ("($50)")
    yield a result prefixed with "-". Without this, negative amounts would
    silently turn positive.

    Parameters
    ----------
    text : Any
        Cell value. Only ``str`` values are processed.

    Returns
    -------
    Any
        The cleaned string for ``str`` input; otherwise ``text`` unchanged.
        Strings containing no digit at all (for example "$-") are returned
        filtered but never signed, so they parse exactly as before.
    """
    if not isinstance(text, str):
        return text

    # Keep only digits and the decimal point.
    digits = re.sub(r"[^\d.]", "", text)
    if not re.search(r"\d", digits):
        # Nothing numeric to sign (e.g. the "$-" placeholder -> "").
        return digits

    stripped = text.strip()
    # Accounting notation: the whole amount wrapped in parentheses.
    in_parens = stripped.startswith("(") and stripped.endswith(")")
    # A minus before the first digit, optionally mixed with "$", "(" or spaces.
    # A minus between digits (e.g. a range "50-60") is deliberately not a sign.
    leading_minus = re.match(r"^[\s$(]*-[\s$]*[\d.]", stripped) is not None

    if in_parens or leading_minus:
        return "-" + digits
    return digits


# Columns treated as monetary amounts.
dollar_cols = [
    "Assistance Amount",
    "Copayment",
    "Monthly Rental",
    "I&E Difference (self-declaration)",
    "I&E Difference (assessment)",
    "Invoice Amount",
]

# Clean each money column and convert it to a numeric dtype in one step.
# pd.to_numeric raises if a cleaned value still cannot be parsed.
for col in dollar_cols:
    data[col] = pd.to_numeric(data[col].apply(clean_dollar))

# Inspect the resulting column dtypes.
data.dtypes

Contact ID                                                        int64
Intake Details_Submission Date                           datetime64[ns]
Type of Assistance Applied                                       object
Care Team                                                        object
Gender                                                           object
Age                                                             float64
Race                                                             object
Occupation                                                       object
Housing Type                                                     object
Home Ownership                                                   object
Monthly Mortgage (CPF/Cash)                                      object
Monthly Rental                                                  float64
Identity_ID Type                                                 object
Intake_No. of HH                                                

In [7]:
# Merge assistance categories: when a cell lists several comma-separated
# types, keep only the first one (trimmed of surrounding whitespace).
# The vectorised .str accessor is used instead of a Python lambda so that
# missing values stay NaN rather than raising AttributeError on .split().
data["Type of Assistance Applied"] = (
    data["Type of Assistance Applied"].str.split(",").str[0].str.strip()
)

# Count the rows per merged assistance type.
assistance_counts = data['Type of Assistance Applied'].value_counts()

# Show the counts.
print(assistance_counts)

Type of Assistance Applied
interim dialysis assistance        578
medical consumables assistance     344
household living assistance        213
medical transport assistance       207
education fees assistance           31
hiv medication fees                 29
day care fees assistance            18
antenatal check fees assistance     11
befriending service                  9
others                               3
one-time assistance                  2
home decluttering service            1
Name: count, dtype: int64


In [8]:
# Normalise the free-text Occupation column.
# Maps each canonical category to the raw (already lower-cased) spellings
# that should be folded into it. A few entries are repeated; that is
# harmless because the lists are only used for membership tests.
# NOTE(review): "nil", "not sure" and "n.a." are counted as unemployed
# rather than missing - confirm this is intended.
occupation = {
    "unemployed": [
        "unfit for work",
        "unemployed",
        "unemployed unfit for work",
        "unemployedunfit for work",
        "unemployedmedically unfit to work",
        "unemployed (medically unfit for work)",
        "unemployed (paraplegic since 2005 accident)",
        "unemployedmedically unfit for work",
        "unempolyed",
        "not employed",
        "not working",
        "nil",
        "unemployed (due to medical conditions)",
        "unemployed/ can collector",
        "unfit for work",
        "not sure",
        "uneployed",
        "unempployed",
        "unemployed due to medical reasons",
        "unemployed (unfit for work)",
        "unemployed (unfit for work)",
        "unemployed (elderly)",
        "unemployed (medically unfit)",
        "unfit to work",
        "unfit for employment",
        "n.a.",
        "unemployed- permantly unfit for work",
        "unemployed due to medical conditions",
        "not been employed for the past 7 years",
    ],
    "retired": [
        "retired",
        "retiree",
        "tetiree",
        "reitree",
        "retired",
        "retired cleaner",
        "na - retiree",
        "retired cleaner",
        "retiree",
    ],
    "part-time": [
        "part timer",
        "part-time actor",
        "part time baker",
        "part time f&b",
        "part-time tutor",
        "part-time lala move driver",
        "part-timer",
        "part-time admin",
        "part-time service crew",
        "part-time security officer",
        "part time macdonald",
        "part time security",
        "part-time cashier",
        "part-time shop assistant",
        "part-time security guard",
        "part-time cleaner",
        "part-time stall assistant",
        "part-time",
        "part-time photo developer",
        "part time kitchen helper",
        "part-timer art teacher",
    ],
    "student": ["student", "nus y2 student"],
}


# Raw occupation strings that matched no category above and were therefore
# labelled "employed"; kept so they can be reviewed by hand.
employed_set = set()


def unify_occupation(text):
    """Map a raw occupation value to one of the canonical categories.

    Parameters
    ----------
    text : str or missing value
        Raw occupation cell.

    Returns
    -------
    str
        * ``"NA"`` for missing values, blank strings, and the ``"NA"``
          sentinel itself. The last case makes the function idempotent:
          re-applying it to an already processed column no longer turns
          "NA" rows into "employed".
        * the key of ``occupation`` whose alias list contains the
          whitespace-stripped text;
        * ``"employed"`` for anything else, in which case the stripped text
          is also recorded in ``employed_set``.
    """
    if pd.isna(text):
        return "NA"

    text = text.strip()
    if text == "" or text == "NA":
        return "NA"

    for category, aliases in occupation.items():
        if text in aliases:
            return category

    # Unrecognised title: assume the person is working and log the raw value.
    # (Mutating the module-level set needs no `global` declaration.)
    employed_set.add(text)
    return "employed"


# Replace each raw occupation with its canonical category.
data["Occupation"] = data["Occupation"].apply(unify_occupation)

# Count the rows per occupation category.
occupation_counts = data["Occupation"].value_counts()

# Show the counts.
print(occupation_counts)
# To review the raw titles that fell through to "employed", uncomment:
# for i in employed_set:
#     print(i)

Occupation
unemployed    799
employed      258
retired       230
NA            101
student        34
part-time      24
Name: count, dtype: int64


In [9]:
# Drop columns that cannot be used downstream.
# Note: drop() raises KeyError for labels that are not present, so running
# this cell a second time on the same frame fails.
drop_cols = [
    "Contact ID",
    "Intake Details_Submission Date",
    "Monthly Mortgage (CPF/Cash)",
    "Reason for Rejection or Cancelled",
    "Type of assistances",
    "Assistance Details",
    "Case Profile",
    "Remarks (No. of HH)",
    "Assessment Date & Time",
]

data = data.drop(labels=drop_cols, axis="columns")

In [10]:
# Write the cleaned frame to disk without the row index.
# Caution: the Occupation column uses the literal string "NA" as a category;
# pandas.read_csv treats "NA" as a missing value by default, so reload with
# keep_default_na=False if that label must survive the round trip.
data.to_csv(path_or_buf="Cleaned_Data.csv", index=False)