In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [60]:
# 读取数据
df = pd.read_csv("Cleaned_Data.csv")

# 删除目标变量中为 NaN 的行
df.dropna(subset=['Assistance Amount'], inplace=True)

# 特征和目标变量
X = df.drop(["Assistance Amount", "Recommendation"], axis=1)
y = df['Assistance Amount']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num_cols = []
cat_cols = []

for col in X:
    if df[col].dtypes == object:
        cat_cols.append(col)
    else:
        num_cols.append(col)

# 定义预处理步骤
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # 填充缺失值
            ('scaler', StandardScaler())
        ]), num_cols),
        ('cat', OneHotEncoder(), cat_cols)
    ])


max_score = 0
max_i = -1

# 创建 KNN 回归管道
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=9,metric='cosine'))
])

# 训练模型
knn_pipeline.fit(X_train, y_train)

# 评估模型
score = knn_pipeline.score(X_test, y_test)
print(f"R^2 Score: {score}")

R^2 Score: 0.27251432842328005


In [61]:
# 随机抽取一条数据
random_index = np.random.randint(0, X.shape[0])
random_sample = X.iloc[random_index:random_index+1]

# 预处理该数据
preprocessed_sample = preprocessor.transform(random_sample)

# 使用 KNN 进行预测
knn_model = knn_pipeline.named_steps['regressor']
distances, indices = knn_model.kneighbors(preprocessed_sample)

# 输出预测结果
predicted_value = knn_model.predict(preprocessed_sample)
print(f"Predicted Assistance Amount for the random sample: {predicted_value[0]}")

# 查看使用的近邻数据
print("Indices of nearest neighbors:", indices)
print("Distances to nearest neighbors:", distances)

# 输出近邻数据
nearest_neighbors = X_train.iloc[indices[0]]
print("Nearest neighbors data:")
print(nearest_neighbors)


# 获取随机抽取的数据的目标变量值
random_sample_y = y.iloc[random_index:random_index+1]

# 将目标变量值添加到随机抽取的数据中
random_sample_with_y = random_sample.copy()
random_sample_with_y['Assistance Amount'] = random_sample_y.values

# 获取近邻数据的目标变量值
nearest_neighbors_y = y_train.iloc[indices[0]]

# 将目标变量值添加到近邻数据中
nearest_neighbors_with_y = nearest_neighbors.copy()
nearest_neighbors_with_y['Assistance Amount'] = nearest_neighbors_y.values

# 将随机抽取的数据和近邻数据合并
combined_data = pd.concat([random_sample_with_y, nearest_neighbors_with_y], keys=['Random Sample', 'Nearest Neighbors'])

# 将合并后的数据输出为一个 CSV 文件
combined_data.to_csv("combined_sample_and_neighbors_with_y.csv")

Predicted Assistance Amount for the random sample: 266.3733333333333
Indices of nearest neighbors: [[138  39 212 238  34  46 201 191 208]]
Distances to nearest neighbors: [[0.         0.33565715 0.37467852 0.3795278  0.38915658 0.40914481
  0.41457917 0.42023733 0.42569082]]
Nearest neighbors data:
          Type of Assistance Applied    Care Team  Gender    Age      Race  \
1284  medical consumables assistance     east 4 4    male   97.0  chinese    
1288  medical consumables assistance  central 1 1  female  100.0  chinese    
69    medical consumables assistance     east 4 4  female   94.0  chinese    
143   medical consumables assistance    south 2 2  female   80.0  chinese    
922   medical consumables assistance     east 3 3    male   58.0  chinese    
157   medical consumables assistance    south 2 2    male   75.0  chinese    
28    medical consumables assistance  central 2 2  female  100.0  chinese    
760   medical consumables assistance  central 2 2  female   60.0  chinese   