這段程式碼是利用處理好的output.csv檔案 \
進行KNN的學習使用，其中的演算方法如下：\
1.擷取出資料的特徵部分（去掉names跟output）\
2.進行資料的標準化（採用方法：設定標準差為1、平均為0 \
3.將標準化後的資料進行挖空，並儲存挖出來的值最後答案\
4.製作距離矩陣，計算挖空後的資料中，compound間的距離\
5.利用KNN去評估挖空出來的值\
6.評估模型



In [1]:
#套件下載
!pip install rdkit padelpy
from tqdm import tqdm
from sklearn import decomposition
from rdkit import Chem
from rdkit.Chem import Descriptors
from padelpy import from_smiles
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

Collecting rdkit
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting padelpy
  Downloading padelpy-0.1.14-py2.py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: padelpy, rdkit
Successfully installed padelpy-0.1.14 rdkit-2023.3.3


In [2]:
#讀取資料的特徵部分
data = pd.read_csv('output.csv')
data_charcteristic = data.iloc[:,3:]

#去除有NAN值的所有compound
data_cleaned = data_charcteristic.dropna()

#資料標準化
scaler = StandardScaler()
data_cleaned = scaler.fit_transform(data_cleaned)
columns = data_charcteristic.columns[:]
data_cleaned_df = pd.DataFrame(data_cleaned, columns=columns)

  data = pd.read_csv('output.csv')


In [3]:
# 定義要挖的空格比例（5%）
missing_percentage = 0.05

# 計算要挖的總數
total_missing = int(np.prod(data_cleaned_df.shape) * missing_percentage)

# 在資料中隨機選擇要挖空格的位置
missing_indices = np.random.choice(np.arange(np.prod(data_cleaned_df.shape)), total_missing, replace=False)
missing_indices = np.sort(missing_indices)

# 將選擇的位置設為缺失值
data_flattened = data_cleaned_df.values.flatten('F')
data_flattened[missing_indices] = np.nan

# 紀錄挖出來的值
missing_values = data_cleaned_df.values.flatten('F')[missing_indices]

# 將資料還原為原始形狀
data_with_missing = pd.DataFrame(data_flattened.reshape(data_cleaned_df.shape,order='F'), columns=data_cleaned_df.columns)


In [4]:
# 建立計算分子間距離的演算法
def cal_distance(data1, data2):
    # 找出共同的非 NaN 特徵
    common_features = set(data1.dropna().index) & set(data2.dropna().index)

    if common_features:
        # 將共同特徵轉換為列表
        common_features_list = list(common_features)
        # 計算差值並取平均
        differences = data1[common_features_list] - data2[common_features_list]
        distance = differences.abs().mean()
        return distance
    else:
        return np.nan  # 如果沒有共同的非 NaN 特徵，距離為 NaN



In [5]:
#建立兩兩compound間距離的矩陣
num_samples = len(data_with_missing)
distances_matrix = np.zeros((num_samples, num_samples))

for i in tqdm(range(num_samples)):
    for j in range(i+1,num_samples):
        distances_matrix[i, j] = cal_distance(data_with_missing.iloc[i], data_with_missing.iloc[j])
distances_matrix += distances_matrix.T

np.savetxt('distances_matrix.txt', distances_matrix)
#讀取方式：distances_matrix = np.loadtxt('distances_matrix.txt')

100%|██████████| 751/751 [19:38<00:00,  1.57s/it]


In [6]:
#找出每個compound最接近的num個index
num = 6
sorted_indices = np.argsort(distances_matrix, axis=1)
top_indices = sorted_indices[:, 1:num+1]

In [7]:
# 遍歷每個feature找出NAN的值
predict = []

for feature in data_with_missing.columns:

    nan_indices = data_with_missing.index[data_with_missing[feature].isna()].tolist()

    for index in nan_indices:
        # 獲取最接近的 num 個 index 對應的特徵值
        values = data_with_missing.loc[top_indices[index], feature]
        # 計算總和（忽略 NaN 值）
        total = values.sum(skipna=True)
        # 計算有效值的個數
        count = values.count()
        # 計算平均值作為代表，放到預測結果中
        predict.append(total / count)


In [15]:
#評估預測結果
predict = np.array(predict)
missing_values = np.array(missing_values)
squared_errors = (predict - missing_values) ** 2
mse = np.mean(squared_errors)
rmse = np.sqrt(mse)

print(rmse)

0.6083929013561303


In [18]:
#儲存距離矩陣、預測結果、原始挖空值、data_with_missing
np.savetxt('distances_matrix.txt', distances_matrix)
np.savetxt('predict.txt', predict)
np.savetxt('missing_values.txt', missing_values)
data_with_missing.to_csv('data_with_missing.csv', index=False)
data_cleaned_df.to_csv('data_cleaned_df.csv', index=False)