# 相似度筛选

In [1]:
import os
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler,QuantileTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json

In [2]:
# 加载数据
df = pd.read_csv(r"F:\cache_data\frequency_filter\gl\soil_type_point.csv")

In [3]:
df.fillna(df.select_dtypes(include='number').mean(),inplace=True)

In [None]:
df.columns

In [5]:
# 区分点位
no_calc_df = df[df['label']=='inner']
train_df = df[df['label']=='fish_net']

In [6]:
# Predictor columns that are standardised and compared when measuring the
# cosine similarity between samples of one class.
feature_columns = ['analyticalhillshading', 'aspect',
       'channelnetworkbaselevel', 'channelnetworkdistance', 
       'convergenceindex',  'dem', 'dissimilarity', 'dl', 'dz',
       'entropy', 'etp2022', 'etp20221', 'etp202210', 'etp202211', 'etp202212',
       'etp20222', 'etp20223', 'etp20224', 'etp20225', 'etp20226', 'etp20227',
       'etp20228', 'etp20229', 'etp2022mean', 'evi', 'gl_slope_101',
       'lat', 'lon', 'lsfactor', 'lswi', 'mean', 'mndwi',
       'mrrtf', 'mrvbf', 'ndmi', 'ndvi', 'ndwi', 'night2022', 'pca1', 'pca2',
       'plancurvature', 'pre2022', 'pre20221', 'pre202210', 'pre202211',
       'pre202212', 'pre20222', 'pre20223', 'pre20224', 'pre20225', 'pre20226',
       'pre20227', 'pre20228', 'pre20229', 'pre2022mean', 'profilecurvature',
       'relativeslopeposition', 'savi',  'slope', 'tmp2022',
       'tmp20221', 'tmp202210', 'tmp202211', 'tmp202212', 'tmp20222',
       'tmp20223', 'tmp20224', 'tmp20225', 'tmp20226', 'tmp20227', 'tmp20228',
       'tmp20229', 'tmp2022mean', 'topographicwetnessindex',
       'totalcatchmentarea', 'valleydepth', 'vari']

In [7]:
# Column holding the class label; samples are grouped by its values before
# the per-class similarity filter runs.
label_column = 'TZ'

In [None]:
# Per-class similarity filter.
#
# For every class in `train_df[label_column]` the features are standardised
# within the class, pairwise cosine similarities are computed, and every sample
# that is similar (> threshold) to at least one other sample of the class is
# kept. A class with a single sample keeps that sample; a class where no pair
# exceeds the threshold keeps the samples with the highest total similarity.
#
# Rows are collected by their index label and taken from `df` in one go, so the
# resulting frame keeps the original column dtypes (going through `.values`
# would turn every column into `object`).

# Unique label values present in the training points.
label_values = train_df[label_column].unique()

# Cosine-similarity threshold above which two samples count as similar.
threshold = 0.80  # adjust as needed

# Share of a class that is kept when no pair of its samples passes the threshold.
# NOTE(review): the previous comment said "15%" while the code used 0.75;
# the code's value is kept here - confirm which one is intended.
fallback_fraction = 0.75

# Index labels (into df) of the samples selected so far.
representative_index = []

for label_value in label_values:
    # Samples carrying the current label.
    label_df = train_df[train_df[label_column] == label_value]
    num_samples = len(label_df)

    if num_samples == 0:
        # Happens for a NaN label (NaN never compares equal to itself);
        # there is nothing to scale or compare, so skip it.
        continue
    if num_samples == 1:
        # A lone sample is its own representative.
        representative_index.append(label_df.index[0])
        continue

    # Standardise the features within the class.
    scaler = StandardScaler()
    # scaler = MinMaxScaler()
    # scaler = MaxAbsScaler()
    # scaler = QuantileTransformer()
    features_scaled = scaler.fit_transform(label_df[feature_columns])

    # Pairwise cosine similarity between the samples of this class.
    similarity_matrix = cosine_similarity(features_scaled)

    # Every (i, j) with i < j whose similarity exceeds the threshold; both
    # members of such a pair are selected. The strict upper triangle skips
    # the diagonal and mirrored duplicates.
    pair_rows, pair_cols = np.nonzero(np.triu(similarity_matrix > threshold, k=1))
    selected_samples = set(pair_rows.tolist()) | set(pair_cols.tolist())

    if not selected_samples:
        print(label_value, f"共有样本{label_df.shape[0]}个","没有找到相似的样本")
        # No similar pair: fall back to the samples with the largest summed
        # similarity to the rest of the class.
        num_to_add = math.ceil(fallback_fraction * num_samples)
        top_indices = similarity_matrix.sum(axis=0).argsort()[::-1][:num_to_add]
        selected_samples.update(top_indices.tolist())

    # Translate positions inside label_df into index labels of df.
    representative_index.extend(label_df.index[sorted(selected_samples)])

# Selected rows, first with their original index, then renumbered from 0.
representative_samples = df.loc[representative_index]
representative_samples_df = representative_samples.reset_index(drop=True)


In [None]:
len(representative_samples_df)

In [None]:
# 检查类别是否丢失
old_type = train_df[label_column].value_counts()
new_type = representative_samples_df[label_column].value_counts()
old_type.shape,new_type.shape

In [11]:
# 再合并
result_df = pd.concat([representative_samples_df,no_calc_df],ignore_index=True)
# result_df = result_df.drop(columns=['label'])

In [None]:
# Display the merged table.
result_df

In [None]:
# Per-class row counts as a plain dict.
# NOTE(review): this reads 'NEW_TZ' while the filtering cells above group on
# 'TZ' - confirm that the column exists in this table.
result_df['NEW_TZ'].value_counts().to_dict()

In [14]:
# Write the merged table to CSV without the index column.
# NOTE(review): the input was read from a ...\gl\ folder but the output goes to
# ...\dy\ - confirm this is the intended destination.
out_path = r"F:\cache_data\frequency_filter\dy\scaler_csv"
result_df.to_csv(os.path.join(out_path,'dy_stander_filter_all_type_20240417.csv'),index=False)

In [20]:
# Read the file written by the previous cell back in.
df_filter = pd.read_csv(r"F:\cache_data\frequency_filter\dy\scaler_csv\dy_stander_filter_all_type_20240417.csv")

# 分类系统更新

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from collections import Counter
import warnings

# Silence the two noisy warning categories for the rest of the session.
for _category in (RuntimeWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_category)

# Read the point samples and fill numeric gaps with the column mean.
df = pd.read_csv(r"C:\Users\Runker\Desktop\test\csv\point_sample.csv")
numeric_means = df.select_dtypes(include='number').mean()
df.fillna(numeric_means, inplace=True)

# 'inner' points are set aside; 'fish_net' points are the training pool.
point_kind = df['label']
no_calc_df = df[point_kind == 'inner']
train_df = df[point_kind == 'fish_net']

# Candidate predictor columns for the feature-selection step.
feature_columns = [
    'aligned_Channel Network Base Level',
    'aligned_Channel Network Distance',
    'aligned_dem',
    'aligned_ETP2022_mean',
    'aligned_evi',
    'aligned_LS-Factor',
    'aligned_ndvi',
    'aligned_ndwi',
    'aligned_NIGHT2022',
    'aligned_pca_1',
    'aligned_PRE2022_mean',
    'aligned_Relative Slope Position',
    'aligned_savi',
    'aligned_Slope',
    'aligned_TMP2022_mean',
    'aligned_Topographic Wetness Index',
    'aligned_Total Catchment Area',
    'aligned_Valley Depth',
    'aligned_vari',
    'clipped_dem',
    'MRRTF',
    'MRVBF',
    'slope_postion_101',
]
# Column holding the class label.
label_column = 'NEW_TZ'

# Rank the candidate features by mutual information with the class label and
# keep the 15 highest-scoring ones. The seed is pinned (matching the
# random_state=42 used by the other steps in this cell) so the selected set
# does not change from run to run.
mi_scores = mutual_info_classif(
    train_df[feature_columns], train_df[label_column], random_state=42
)
mi_scores = pd.Series(mi_scores, index=feature_columns)
selected_features = mi_scores.nlargest(15).index.tolist()

# Standardise the selected features (zero mean, unit variance).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_df[selected_features])

# Project onto as many principal components as needed to explain 80% of the
# variance.
pca = PCA(n_components=0.80)
X_pca = pca.fit_transform(X_scaled)

# Adaptive sampling target: classes with fewer than 50 rows are raised to 50,
# larger classes keep their current size.
class_counts = Counter(train_df[label_column])
sampling_strategy = {cls: max(50, count) for cls, count in class_counts.items()}

# Random over-sampling in PCA space up to the per-class targets.
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_pca, train_df[label_column])

def select_representative_samples(X, y, n_clusters=5):
    """Pick one representative (point, label) pair per k-means cluster.

    Args:
        X: 2-D array-like of samples, one row per sample.
        y: 1-D array-like of labels aligned with the rows of ``X`` (a pandas
            Series, NumPy array or plain sequence; matched by position).
        n_clusters: Number of clusters, i.e. the number of representatives.

    Returns:
        A list of ``(point, label)`` tuples. When ``X`` has at most
        ``n_clusters`` rows, every row is returned paired with its label.
        Otherwise one tuple per cluster is returned: the member closest to the
        mean of the cluster's members.
    """
    # Work on plain arrays so positional indexing behaves the same for every
    # accepted input type.
    X = np.asarray(X)
    y = np.asarray(y)

    if len(X) <= n_clusters:
        return list(zip(X, y))

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X)

    # Seeded generator for the empty-cluster fallback, so repeated runs give
    # the same result (same seed as the clustering above).
    rng = np.random.RandomState(42)

    representatives = []
    for cluster in range(n_clusters):
        member_idx = np.flatnonzero(clusters == cluster)
        if member_idx.size:
            cluster_points = X[member_idx]
            center = cluster_points.mean(axis=0)
            # Squared Euclidean distance of every member to the cluster mean.
            distances = np.sum((cluster_points - center) ** 2, axis=1)
            chosen = member_idx[np.argmin(distances)]
        else:
            # If the cluster is empty, fall back to a random point from X.
            chosen = rng.randint(len(X))
        representatives.append((X[chosen], y[chosen]))

    return representatives

# Collect up to five representatives per class from the over-sampled data.
representative_samples = []
for label_value in np.unique(y_resampled):
    class_mask = y_resampled == label_value
    X_label = X_resampled[class_mask]
    y_label = pd.Series(y_resampled[class_mask])
    representative_samples += select_representative_samples(
        X_label, y_label, n_clusters=min(5, len(X_label))
    )

# Tabulate the representatives: one column per principal component plus the label.
pc_names = [f'PC{k}' for k in range(1, X_pca.shape[1] + 1)]
representative_samples_df = pd.DataFrame(
    [point for point, _ in representative_samples], columns=pc_names
)
representative_samples_df[label_column] = [label for _, label in representative_samples]

# Map the PCA scores back to the scale of the selected features.
pc_scores = representative_samples_df.drop(label_column, axis=1)
scaled_reconstruction = pca.inverse_transform(pc_scores)
X_original = scaler.inverse_transform(scaled_reconstruction)
representative_samples_df[selected_features] = X_original

# Append the untouched 'inner' points.
result_df = pd.concat([representative_samples_df, no_calc_df], ignore_index=True)

# Save results
# out_path = r"F:\cache_data\frequency_filter\dy\scaler_csv"
# result_df.to_csv(os.path.join(out_path, 'dy_optimized_filter_all_type_20240417.csv'), index=False)

# Report how many rows each stage holds.
print(f"Original training samples: {len(train_df)}")
print(f"Representative samples after filtering: {len(representative_samples_df)}")
print(f"Final result samples (including inner points): {len(result_df)}")

# Count the classes before and after filtering to reveal any that were lost.
old_type = train_df[label_column].value_counts()
new_type = representative_samples_df[label_column].value_counts()
print(f"Original categories: {len(old_type)}, Categories after filtering: {len(new_type)}")

# Compare 5-fold random-forest accuracy on the original rows and on the
# representatives, using the same selected features.
rf_original = RandomForestClassifier(random_state=42)
original_X = train_df[selected_features]
original_y = train_df[label_column]
original_scores = cross_val_score(rf_original, original_X, original_y, cv=5)

rf_filtered = RandomForestClassifier(random_state=42)
filtered_X = representative_samples_df[selected_features]
filtered_y = representative_samples_df[label_column]
filtered_scores = cross_val_score(rf_filtered, filtered_X, filtered_y, cv=5)

print(f"Original data cross-validation score: {np.mean(original_scores):.4f} (+/- {np.std(original_scores) * 2:.4f})")
print(f"Filtered data cross-validation score: {np.mean(filtered_scores):.4f} (+/- {np.std(filtered_scores) * 2:.4f})")

# Scatter of the training points on the first two principal components,
# coloured by class code.
import matplotlib.pyplot as plt
class_codes = train_df[label_column].astype('category').cat.codes
plt.figure(figsize=(12, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=class_codes, alpha=0.5)
plt.title('Data Distribution after PCA')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.colorbar(label='Soil Type Category')
plt.show()

In [None]:
result_df['TZ'].value_counts()

In [38]:
# 字典
json_file_path = 'D:\worker_code\Terrain_Test\data\soil_dict\soil_dict.json'
# 读取字典
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    loaded_dict = json.load(json_file)

In [39]:
# 表
df_path = r"F:\cache_data\frequency_filter\dy\scaler_csv\dy_stander_filter_all_type.csv"
df = pd.read_csv(df_path)

In [40]:
# 获取当前的土种信息
tz_list = list(df['TZ'].unique())

In [None]:
# 检查tz_list中的土种是否在对照字典中
check_list = [x in loaded_dict for x in tz_list]
in_dict = check_list.count(True)
not_in_dict = check_list.count(False)
in_dict,not_in_dict

In [42]:
# 获取现有数据土种字典
tz_dict = df.groupby('TZ').apply(lambda x: x[['TL', 'YL', 'TS']].drop_duplicates().to_dict(orient='records')).to_dict()


In [46]:

dfs = []

# 遍历字典，将每个分组的数据转换为 DataFrame 并添加到列表中
for tz, records in tz_dict.items():
    # 将 records 转换为 DataFrame
    df_temp = pd.DataFrame(records)
    # 添加 'TZ' 列，并设置值为当前的 tz
    df_temp['TZ'] = tz
    # 将 df_temp 添加到列表中
    dfs.append(df_temp)

# 使用 pd.concat 将所有的 DataFrame 连接起来
df_new = pd.concat(dfs, ignore_index=True)

# 重置索引
df_new.reset_index(drop=True, inplace=True)


In [48]:
df_new.to_excel(r"C:\Users\Runker\Desktop\test2.xlsx",index=False)

In [None]:
import warnings

# Feature matrix for the projections: everything except the class label,
# restricted to columns that are numeric and complete. Text columns (for
# example 'label') become all-NaN under the coercion and are dropped together
# with any column that still has gaps, because neither the scaler nor PCA /
# t-SNE can work with strings or missing values.
features = (
    result_df.drop(columns=['TZ'])
    .apply(pd.to_numeric, errors='coerce')
    .dropna(axis=1, how='any')
)
# Silence FutureWarning messages.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Run the analysis

# Restore the warning settings
# warnings.resetwarnings()

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Standardise the features.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# PCA with up to 30 components; the count is capped by the data dimensions
# because PCA cannot return more components than samples or features.
pca = PCA(n_components=min(30, *features_scaled.shape))
pca_result = pca.fit_transform(features_scaled)

# t-SNE with init and learning_rate set explicitly.
tsne = TSNE(n_components=2, random_state=0, init='pca', learning_rate='auto')
tsne_result = tsne.fit_transform(features_scaled)

# Show the first two PCA components next to the t-SNE embedding.
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.5)
plt.title('PCA Result')

plt.subplot(1, 2, 2)
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], alpha=0.5)
plt.title('t-SNE Result')

plt.show()

# Return both projections for further analysis.
pca_result, tsne_result


In [None]:
# Standardise the data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# PCA with two components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Positions of the samples with the smallest and the largest projection on the
# first principal component. (argpartition with kth=[0, -1] puts the minimum
# first and the maximum last, so slicing its first two entries would return
# the minimum plus an arbitrary sample; argmin/argmax state the intent directly.)
first_pc = X_pca[:, 0]
representative_sample_indices = np.array([np.argmin(first_pc), np.argmax(first_pc)])

# Pick those two samples.
representative_samples = features.iloc[representative_sample_indices]

representative_samples

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity


# PCA down to two dimensions.
# NOTE(review): `features` is passed in unscaled here, unlike the other PCA
# cells in this notebook - confirm that this is intended.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(features)

# Coordinates of every sample in the two-dimensional PCA space.
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])

# Kernel density estimate over the PCA coordinates.
kde = KernelDensity(bandwidth=2)  # adjust the bandwidth as needed
kde.fit(pca_result)

# Log-density score of every sample.
density_scores = kde.score_samples(pca_result)

# Keep the samples at or above the 90th percentile of the density scores.
threshold = np.percentile(density_scores, 90)  # adjust as needed
high_density_mask = density_scores >= threshold
# The scores have one entry per row of `features`, which was derived from
# `result_df`; select through the index of `features` so the mask is applied
# to the rows it was computed from rather than to the unrelated `df`.
high_density_samples = result_df.loc[features.index[high_density_mask]]

# Show the high-density samples.
print(high_density_samples)


In [None]:
# from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler,QuantileTransformer
# label_column = 'TL'

# # 获取标签的唯一值
# label_values = df[label_column].unique()

# # 初始化用于存储代表性样本的列表
# representative_samples = []

# # 设置相似度阈值
# threshold = 0.9  # 根据需要调整

# # 遍历每个标签值
# for label_value in label_values:
#     # 提取当前标签值的样本
#     label_df = df[df[label_column] == label_value]
#     # 对特征进行标准化
#     scaler = StandardScaler()
#     # scaler = MinMaxScaler()
#     # scaler = MaxAbsScaler()
#     # scaler = QuantileTransformer()
#     features_scaled = scaler.fit_transform(label_df[feature_columns])

#     # 计算样本间的余弦相似度
#     similarity_matrix = cosine_similarity(features_scaled)
#     print(similarity_matrix)
#     # 初始化用于存储选定样本索引的集合
#     selected_samples = set()

#     # 遍历相似度矩阵，选择相似度高于阈值的样本
#     for i in range(len(similarity_matrix)):
#         for j in range(i + 1, len(similarity_matrix)):
#             if similarity_matrix[i, j] > threshold:
#                 selected_samples.add(i)
#                 selected_samples.add(j)

#     # 从原始样本中提取选定的样本
#     selected_samples_indices = label_df.index[list(selected_samples)]
#     representative_samples.extend(df.loc[selected_samples_indices].values)

# # 转换为 DataFrame 并显示部分结果
# representative_samples_df = pd.DataFrame(representative_samples, columns=df.columns)

