# 数据连接和清洗 基于xjsh_data

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [4]:
# 获取文件路径
def find_file_with_string(path, string):
    """Return the path of the first file under ``path`` whose name contains ``string``.

    The directory tree rooted at ``path`` is traversed with ``os.walk`` and the
    search stops at the first file name that contains ``string`` as a
    (case-sensitive) substring.

    Args:
        path: Root directory to search recursively.
        string: Substring to look for in file names.

    Returns:
        The directory joined with the matching file name, or ``None`` when no
        file name under ``path`` contains ``string``.
    """
    matches = (
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(path)
        for file_name in file_names
        if string in file_name
    )
    # The generator is lazy, so the walk stops at the first hit.
    return next(matches, None)

In [5]:
# Root folder of the data set being analysed (searched below for the 'base_info' workbook)
folder_path = r'F:\collection_spb_info\GJ\SB'
# Root folder of the test-result data (searched below for the 'chemical_info' workbook)
ch_data_path = r'F:\collection_spb_info\XJSH\SB'

In [6]:
# Base-information workbook: first file under folder_path whose name contains 'base_info'.
base_point = find_file_with_string(folder_path, 'base_info')
# find_file_with_string returns None when nothing matches; stop here with a
# clear message instead of letting pd.read_excel(None) fail obscurely later.
if base_point is None:
    raise FileNotFoundError(
        f"No file with 'base_info' in its name was found under {folder_path!r}"
    )

# Test-results workbook: first file under ch_data_path whose name contains 'chemical_info'.
data_point = find_file_with_string(ch_data_path, 'chemical_info')
if data_point is None:
    raise FileNotFoundError(
        f"No file with 'chemical_info' in its name was found under {ch_data_path!r}"
    )

In [7]:
# Columns read from the base-information (sampling point) workbook
info_list = ['ydbh', 'dwjd', 'dwwd', 'ydlb']

# Columns read from the test-results workbook (physical/chemical indicators).
# 'ydbh' is the join key shared with info_list; the list order is relied on by
# later cells that slice it by position.
phy_che_list = (
    ['ydbh', 'yypbh', 'yjz', 'ph', 'ylzjhl']
    + ['qdan', 'qlin', 'qjia', 'qxi']
    + ['yxlin', 'sxjia', 'hxjia', 'yxliu', 'yxgui', 'yxtie', 'yxmeng',
       'yxtong', 'yxxing', 'yxpeng', 'yxmu']
    + ['zgong', 'zshen', 'zqian', 'zge', 'zge2', 'znie']
    + ['jxzc11', 'jxzc12', 'jxzc13', 'jxzc14', 'jxzc1trzd']
)

In [8]:
# Load only the required columns from each workbook.
# The identifier columns are pinned to text: they are 16-18 digit codes, so
# letting pandas infer a numeric dtype risks float conversion (and loss of the
# trailing digits) as soon as a value is missing, and the later
# `.str.endswith(...)` filter on 'yypbh' only works on string data.
# Empty cells still come back as NaN with dtype=str.
df_base = pd.read_excel(base_point, usecols=info_list, dtype={'ydbh': str})
df_data = pd.read_excel(
    data_point,
    usecols=phy_che_list,
    dtype={'ydbh': str, 'yypbh': str},
)

In [10]:
# Left-join the test results onto the base table on 'ydbh': every base row is
# kept, and a base row that matches several test rows appears once per match.
df_result_data = df_base.merge(df_data, how='left', on='ydbh')

In [11]:
# Inspect every row whose 'ydbh' occurs more than once in the merged table.
# keep=False flags all occurrences, not only the second and later ones.
duplicates = df_result_data.duplicated(subset='ydbh', keep=False)
df_duplicates = df_result_data.loc[duplicates]
# Last expression of the cell: rendered as the cell output.
df_duplicates

Unnamed: 0,ydbh,ydlb,dwjd,dwwd,ph,ylzjhl,yjz,qdan,qlin,qjia,...,zqian,zge,zge2,znie,yypbh,jxzc11,jxzc12,jxzc13,jxzc14,jxzc1trzd
26,5226230101000084,0,108.198749,26.880003,,,,,,,...,,,,,522623010100008430,/,/,/,/,/
27,5226230101000084,0,108.198749,26.880003,7.50,25.1,78.2,4.51,1.34,7.87,...,71.1,0.92,82.2,40.3,522623010100008410,,,,,
33,5226230101000113,0,108.058373,26.970332,7.36,42.6,76.5,4.72,0.99,7.29,...,49.7,0.69,49.7,25.8,522623010100011310,11.5,22.1,48.8,9.8,粉（砂）质壤土
34,5226230101000113,0,108.058373,26.970332,,,,,,,...,,,,,522623010100011330,,,,,
45,5226230101000158,0,108.130606,27.105702,7.25,21.1,78.9,3.48,0.61,8.31,...,47.7,0.82,72.7,27.8,522623010100015811,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1007,5226230307100001,1,108.150422,26.915283,7.24,26.2,18.8,1.50,0.46,44.70,...,/,/,/,/,522623030710000111,1.4,50.2,32.4,16.0,黏壤土
1008,5226230307100001,1,108.150422,26.915283,7.29,27.5,18.2,1.53,0.48,43.50,...,/,/,/,/,522623030710000111,1.2,52.9,30.7,15.2,黏壤土
1023,5226230401100008,1,108.086497,26.987790,5.68,10.7,13.7,0.82,0.38,7.77,...,42.0,0.20,81.6,29.4,522623040110000812,5.6,23.0,26.3,45.1,黏土
1024,5226230401100008,1,108.086497,26.987790,5.96,12.3,24.7,1.43,0.35,8.60,...,42.7,0.22,79.5,29.4,522623040110000811,4.7,22.5,43.3,29.5,壤质黏土


In [None]:
# Keep one row per 'ydbh' (the first occurrence in merge order).
# `df_result` was never assigned before this cell, so it is derived here from
# the merged table. The result is an independent copy: `df_result_data` stays
# untouched, and later column assignments on `df_result` do not trigger
# SettingWithCopyWarning.
df_result = df_result_data.drop_duplicates(subset=['ydbh'], keep='first').copy()

In [None]:
# Convert the measurement columns to numbers; anything that is missing or not
# numeric (e.g. the '/' placeholder) becomes the sentinel value 0.0001.
# Columns are chosen by name rather than by position: 'ydbh' and 'yypbh' are
# identifiers and 'jxzc1trzd' holds text, so none of them may be coerced
# (the previous `[1:-1]` slice still converted the 'yypbh' identifier).
non_numeric_cols = ('ydbh', 'yypbh', 'jxzc1trzd')
for one_col in phy_che_list:
    if one_col in non_numeric_cols:
        continue
    df_result[one_col] = pd.to_numeric(df_result[one_col], errors='coerce').fillna(0.0001)

In [None]:
# Display df_result after de-duplication and numeric coercion (notebook cell output)
df_result

In [None]:
# Build the filtered table from the merged data.
# `df_result_bc` was never assigned before this cell; it has to start from
# `df_result_data` because the 'ph' test below needs the original missing
# values, before any numeric fill-in.

# Rows to discard, rule 1: ydlb == 0 and no 'ph' value.
ydlb0_without_ph = (df_result_data['ydlb'] == 0) & df_result_data['ph'].isnull()

# Rows to discard, rule 2: ydlb == 1 and 'yypbh' does not end with '11'.
# na=False makes a missing 'yypbh' count as "does not end with '11'"; without
# it the result holds NaN and the `~` negation raises TypeError.
yypbh_ends_11 = df_result_data['yypbh'].str.endswith('11', na=False)
ydlb1_other_suffix = (df_result_data['ydlb'] == 1) & ~yypbh_ends_11

# Keep everything not matched by either rule; copy so later in-place edits
# neither touch `df_result_data` nor raise SettingWithCopyWarning.
df_result_bc = df_result_data.loc[~(ydlb0_without_ph | ydlb1_other_suffix)].copy()


In [None]:
# (rows, columns) of df_result_bc after the row filtering above
df_result_bc.shape

In [None]:
# Collect every remaining row of df_result_bc whose 'ydbh' occurs more than
# once (keep=False flags all occurrences) for inspection.
duplicates = df_result_bc.duplicated(subset='ydbh', keep=False)
df_duplicates = df_result_bc.loc[duplicates]

In [None]:
# Drop repeated 'ydbh' rows in place, keeping the first occurrence
# (keep='first' is the pandas default).
df_result_bc.drop_duplicates(subset='ydbh', inplace=True)

In [None]:
# (rows, columns) of df_result_bc after duplicate 'ydbh' rows were removed
df_result_bc.shape

In [None]:
# Convert the measurement columns to numbers; anything that is missing or not
# numeric (e.g. the '/' placeholder) becomes the sentinel value 0.0001.
# Columns are chosen by name rather than by position: 'ydbh' and 'yypbh' are
# identifiers and 'jxzc1trzd' holds text. Coercing 'yypbh' (as the previous
# `[1:-1]` slice did) could turn the id into a float and corrupt it before it
# is written out as text.
non_numeric_cols = ('ydbh', 'yypbh', 'jxzc1trzd')
for one_col in phy_che_list:
    if one_col in non_numeric_cols:
        continue
    df_result_bc[one_col] = pd.to_numeric(df_result_bc[one_col], errors='coerce').fillna(0.0001)

In [None]:
# Save the cleaned table.
# Both identifier columns are cast to text before export
# (presumably so the long codes are not written as numbers — confirm).
for id_col in ('ydbh', 'yypbh'):
    df_result_bc[id_col] = df_result_bc[id_col].astype('str')

output_path = r"F:\collection_spb_info\sp_float_data\SB\sb_result.xlsx"
df_result_bc.to_excel(output_path, index=False)
# df_result.to_csv(r"F:\collection_spb_info\sp_float_data\DY\dy_result.csv",index=False)

# 查看数据

In [None]:
# List the column labels of df_result (the plotting loop below slices them by position)
df_result.columns

In [None]:
# For each numeric column of df_result (the first three columns are skipped),
# draw a histogram next to a normal probability (Q-Q) plot.
numeric_columns = [
    name
    for name in df_result.columns[3:]
    if pd.api.types.is_numeric_dtype(df_result[name])
]

for column in numeric_columns:
    values = df_result[column]

    # One figure per column: histogram on the left, Q-Q plot on the right.
    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: 30-bin histogram without a KDE overlay.
    sns.histplot(values, kde=False, bins=30, ax=hist_ax)
    hist_ax.set_title(f'{column} Histogram')
    hist_ax.set_xlabel(column)
    hist_ax.set_ylabel('Frequency')

    # Right panel: probability plot against the normal distribution
    # (scipy's default reference distribution).
    stats.probplot(values, plot=qq_ax)
    qq_ax.set_title(f'{column} Normal Q-Q Plot')

    # Tidy the spacing and render the figure.
    plt.tight_layout()
    plt.show()
