# 数据连接和清洗 基于xjsh_data

In [64]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [65]:
def find_file_with_string(path, string):
    """Collect csv/xlsx files under ``path`` whose file name contains ``string``.

    The directory tree is walked recursively. Within each directory the
    matching ``.csv`` files are collected first; a matching ``.xlsx`` file is
    collected only when no file gathered so far (in this directory or in a
    directory visited earlier) has the same name without extension.

    Args:
        path: Root directory to search.
        string: Substring that the file name must contain.

    Returns:
        List of full paths of the selected files.

    Raises:
        FileNotFoundError: If no matching csv or xlsx file is found.
    """
    matches = []
    # Names (without extension) of every file collected so far
    taken_stems = set()

    for folder, _, names in os.walk(path):
        candidates = [name for name in names if string in name]

        # csv files always win, so register them before looking at xlsx
        csv_here = [name for name in candidates if name.endswith('.csv')]
        matches.extend(os.path.join(folder, name) for name in csv_here)
        taken_stems.update(os.path.splitext(name)[0] for name in csv_here)

        # xlsx files are kept only when their stem is still unused
        xlsx_here = [
            name for name in candidates
            if name.endswith('.xlsx')
            and os.path.splitext(name)[0] not in taken_stems
        ]
        matches.extend(os.path.join(folder, name) for name in xlsx_here)
        taken_stems.update(os.path.splitext(name)[0] for name in xlsx_here)

    if matches:
        return matches

    raise FileNotFoundError(f"在{path}路径下未找到包含{string}的csv或xlsx文件")
# Combine a list of CSV / Excel files into a single DataFrame.
def merge_excel_files(file_list, use_columns):
    """Read every file in ``file_list`` and stack the rows into one DataFrame.

    Args:
        file_list: Iterable of paths (str or path-like) to ``.csv`` or
            ``.xlsx`` files. The extension is matched case-insensitively.
        use_columns: Forwarded as ``usecols`` to ``pd.read_csv`` /
            ``pd.read_excel``.

    Returns:
        The concatenation of all files, in ``file_list`` order, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``file_list`` is empty, or if a path has an extension
            other than ``.csv`` / ``.xlsx``.
    """
    frames = []
    for file in file_list:
        file_path = os.fspath(file)
        lowered = file_path.lower()
        if lowered.endswith('.csv'):
            reader = pd.read_csv
        elif lowered.endswith('.xlsx'):
            reader = pd.read_excel
        else:
            # Fail loudly: otherwise the loop would hit an unbound name or
            # append the previous file's data a second time.
            raise ValueError(
                f"Unsupported file type (expected .csv or .xlsx): {file_path}"
            )
        frames.append(reader(file_path, usecols=use_columns))

    if not frames:
        raise ValueError("file_list is empty; there is nothing to merge")

    # Stack all tables into one
    return pd.concat(frames, ignore_index=True)

In [66]:
# Province whose data is processed; interpolated into the input and output paths
province_name = '贵州省'

In [67]:
# Directory searched for this province's base_info / ldtj_info / pm_info files
folder_path = rf'F:\collection_spb_info\XJSH\ALL_DATA\ALL_BASE\{province_name}'
# Directory searched for this province's all_info (test result) files
ch_data_path = rf'F:\collection_spb_info\XJSH\ALL_DATA\ALL_JCJG\{province_name}'

In [68]:
# Locate the input files of each table by the tag contained in their file names.
# The three base tables live under folder_path, the test results under ch_data_path.
base_file_list, ldtj_file_list, pm_file_list = (
    find_file_with_string(folder_path, table_tag)
    for table_tag in ('base_info', 'ldtj_info', 'pm_info')
)
ch_file_list = find_file_with_string(ch_data_path, 'all_info')

In [132]:
# Load each table, keeping only the columns named in its column list.
# NOTE(review): base_info_list, ldtj_info_list, pm_info_list and phy_che_list
# are not defined in the visible part of this file - they must exist before
# this cell runs.
base_df, ldtj_df, pm_df, ch_df = (
    merge_excel_files(paths, columns)
    for paths, columns in (
        (base_file_list, base_info_list),
        (ldtj_file_list, ldtj_info_list),
        (pm_file_list, pm_info_list),
        (ch_file_list, phy_che_list),
    )
)

In [150]:
# Left-join the four tables one after another on the 'ydbh' key, so every
# row of base_df is kept.
# NOTE(review): if 'ydbh' is not unique in a right-hand table the join will
# multiply rows - confirm against the data.
df_temp1 = base_df.merge(ldtj_df, on='ydbh', how='left')    # base + ldtj
df_temp2 = df_temp1.merge(pm_df, on='ydbh', how='left')     # ... + pm
df_base_ch = df_temp2.merge(ch_df, on='ydbh', how='left')   # ... + test results

In [None]:
# Show the (rows, columns) size of the joined table
df_base_ch.shape

In [154]:
# Force every column of phy_che_list except the last one to be numeric.
# Values that cannot be parsed become NaN, and every NaN (including values
# that were already missing) is then replaced by 0.0001.
for one_col in phy_che_list[:-1]:
    coerced = pd.to_numeric(df_base_ch[one_col], errors='coerce')
    df_base_ch[one_col] = coerced.fillna(0.0001)

In [None]:
# Normalise the column dtypes and write the joined table to a dated CSV file
from datetime import datetime

# Cast the two ID columns to str, then add 'T'-prefixed copies (ydbht / yypbht)
# NOTE(review): the prefix presumably keeps spreadsheet tools from reading the
# IDs as numbers - confirm
for id_col in ('ydbh', 'yypbh'):
    df_base_ch[id_col] = df_base_ch[id_col].astype('str')
for id_col in ('ydbh', 'yypbh'):
    df_base_ch[id_col + 't'] = 'T' + df_base_ch[id_col]

# Every int/float column becomes float32, missing values are replaced by 0.0001
numeric_columns = df_base_ch.select_dtypes(include=['int32', 'float32', 'int64', 'float64']).columns
for num_col in numeric_columns:
    as_float32 = df_base_ch[num_col].astype('float32')
    df_base_ch[num_col] = as_float32.fillna(0.0001)

# Every object column becomes pandas 'string' dtype, missing values become ''
string_columns = df_base_ch.select_dtypes(include=['object']).columns
for text_col in string_columns:
    as_string = df_base_ch[text_col].astype('string')
    df_base_ch[text_col] = as_string.fillna('')

# Output file: <province>_all_info_<YYYYMMDD>.csv
save_path = rf"F:\collection_spb_info\XJSH\ALL_DATA\ALL_RESULT\{province_name}_all_info_{datetime.now().strftime('%Y%m%d')}.csv"

# Create the output directory when it is missing, otherwise report that it exists
save_dir = os.path.dirname(save_path)
if os.path.exists(save_dir):
    print("目录已存在")
else:
    os.makedirs(save_dir)

# Write without the index; utf-8-sig puts a BOM at the start of the file
df_base_ch.to_csv(save_path, index=False, encoding='utf-8-sig')


In [None]:
# Read the file that was just written back into test_df (same utf-8-sig encoding
# as used for writing); test_df is not used further in the visible source
test_df = pd.read_csv(save_path,encoding='utf-8-sig')


# 查看数据

In [None]:
# List the column labels of the joined table
df_base_ch.columns

In [None]:
# Draw a histogram and a normal Q-Q plot for each numeric column of df_base_ch.
# The [4:-1] slice skips the first four columns and the last column.
for column in df_base_ch.columns[4:-1]:
    values = df_base_ch[column]

    # Only numeric columns are plotted
    if not pd.api.types.is_numeric_dtype(values):
        continue

    # One figure per column: histogram on the left, Q-Q plot on the right
    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: 30-bin histogram without a KDE curve
    sns.histplot(values, kde=False, bins=30, ax=hist_ax)
    hist_ax.set_title(f'{column} Histogram')
    hist_ax.set_xlabel(column)
    hist_ax.set_ylabel('Frequency')

    # Right panel: probability plot (scipy's default reference distribution is normal)
    stats.probplot(values, plot=qq_ax)
    qq_ax.set_title(f'{column} Normal Q-Q Plot')

    # Tidy the layout and render the figure
    plt.tight_layout()
    plt.show()
