In [1]:
import warnings
warnings.filterwarnings('ignore')
import phpserialize as php # 处理PHP序列化的数据
from tqdm import tqdm
import pandas as pd
import os
import re
import json

In [2]:
# Load the listing export, blank out missing values, and keep only the
# currently active listings together with the columns used downstream.
publish_sku = (
    pd.read_csv(r'G:\Jupyter\YT全量sku-重构\Data_input\截至20240812全上架ItemID.csv')
    .fillna('')
    .loc[
        lambda listings: listings['listingstatus'] == 'Active',
        ['country', 'site', 'selleruserid', 'itemid', 'sku', 'listingstatus', 'itemspecifics'],
    ]
    .reset_index(drop=True)
)

In [3]:
def decode_bytes(bytes_str):
    """Recursively turn every ``bytes`` object inside a structure into ``str``.

    Dicts (keys and values) and lists are walked recursively; ``bytes`` are
    decoded as UTF-8; any other object is returned untouched.
    """
    if isinstance(bytes_str, bytes):
        return bytes_str.decode('utf-8')
    if isinstance(bytes_str, list):
        return list(map(decode_bytes, bytes_str))
    if isinstance(bytes_str, dict):
        decoded = {}
        for key, value in bytes_str.items():
            decoded[decode_bytes(key)] = decode_bytes(value)
        return decoded
    return bytes_str

def filter_list(ls):
    """Return the items of ``ls`` that contain at least one digit, in order.

    Each item is searched with a regular expression, so items must be strings.
    """
    contains_digit = re.compile(r'\d').search
    kept = []
    for item in ls:
        if contains_digit(item):
            kept.append(item)
    return kept

def process_row(row): # Process the PHP-serialized item specifics of one row
    """Extract the OE-like item specifics of one listing as a single string.

    NOTE: a second ``process_row`` is defined further down in this same cell
    and replaces this one when the cell runs; this version is kept for
    reference only.

    Args:
        row: mapping/Series with an ``'itemspecifics'`` entry holding a
            PHP-serialized string.

    Returns:
        ``pd.Series`` with one entry, ``'OE_刊登'``: the ';'-joined values of
        every matching column that contain at least one digit.

    Raises:
        Whatever ``php.loads`` / ``pd.DataFrame`` raise on malformed input;
        parsing errors are not handled in this version.
    """
    php_serialized_str = row['itemspecifics']
    ItemSpecific_data = php.loads(php_serialized_str.encode('utf-8'))
    decoded_data = decode_bytes(ItemSpecific_data)

    df_ItemSpecific = pd.DataFrame(decoded_data['NameValueList'])
    # The first row becomes the header; every remaining row is kept as data.
    df_ItemSpecific.columns = df_ItemSpecific.iloc[0]
    df_ItemSpecific = df_ItemSpecific[1:]

    # Keep only the columns whose header matches one of the OE-related keywords.
    keyword_cols = df_ItemSpecific.filter(regex='(?i)number|oe|nummer|numéro|Produktgruppe').fillna('')

    # Collapse each matching column into a ';'-separated string.
    oe_publish = []
    for col in keyword_cols:
        column_values = keyword_cols[col].values.tolist()
        try:
            oe_publish.append(';'.join(filter_list(column_values)))
        except TypeError:
            # filter_list runs a regex search on every item, which raises
            # TypeError for non-string values; stringify them and retry.
            # (Was a bare ``except:``, which also swallowed KeyboardInterrupt.)
            oe_values = [str(val) for val in column_values]
            oe_publish.append(';'.join(filter_list(oe_values)))
    oe_publish = ';'.join([oe for oe in oe_publish if oe])
    return pd.Series({'OE_刊登': oe_publish})



def _oe_value_strings(value):
    """Flatten one item-specific value into a list of strings."""
    if isinstance(value, str):
        return [value]
    if isinstance(value, dict):
        # Multi-valued specifics are decoded as a dict; only the values matter.
        return [str(item) for item in value.values()]
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return [str(value)]


def process_row(row): # Process the PHP-serialized item specifics of one row
    """Extract the OE-like item specifics of one listing as a single string.

    Args:
        row: mapping/Series with an ``'itemspecifics'`` entry holding a
            PHP-serialized string.

    Returns:
        A DataFrame with exactly ONE row and one column, ``'OE_刊登'``. The
        cell is the ';'-joined values (only those containing a digit) of every
        item specific whose name matches the keyword regex, or ``''`` when
        nothing matches or the input cannot be parsed.

        Returning exactly one row per input row is essential: the caller
        concatenates the per-row frames and then joins them positionally to
        the input. The previous implementation returned a zero-row frame for
        unparseable input, which shifted every later result onto the wrong
        listing. The returned frame uses a default index; the caller
        concatenates with ``ignore_index=True``.
    """
    php_serialized_str = row['itemspecifics']
    try:
        decoded_data = decode_bytes(php.loads(php_serialized_str.encode('utf-8')))
        df_ItemSpecific = pd.DataFrame(decoded_data['NameValueList'])
        # The first row becomes the header; only the row right after it is kept.
        df_ItemSpecific.columns = df_ItemSpecific.iloc[0]
        df_ItemSpecific = df_ItemSpecific[1:2]
    except Exception:
        # Best effort: empty or malformed item specifics yield an empty OE
        # string instead of aborting the whole run. ``Exception`` (not a bare
        # ``except``) so that Ctrl-C still interrupts a long job.
        df_ItemSpecific = pd.DataFrame()

    # Keep only the columns whose header matches one of the OE-related keywords.
    keyword_cols = df_ItemSpecific.filter(regex='(?i)number|oe|nummer|numéro|Produktgruppe').fillna('')

    # Collapse each matching column into a ';'-separated string. Columns are
    # read by position so that duplicated specific names are handled too.
    oe_publish = []
    if len(keyword_cols) > 0:
        for position in range(keyword_cols.shape[1]):
            candidates = _oe_value_strings(keyword_cols.iloc[0, position])
            oe_publish.append(';'.join(filter_list(candidates)))
    oe_publish = ';'.join(oe for oe in oe_publish if oe)
    return pd.DataFrame({'OE_刊登': [oe_publish]}) # only the newly built column

def process_in_batches(df, batch_size):
    """Run ``process_row`` over ``df`` in consecutive slices with a progress bar.

    Args:
        df: DataFrame whose rows are passed one by one to ``process_row``.
        batch_size: number of rows per slice; must be a positive integer.

    Returns:
        One DataFrame made of all per-row results stacked in input order with
        a fresh 0..n-1 index. An empty ``df`` yields an empty frame that
        still has the ``'OE_刊登'`` column.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    result_dfs = []
    for batch_start in tqdm(range(0, len(df), batch_size), desc='Processing Batches', ncols=100):
        df_batch = df.iloc[batch_start:batch_start + batch_size]

        # process_row returns one small DataFrame per row; stack them per
        # batch so the final concat handles a manageable number of frames.
        batch_results = df_batch.apply(process_row, axis=1)
        result_dfs.append(pd.concat(batch_results.tolist(), ignore_index=True))

    if not result_dfs:
        # pd.concat raises on an empty list; return an empty result instead,
        # keeping the column that process_row produces.
        return pd.DataFrame(columns=['OE_刊登'])
    return pd.concat(result_dfs, ignore_index=True)

In [4]:
# Decode every listing's item specifics and attach the combined OE string
if __name__ == "__main__":
    batch_size = 50 # rows handled per batch
    publish_sku_processed = process_in_batches(publish_sku, batch_size)
    # Both frames are re-indexed from 0 so the column-wise concat pairs rows by position.
    publish_sku_combined = pd.concat(
        [frame.reset_index(drop=True) for frame in (publish_sku, publish_sku_processed)],
        axis=1,
    )

Processing Batches: 100%|███████████████████████████████████| 40008/40008 [1:07:51<00:00,  9.83it/s]


In [5]:
# Remove every bracketed fragment (ASCII or full-width parentheses) together
# with its content. Missing values are blanked BEFORE converting to str: the
# previous approach stringified first and then deleted the substring 'nan',
# which also removed "nan" from inside genuine values.
publish_sku_combined['OE_刊登'] = (
    publish_sku_combined['OE_刊登']
    .fillna('')
    .apply(lambda x: re.sub(r'[(（][^)）]*[)）]', '', str(x)))
)
publish_sku_combined

Unnamed: 0,country,site,selleruserid,itemid,sku,listingstatus,itemspecifics,OE_刊登
0,DE,Italy,carparts_premium,2.032360e+11,Car-DECAKBM018,Active,"a:1:{s:13:""NameValueList"";a:13:{i:0;a:3:{s:4:""...","32111139316, 32111139315, 31351091764"
1,DE,France,carparts_premium,2.032360e+11,Car-DEDLA01935,Active,"a:1:{s:13:""NameValueList"";a:17:{i:0;a:3:{s:4:""...","2047201935,2047200735;2047201935,2047200735"
2,DE,Italy,carparts_premium,2.032360e+11,Car-DEDLA37849,Active,"a:1:{s:13:""NameValueList"";a:18:{i:0;a:3:{s:4:""...","51217185692, 51217155154"
3,DE,Italy,carparts_premium,2.032360e+11,Car-DEDLA85687,Active,"a:1:{s:13:""NameValueList"";a:16:{i:0;a:3:{s:4:""...",51227185687
4,DE,Spain,carparts_premium,2.032360e+11,Car-DEDLA01935,Active,"a:1:{s:13:""NameValueList"";a:16:{i:0;a:3:{s:4:""...","2047201935,2047200735;2047201935,2047200735"
...,...,...,...,...,...,...,...,...
2000358,US,eBayMotors,total_autopart,1.664349e+11,T-USRAD01486+USCON43930,Active,"a:1:{s:13:""NameValueList"";a:36:{i:0;a:3:{s:4:""...",
2000359,DE,Germany,premium-autoteile,3.549989e+11,P-DECAK20891-C,Active,"a:1:{s:13:""NameValueList"";a:18:{i:0;a:3:{s:4:""...",
2000360,DE,Germany,carparts_premium,2.038918e+11,Car-DEDS76982-C,Active,"a:1:{s:13:""NameValueList"";a:24:{i:0;a:3:{s:4:""...",
2000361,AU,Australia,zma-autoparts,3.347298e+11,Z-EAF10910-C,Active,"a:1:{s:13:""NameValueList"";a:30:{i:0;a:3:{s:4:""...",


In [6]:
def _strip_sku_prefixes(sku):
    """Normalise one raw SKU string into its matching form (first pass)."""
    # Remove every bracketed fragment (ASCII or full-width parentheses) and its content.
    sku = re.sub(r'[(（][^)）]*[)）]', '', sku)
    # Drop leading "xxx-" prefixes of fewer than four characters; up to five
    # passes so that stacked prefixes (e.g. P-DEB-sku) are removed completely.
    for _ in range(5):
        if '-' in sku and len(sku.split('-')[0]) < 4:
            sku = re.sub(r'^[^-]*-', '', sku)
    # Remove any remaining "DEB-" and "-de" markers.
    return sku.replace('DEB-', '').replace('-de', '')


publish_sku_combined['sku_匹配'] = publish_sku_combined['sku'].apply(_strip_sku_prefixes)

def process_sku1(row):
    """Strip the two-character country prefix from a row's ``'sku_匹配'`` value.

    Reads ``row['country']``, ``row['site']`` and ``row['sku_匹配']``.
    The first two characters are dropped for non-DE rows, for DE rows whose
    SKU starts with 'DE', and for French-site rows whose SKU starts with 'FR';
    the matching '+XX' markers of bundled SKUs are reduced to a plain '+'.
    """
    sku = row['sku_匹配']

    if row['country'] != 'DE':
        stripped = sku[2:]
        for marker in ('+US', '+UK', '+BM'):
            stripped = stripped.replace(marker, '+')
        return stripped

    if sku[:2] == 'DE':
        return sku[2:].replace('+DE', '+')

    if row['site'] == 'France' and sku[:2] == 'FR':
        return sku[2:].replace('+FR', '+').replace('+DEB', '+')

    return sku.replace('+DEB', '+')

def process_sku2(r):
    """Remove one trailing '<separator><alphanumerics>' suffix from a SKU.

    A final '-C' or '-FC' marker is set aside first and appended again
    afterwards, so the suffix in front of it is the one that gets removed.
    """
    trailing_suffix = r'[^a-zA-Z0-9]+[a-zA-Z0-9]*$'
    for kept_marker in ('-C', '-FC'):
        if r.endswith(kept_marker):
            stem = r[:len(r) - len(kept_marker)]
            return re.sub(trailing_suffix, '', stem) + kept_marker
    return re.sub(trailing_suffix, '', r)

def process_sku3(row):
    """Apply the suffix clean-up to every '+'-separated part of ``row['sku_匹配']``.

    Each part goes through ``process_sku2`` twice (two passes have been
    sufficient so far; a loop until the value stops changing would be the
    general form). The parts are then joined with '+' again.
    """
    # A SKU without '+' splits into a single part, so no special case is needed.
    parts = row['sku_匹配'].split('+')
    return '+'.join(process_sku2(process_sku2(part)) for part in parts)

# Step 1: strip the country prefix; step 2: strip trailing suffixes per '+' part.
publish_sku_combined['sku_匹配'] = publish_sku_combined.apply(process_sku1, axis=1)
publish_sku_combined['sku_匹配'] = publish_sku_combined.apply(process_sku3, axis=1)
# Seed the cleaned-OE column from the raw OE column (fillna already returns a new Series).
publish_sku_combined['清理后刊登oe'] = publish_sku_combined['OE_刊登'].fillna('')

In [7]:
# Further clean-up of the OE numbers
def oe_clean(row):
    """Normalise the raw OE string of one row into a clean ';'-separated list.

    Args:
        row: mapping/Series whose ``'清理后刊登oe'`` entry is a string.

    Returns:
        Upper-cased OE numbers joined by ';', de-duplicated in first-seen
        order. Only fragments containing at least one digit are kept.
        Duplicates are removed AFTER upper-casing, so 'a123' and 'A123' count
        as the same number (previously both survived as 'A123;A123').
    """
    text = row['清理后刊登oe']
    # Unify full-width parentheses, then drop every "(...)" group: bracketed
    # text is usually a remark, not an OE number.
    text = text.replace('（', '(').replace('）', ')')
    text = re.sub(r'\([^)]*\)', '', text)
    # Remove '/', space, '.', '-', '!', '*' and '+' inside the numbers.
    text = re.sub(r'[/ .\-!*+]', '', text)
    # ASCII and full-width commas act as separators too.
    text = re.sub(r'[,，]', ';', text)
    # Remove the invisible left-to-right mark.
    text = text.replace('\u200e', '')

    candidates = (part.replace('\r', '').upper() for part in text.split(';'))
    # Same rule as filter_list: keep only fragments that contain a digit
    # (this also discards empty and purely alphabetic fragments).
    kept = [part for part in candidates if re.search(r'\d', part)]
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return ';'.join(dict.fromkeys(kept))

# Clean the OE column row by row, then persist the full table.
publish_sku_combined['清理后刊登oe'] = publish_sku_combined.apply(oe_clean, axis=1)
publish_sku_combined.to_csv('./Data_input/全上架ID/截至20240812全上架ItemID.csv')
# After saving, keep only the listings whose original SKU does not contain 'DEB-'.
publish_sku_combined = publish_sku_combined.loc[
    publish_sku_combined['sku'].str.find('DEB-').eq(-1)
].copy()
publish_sku_combined

Unnamed: 0,country,site,selleruserid,itemid,sku,listingstatus,itemspecifics,OE_刊登,sku_匹配,清理后刊登oe
0,DE,Italy,carparts_premium,2.032360e+11,Car-DECAKBM018,Active,"a:1:{s:13:""NameValueList"";a:13:{i:0;a:3:{s:4:""...","32111139316, 32111139315, 31351091764",CAKBM018,32111139316;32111139315;31351091764
1,DE,France,carparts_premium,2.032360e+11,Car-DEDLA01935,Active,"a:1:{s:13:""NameValueList"";a:17:{i:0;a:3:{s:4:""...","2047201935,2047200735;2047201935,2047200735",DLA01935,2047201935;2047200735
2,DE,Italy,carparts_premium,2.032360e+11,Car-DEDLA37849,Active,"a:1:{s:13:""NameValueList"";a:18:{i:0;a:3:{s:4:""...","51217185692, 51217155154",DLA37849,51217185692;51217155154
3,DE,Italy,carparts_premium,2.032360e+11,Car-DEDLA85687,Active,"a:1:{s:13:""NameValueList"";a:16:{i:0;a:3:{s:4:""...",51227185687,DLA85687,51227185687
4,DE,Spain,carparts_premium,2.032360e+11,Car-DEDLA01935,Active,"a:1:{s:13:""NameValueList"";a:16:{i:0;a:3:{s:4:""...","2047201935,2047200735;2047201935,2047200735",DLA01935,2047201935;2047200735
...,...,...,...,...,...,...,...,...,...,...
2000358,US,eBayMotors,total_autopart,1.664349e+11,T-USRAD01486+USCON43930,Active,"a:1:{s:13:""NameValueList"";a:36:{i:0;a:3:{s:4:""...",,RAD01486+CON43930,
2000359,DE,Germany,premium-autoteile,3.549989e+11,P-DECAK20891-C,Active,"a:1:{s:13:""NameValueList"";a:18:{i:0;a:3:{s:4:""...",,CAK20891-C,
2000360,DE,Germany,carparts_premium,2.038918e+11,Car-DEDS76982-C,Active,"a:1:{s:13:""NameValueList"";a:24:{i:0;a:3:{s:4:""...",,DS76982-C,
2000361,AU,Australia,zma-autoparts,3.347298e+11,Z-EAF10910-C,Active,"a:1:{s:13:""NameValueList"";a:30:{i:0;a:3:{s:4:""...",,F10910-C,
