In [108]:
import pandas as pd
import re

In [109]:
# Load the equipment register: only the listed column positions of the
# 'Общий свод' sheet are read into `df`.
eqp_file_path = r'C:\Users\yuriy\Desktop\db_tables\EQPDB\clean_excel\equipment.xlsx'
eqp_cols = [
    0, 1, 2, 3, 4, 5, 6, 7, 8,
    11, 12, 15, 16, 18, 19,
    23, 24, 25, 26, 27,
    29, 30, 31, 32, 33, 34, 35,
    39, 40, 41, 45, 46, 60,
]
df = pd.read_excel(
    eqp_file_path,
    sheet_name='Общий свод',
    usecols=eqp_cols,
)

In [110]:
# Load the contract/supplier lookup workbook into `supplier_df`
# (default sheet, all columns).
splr_file_path = r'C:\Users\yuriy\Desktop\db_tables\EQPDB\clean_excel\contract_supplier.xlsx'
supplier_df = pd.read_excel(io=splr_file_path)

In [111]:
# Working column names for the equipment frame, in the same order as the
# column positions selected when the sheet was read (33 names).
col_name = [
    # identification
    'asup', 'no', 'lot', 'type', 'set_code', 'kks_code', 'eqp_name', 'itt',
    # location / classification
    'unit', 'building', 'safety_class',
    # quantities
    'qty', 'qty_unit', 'mass_kg', 'mass_net',
    # contract
    'work_id', 'contract_no', 'contract_date', 'supplier', 'manufacturer',
    'supervisor', 'division',
    # costs
    'unit_cost', 'unit_cost_wat', 'installation_cost',
    'adjustment_service_cost', 'total_cost',
    # delivery
    'delivery_fca', 'delivery_npp', 'delivery_expected', 'delivery_fact',
    'incoming', 'pp2022',
]

# Working column names for the supplier lookup frame.
supplier_cols = [
    'object',
    'supervisor',
    'supplier',
    'contract_no',
    'contract_date',
    'contract_name',
]

# Work on a copy so the raw sheet in `df` stays untouched.
df2 = df.copy()

In [112]:
# Drop the rows labelled 0 and 1 and apply the working column names.
# NOTE(review): these two rows are presumably leftover header lines of the sheet — confirm.
# `df2` is rebuilt from the raw `df` instead of being mutated in place, so this
# cell can be re-run safely (an in-place drop raises KeyError the second time
# because labels 0 and 1 no longer exist).
df2 = df.drop(index=range(2))
df2.columns = col_name
supplier_df.columns = supplier_cols

In [113]:
# Keep only rows located in 10UJA or 10UKC, excluding anything that mentions UKZ / UMY / UJZ.
df2['building_2'] = ''
for building_code in ('10UJA', '10UKC'):
    # Case-insensitive substring match; a row mentioning both codes ends up tagged '10UKC'.
    df2.loc[df2['building'].str.contains(building_code, na=False, case=False), 'building_2'] = building_code
filtered_df = df2[df2['building_2'].str.contains('10UJA|10UKC', case=False, na=False)]
# The exclusion match is case-sensitive, unlike the inclusion match above.
# .copy() detaches the result from df2 so the column assignments below write to an
# independent frame instead of a slice (avoids SettingWithCopyWarning / lost writes).
filtered_df = filtered_df[~filtered_df['building'].str.contains('UKZ|UMY|UJZ')].copy()

# Treat dash placeholders in kks_code as missing, then drop rows lacking asup or kks_code.
filtered_df['kks_code'] = filtered_df['kks_code'].replace(['-', '–'], pd.NA)
filtered_df = filtered_df.dropna(how='any', subset=['asup', 'kks_code'])

# Look-alike letters: Cyrillic characters (keys) and the Latin characters they
# are replaced with (values).
upper_letters = {
    'А': 'A',
    'В': 'B',
    'С': 'C',
    'Е': 'E',
    'О': 'O',
    'Р': 'P',
    'Х': 'X',
    'М': 'M',
    'К': 'K',
    'Т': 'T',
    'Н': 'H',
}
lower_letters = {
    'а': 'a',
    'с': 'c',
    'е': 'e',
    'о': 'o',
    'р': 'p',
    'х': 'x',
}

# Upper-case the identifier columns (non-string cells become NaN under the .str accessor).
for id_column in ('asup', 'kks_code', 'work_id'):
    filtered_df[id_column] = filtered_df[id_column].str.upper()

def replace_similar_chars(text, mapping):
    """Replace every key of ``mapping`` found in ``text`` with its value.

    Replacements are applied one after another in the mapping's iteration
    order, so a later entry can act on the output of an earlier one.

    Parameters
    ----------
    text : str or any
        Value to clean. Non-string values (NaN, None, numbers) are returned
        unchanged so the function can be applied to columns with gaps.
    mapping : dict[str, str]
        Substring -> replacement pairs.

    Returns
    -------
    The transliterated string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        # Missing cells reach this function as float NaN / None; leave them alone.
        return text
    for old_char, new_char in mapping.items():
        text = text.replace(old_char, new_char)
    return text

# Swap look-alike Cyrillic letters for Latin ones in the identifier columns.
# asup and kks_code are already upper-cased at this point, so only the upper-case
# mapping can match; the lower-case mapping would leave Cyrillic capitals in place.
# na_action='ignore' skips missing cells instead of passing NaN to the helper.
for id_column in ('asup', 'kks_code'):
    filtered_df[id_column] = filtered_df[id_column].map(
        lambda value: replace_similar_chars(value, upper_letters),
        na_action='ignore',
    )

# work_id: only string values longer than 8 characters are transliterated;
# everything else is passed through unchanged.
filtered_df['work_id'] = filtered_df['work_id'].map(
    lambda value: replace_similar_chars(value, upper_letters)
    if isinstance(value, str) and len(value) > 8
    else value
)

# Row number as integer (raises if 'no' contains missing or non-numeric values).
filtered_df['no'] = filtered_df['no'].astype(int)

# Missing quantities, masses and costs are stored as 0.
zero_fill_cols = [
    'qty', 'mass_kg', 'mass_net',
    'unit_cost', 'unit_cost_wat', 'installation_cost',
    'adjustment_service_cost', 'total_cost',
]
filtered_df[zero_fill_cols] = filtered_df[zero_fill_cols].fillna(0)

# Plain date columns: anything unparseable becomes NaT.
for date_column in ('contract_date', 'delivery_fca', 'delivery_npp'):
    filtered_df[date_column] = pd.to_datetime(filtered_df[date_column], errors='coerce')

# delivery_expected / delivery_fact mix dates with free-text remarks.
# Split each into a parsed date column plus a '<column>_status' column that keeps
# the original cell only where it could not be parsed as a date.
date_format = '%Y-%m-%d'
for date_column in ('delivery_expected', 'delivery_fact'):
    parsed_dates = pd.to_datetime(filtered_df[date_column], format=date_format, errors='coerce')
    filtered_df[f'{date_column}_status'] = filtered_df[date_column].where(parsed_dates.isna())
    filtered_df[date_column] = parsed_dates

# Building-code substitutions applied to the 'building' column.
# NOTE(review): as written every key equals its value, so this loop leaves the
# column unchanged — confirm whether the keys were meant to contain look-alike
# Cyrillic letters or the values were meant to be something else.
building_re = {
    'UBB': 'UBB',
    'UJB': 'UJB',
    'UJC': 'UJC',
    'UJE': 'UJE',
    'UBP': 'UBP',
    'UKA': 'UKA',
    'UJG': 'UJG',
    'UKC': 'UKC',
    'UJA': 'UJA'
}
for pattern, replacement in building_re.items():
    filtered_df.loc[:, 'building'] = filtered_df['building'].str.replace(pattern, replacement, regex=False)
    


In [114]:
# Normalise qty_unit spellings to canonical labels ('pcs', 'kg', 'set').
# Rules are applied in order; each rule overwrites every cell that contains the
# key (case-insensitive, literal substring) with the canonical value.
# The 'pcs' rule previously mapped to the misspelling 'psc', which turned every
# piece-count unit (including the ones just produced by the 'шт' rule) into 'psc'.
qty_re = {
    'шт': 'pcs',
    'pcs': 'pcs',
    'кг': 'kg',
    'компл': 'set',
    'Комплект': 'set',
    'к-т': 'set',
    'set': 'set',
    'pc': 'pcs',
}
for old_value, new_value in qty_re.items():
    # regex=False: keys are plain text, not patterns.
    unit_matches = filtered_df['qty_unit'].str.contains(old_value, case=False, na=False, regex=False)
    filtered_df.loc[unit_matches, 'qty_unit'] = new_value


In [None]:
# Show the distinct values currently present in the qty_unit column.
filtered_df['qty_unit'].unique()

In [115]:
# Normalise contract numbers on the equipment side: remove the '№' sign, trim
# surrounding whitespace, then cast to str so both merge keys share one dtype.
filtered_df['contract_no'] = (
    filtered_df['contract_no']
    .str.replace('№', '', regex=False)
    .str.strip()
    .astype(str)
)

# Supplier side: same str dtype for the key.
supplier_df['contract_no'] = supplier_df['contract_no'].astype(str)

# Build a one-row-per-contract lookup:
#  * astype(str) turns a missing contract number into the text 'nan'; those rows are
#    left out so equipment without a contract is not joined to an arbitrary supplier;
#  * duplicate contract numbers would multiply equipment rows in the left merge, so
#    only the first occurrence is kept.
# NOTE(review): confirm that "first occurrence wins" is acceptable for duplicated contracts.
has_contract_no = supplier_df['contract_no'] != 'nan'
supplier_lookup = (
    supplier_df.loc[has_contract_no, ['contract_no', 'supplier']]
    .drop_duplicates(subset='contract_no')
)

# Replace the sheet's own supplier column with the one from the lookup table.
filtered_df = (
    filtered_df
    .drop(columns=['supplier'])
    .merge(supplier_lookup, on='contract_no', how='left')
)

In [116]:
# Convert data types; values that cannot be converted become NaN / NaT.
# Each column is converted from itself (mass_net was previously overwritten
# with the converted mass_kg values).
for mass_column in ('mass_kg', 'mass_net'):
    filtered_df[mass_column] = pd.to_numeric(filtered_df[mass_column], errors='coerce')
for delivery_column in ('delivery_expected', 'delivery_fact'):
    filtered_df[delivery_column] = pd.to_datetime(filtered_df[delivery_column], errors='coerce')


In [None]:
# Print the Series.info() summary of the mass_kg column.
filtered_df['mass_kg'].info()

In [117]:
# Final column order for the export: the working columns plus the two
# '*_status' columns placed right after their date columns. Selecting with this
# list also drops any helper column not named here.
col_name = [
    # identification
    'asup', 'no', 'lot', 'type', 'set_code', 'kks_code', 'eqp_name', 'itt',
    # location / classification
    'unit', 'building', 'safety_class',
    # quantities
    'qty', 'qty_unit', 'mass_kg', 'mass_net',
    # contract
    'work_id', 'contract_no', 'contract_date', 'supplier', 'manufacturer',
    'supervisor', 'division',
    # costs
    'unit_cost', 'unit_cost_wat', 'installation_cost',
    'adjustment_service_cost', 'total_cost',
    # delivery
    'delivery_fca', 'delivery_npp',
    'delivery_expected', 'delivery_expected_status',
    'delivery_fact', 'delivery_fact_status',
    'incoming', 'pp2022',
]
filtered_df = filtered_df.loc[:, col_name]

In [118]:
# Display the reordered frame as the cell output.
filtered_df

Unnamed: 0,asup,no,lot,type,set_code,kks_code,eqp_name,itt,unit,building,...,adjustment_service_cost,total_cost,delivery_fca,delivery_npp,delivery_expected,delivery_expected_status,delivery_fact,delivery_fact_status,incoming,pp2022
0,AKKU00017132,1,0060/1/2018,оборудование,,11JNG12AA201,Устройство дроссельно-регулирующее,AKU-PAA0003,1,Реакторное здание (10UJA). Код помещения: 10U...,...,0.0,1440.0,2021-05-28,2021-06-28,NaT,Поставлено,2021-12-31,,СЕ-15-АВК-0435,2019-04-03
1,AKKU00017127,2,0060/1/2018,оборудование,,11JNG12AA202,Устройство дроссельно-регулирующее,AKU-PAA0003,1,Реакторное здание (10UJA). Код помещения: 10U...,...,0.0,1440.0,2021-05-28,2021-06-28,NaT,Поставлено,2021-12-31,,СЕ-15-АВК-0435,2019-04-03
2,AKKU00017141,3,0060/1/2018,оборудование,,11JNG12AA203,Устройство дроссельно-регулирующее,AKU-PAA0003,1,Реакторное здание (10UJA). Код помещения: 10U...,...,0.0,1440.0,2021-05-28,2021-06-28,NaT,Поставлено,2021-12-31,,СЕ-15-АВК-0435,2019-04-03
3,AKKU00017145,4,0060/1/2018,оборудование,,11JNG12AA204,Устройство дроссельно-регулирующее,AKU-PAA0003,1,Реакторное здание (10UJA). Код помещения: 10U...,...,0.0,1440.0,2021-05-28,2021-06-28,NaT,Поставлено,2021-12-31,,СЕ-15-АВК-0435,2019-04-03
4,AKKU00017146,5,0060/1/2018,оборудование,,11JNG12AA206,Устройство дроссельно-регулирующее,AKU-PAA0003,1,Реакторное здание (10UJA). Код помещения: 10U...,...,0.0,1440.0,2021-05-28,2021-06-28,NaT,Поставлено,2021-12-31,,СЕ-15-АВК-0435,2019-04-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43924,AKKU00026676,130426,AKU-22-201м,Материалы,,10KUC15AA602,Клапан обратный / Check Valve,AKU-PAA0001.B03,1,10UKC,...,0.0,4368.0,2024-01-07,2024-02-06,2024-02-06,,NaT,,,2023-10-01
43925,AKKU00328209,131338,AKU-22-279,Оборудование,,10PKN10BB001,Бак приготовления раствора тринатрийфосфата (с...,10PKN-PAA0001_B01,1,10UKC,...,225.0,18400.0,2024-03-07,2024-04-07,2024-04-07,,NaT,,,2023-10-10
43926,AKKU00328210,131339,AKU-22-279,Оборудование,,10PKN20BB001,Бак расходный раствора тринатрийфосата/Trisodi...,10PKN-PAA0001_B01,1,10UKC,...,225.0,14750.0,2024-03-07,2024-04-07,2024-04-07,,NaT,,,2023-10-10
43927,AKKU00328211,131340,AKU-22-279,Оборудование,,10PKN25AP001,Насос-дозатор раствора тринатрийфосфата/ Triso...,10PKN-PAA0002_B01,1,10UKC,...,225.0,19050.0,2024-03-07,2024-04-07,2024-04-07,,NaT,,,2023-10-10


In [119]:
# Write the cleaned frame to Excel without the index column; floats are
# formatted with four decimal places.
filtered_df.to_excel(
    r'C:\Users\yuriy\Desktop\db_tables\EQPDB\clean_excel\equipment_clean.xlsx',
    index=False,
    float_format="%.4f",
)