This notebook is for buildings related with UMA UJA UKC

In [1]:
import pandas as pd
import re

In [2]:
# read eqp file as df
eqp_file_path = r'C:\Users\yuriy\Desktop\db_tables\EQPDB\clean_excel\Попозиционный учет законтрактованного 19.10.2023.xlsx'
eqp_cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 15, 16, 18, 19, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 39, 40, 41, 45, 46, 60]
df = pd.read_excel(eqp_file_path, sheet_name='Общий свод', usecols=eqp_cols)

In [3]:
# read supplier file as df
splr_file_path = r'C:\Users\yuriy\Desktop\db_tables\EQPDB\clean_excel\contract_supplier.xlsx'
supplier_df = pd.read_excel(splr_file_path)

In [4]:
col_name = ['asup', 'no', 'lot', 'type', 'set_code', 'kks_code', 'eqp_name', 'itt', 'unit', 'building', 'safety_class', 'qty', 'qty_unit',
            'mass_kg', 'mass_net', 'work_id', 'contract_no', 'contract_date', 'supplier', 'manufacturer', 'supervisor', 'division', 'unit_cost', 'unit_cost_wat','installation_cost', 'adjustment_service_cost', 'total_cost', 'delivery_fca', 'delivery_npp', 'delivery_expected', 'delivery_fact', 'incoming', 'pp2022' ]

supplier_cols = ['object', 'supervisor', 'supplier', 'contract_no', 'contract_date', 'contract_name']
df2 = df.copy()

In [5]:
df2.drop(range(2), inplace=True)
df2.columns = col_name
supplier_df.columns = supplier_cols

In [6]:
#common functions and variables
upper_letters = {
    'А': 'A',
    'В': 'B',
    'С': 'C',
    'Е': 'E',
    'О': 'O',
    'Р': 'P',
    'Х': 'X',
    'М': 'M',
    'К': 'K',
    'Т': 'T',
    'Н': 'H',

}
lower_letters = {
    'а': 'a',
    'с': 'c',
    'е': 'e',
    'о': 'o',
    'р': 'p',
    'х': 'x'
}

def replace_rueng(text, mapping):
    for old_char, new_char in mapping.items():
        text = text.replace(old_char, new_char)
    return text


def replace_engru(text, mapping):
    for old_char, new_char in mapping.items():
        text = text.replace(new_char, old_char)
    return text

In [7]:
#only 10UJA and 10UKC, except UKZ UMY UJZ
df2['building_2'] = ''
df2.loc[df2['building'].str.contains('10UJA', na=False, case=False), 'building_2'] = '10UJA'
df2.loc[df2['building'].str.contains('10UKC', na=False, case=False), 'building_2'] = '10UKC'
filtered_df = df2[df2['building_2'].str.contains('10UJA|10UKC', case=False, na=False )]
filtered_df = filtered_df[~filtered_df['building'].str.contains('UKZ|UMY|UJZ')]

#clean null values 
filtered_df.loc[:,'kks_code'] = filtered_df['kks_code'].replace(['-','–'], pd.NA)
filtered_df = filtered_df.dropna(how='any', subset=['asup','kks_code'])

# letter problem-solving 
filtered_df['asup'] = filtered_df['asup'].str.upper()
filtered_df['kks_code'] = filtered_df['kks_code'].str.upper()
filtered_df['work_id'] = filtered_df['work_id'].str.upper()
filtered_df['set_code'] = filtered_df['set_code'].str.upper()
filtered_df['building'] = filtered_df['building'].str.upper()
filtered_df['type'] = filtered_df['type'].str.capitalize()
filtered_df['type'] = filtered_df['type'].str.strip()
# Remove extra spaces within the string
filtered_df['manufacturer'] = filtered_df['manufacturer'].str.replace(r'\s+', ' ', regex=True)
filtered_df['manufacturer'] = filtered_df['manufacturer'].str.strip()


# Apply the function for upper case characters
filtered_df['asup'] = filtered_df['asup'].apply(lambda x: replace_rueng(x, upper_letters))
filtered_df['kks_code'] = filtered_df['kks_code'].apply(lambda x: replace_rueng(x, upper_letters))
filtered_df['work_id'] = filtered_df.apply(lambda row: replace_rueng(row['work_id'], upper_letters) if isinstance(row['work_id'], str) and  len(row['work_id']) > 8 else row['work_id'], axis=1)
filtered_df['set_code'] = filtered_df.apply(lambda row: replace_rueng(row['set_code'], upper_letters) if isinstance(row['set_code'], str) and  len(row['set_code']) > 8 else row['set_code'], axis=1)
filtered_df['building'] = filtered_df['building'].apply(lambda x: replace_rueng(x, upper_letters))


# split joined values ex: AKKU00195603\nAKKU00195604\nAKKU00195605\nAKKU00025933\nAKKU00195606\nAKKU00195607\nAKKU00189695
split_df = filtered_df[filtered_df['asup'].apply(len)>12]
df_A = split_df.assign(asup=split_df['asup'].str.split('\n')).explode('asup')
df_B = split_df.assign(kks_code=split_df['kks_code'].str.split('\n')).explode('kks_code')

df_A['kks_code'] = df_B['kks_code']
filtered_df = pd.concat([filtered_df, df_A], axis=0)
filtered_df.drop(filtered_df[filtered_df['asup'].str.len() > 12].index, inplace=True)



filtered_df['no'] = filtered_df['no'].astype(dtype='int', copy=True)
filtered_df[['qty', 'mass_kg', 'mass_net']] = filtered_df[['qty', 'mass_kg', 'mass_net']].fillna(0)
filtered_df[['unit_cost', 'unit_cost_wat', 'installation_cost', 'adjustment_service_cost', 'total_cost']] = filtered_df[['unit_cost', 'unit_cost_wat', 'installation_cost', 'adjustment_service_cost', 'total_cost']].fillna(0) 

filtered_df['contract_date'] = pd.to_datetime(filtered_df['contract_date'], errors='coerce')
filtered_df['delivery_fca'] = pd.to_datetime(filtered_df['delivery_fca'], errors='coerce')
filtered_df['delivery_npp'] = pd.to_datetime(filtered_df['delivery_npp'], errors='coerce')
filtered_df['delivery_expected_status'] = filtered_df['delivery_expected'].copy()
filtered_df['delivery_fact_status'] = filtered_df['delivery_fact'].copy()

date_format = '%Y-%m-%d'
filtered_df['delivery_expected_status'] = filtered_df['delivery_expected'].where(pd.to_datetime(filtered_df['delivery_expected'], format=date_format, errors='coerce').isna())
filtered_df.loc[:, 'delivery_expected'] =  pd.to_datetime(filtered_df['delivery_expected'], format=date_format, errors='coerce')
filtered_df['delivery_fact_status'] = filtered_df['delivery_fact'].where(pd.to_datetime(filtered_df['delivery_fact'], format=date_format,  errors='coerce').isna())
filtered_df.loc[:, 'delivery_fact'] =  pd.to_datetime(filtered_df['delivery_fact'], format=date_format, errors='coerce')
filtered_df['delivery_expected_status'] = filtered_df['delivery_expected_status'].str.capitalize()
filtered_df['delivery_fact_status'] = filtered_df['delivery_fact_status'].str.capitalize()

building_re = {
    'UBB': '10UBB',
    'UJB': '10UJB',
    'UJC': '10UJC',
    'UJE': '10UJE',
    '11UBP': '11UBP',
    '12UBP': '12UBP',
    'UKA': '10UKA',
    'UJG': '10UJG',
    'UKC': '10UKC',
    'UJA': '10UJA'
}
for pattern, replacement in building_re.items():
    filtered_df.loc[filtered_df['building'].str.contains(pattern, na=False, case=False), 'building'] = replacement

filtered_df.loc[:, 'manufacturer'] = filtered_df['manufacturer'].str.split('/').str[0]

filtered_df.loc[:, 'eqp_name'] = filtered_df['eqp_name'].str.split('/').str[0]
filtered_df.loc[:, 'eqp_name'] = filtered_df['eqp_name'].str.split('.').str[0]

del split_df, df_A, df_B

In [None]:
#qty_unit
qty_re = {
    'шт': 'pcs',
    'pcs':'psc',
    'кг': 'kg',
    'компл': 'set',
    'Комплект': 'set',
    'к-т': 'set',
    'set': 'set',
    'pc': 'pcs'
}
for old_value, new_value in (qty_re.items()):
    # print(filtered_df.loc[filtered_df['qty_unit'].str.contains(old_value, case=False, na=False), 'qty_unit'])
    filtered_df.loc[filtered_df['qty_unit'].str.contains(old_value, case=False, na=False), 'qty_unit'] = new_value


In [8]:
filtered_df['contract_no'] = filtered_df['contract_no'].str.replace('№', '')
filtered_df['contract_no'] = filtered_df['contract_no'].str.strip()
filtered_df['contract_no'] = filtered_df['contract_no'].astype(str)
supplier_df['contract_no'] = supplier_df['contract_no'].astype(str)
filtered_df = pd.merge(filtered_df, supplier_df[['contract_no', 'supplier']], on=['contract_no'], how='left')
filtered_df = filtered_df.rename(columns={'supplier_y': 'supplier'}).drop(columns=['supplier_x'])

In [9]:
# convert data types
filtered_df['mass_kg'] = pd.to_numeric(filtered_df['mass_kg'], errors='coerce')
filtered_df['mass_net'] = pd.to_numeric(filtered_df['mass_kg'], errors='coerce')
filtered_df['delivery_expected'] = pd.to_datetime(filtered_df['delivery_expected'], errors='coerce')
filtered_df['delivery_fact'] = pd.to_datetime(filtered_df['delivery_fact'], errors='coerce')


In [14]:
df2.loc[df2['building'].str.contains('10UJA', na=False, case=False), ['kks_code', 'building']]

Unnamed: 0,kks_code,building
2,11JNG12AA201,Реакторное здание (10UJA). Код помещения: 10U...
3,11JNG12AA202,Реакторное здание (10UJA). Код помещения: 10U...
4,11JNG12AA203,Реакторное здание (10UJA). Код помещения: 10U...
5,11JNG12AA204,Реакторное здание (10UJA). Код помещения: 10U...
6,11JNG12AA206,Реакторное здание (10UJA). Код помещения: 10U...
...,...,...
124542,12FAK51AA102,10UJA
124543,12FAK51AA401,10UJA
124544,12FAK51AA402,10UJA
124545,12FAK51AA501,10UJA


In [10]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44246 entries, 0 to 44245
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   asup                      44246 non-null  object        
 1   no                        44246 non-null  int32         
 2   lot                       44246 non-null  object        
 3   type                      44246 non-null  object        
 4   set_code                  2486 non-null   object        
 5   kks_code                  44246 non-null  object        
 6   eqp_name                  44246 non-null  object        
 7   itt                       43729 non-null  object        
 8   unit                      44246 non-null  object        
 9   building                  44246 non-null  object        
 10  safety_class              44233 non-null  object        
 11  qty                       44246 non-null  float64       
 12  qty_unit          

In [None]:
max_lengths = filtered_df.map(lambda x: len(str(x))).max()
max_lengths

In [None]:
filtered_df['supplier'].unique()

In [None]:
manufacturer =  ((filtered_df['manufacturer'].dropna().drop_duplicates())
                 .str.replace('Российская Федерация', ''))

In [None]:
manufacturer.loc[manufacturer.str.contains('Dembla Valves', na=False, case=False)] = 'Dembla Valves'
manufacturer.loc[manufacturer.str.contains('ARAKO spol', na=False, case=False)] = 'ARAKO spol. s r.o.'
manufacturer.loc[manufacturer.str.contains('Control Instruments', na=False, case=False)] = 'RK Control Instruments Pvt. Ltd.'
manufacturer.loc[manufacturer.str.contains('ИнтерПолярис', na=False, case=False)] = 'ООО НПП "ИнтерПолярис"'
manufacturer.loc[manufacturer.str.contains('Знамя труда', na=False, case=False)] = 'АО "Завод Знамя труда"'
manufacturer.loc[manufacturer.str.contains('Техзащита', na=False, case=False)] = 'ООО "Техзащита"'
manufacturer.loc[manufacturer.str.contains('Энергоспецмонтаж', na=False, case=False)] = 'ООО "Энергоспецмонтаж"'
manufacturer.loc[manufacturer.str.contains('Спецпроект', na=False, case=False)] = 'ООО "Спецпроект"'
manufacturer.loc[manufacturer.str.contains('Автоматика', na=False, case=False)] = 'ЗАО НПП "Автоматика"'
manufacturer.loc[manufacturer.str.contains('ПФ "ОКА"', na=False, case=False)] = 'ООО ПФ "ОКА"'
manufacturer.loc[manufacturer.str.contains('Гидромаш', na=False, case=False)] = 'ООО Гидромаш-Технология'
manufacturer.loc[manufacturer.str.contains('Катайский насосный завод', na=False, case=False)] = 'АО "Катайский насосный завод"'
manufacturer.loc[manufacturer.str.contains('Атоммаш', na=False, case=False)] = 'АО "Атоммашэкспорт"'
manufacturer.loc[manufacturer.str.contains('МашТЭК', na=False, case=False)] = 'ООО "НПП МашТЭК"'
manufacturer.loc[manufacturer.str.contains('ТИТАН ТЕХНОЛОДЖИ', na=False, case=False)] = 'ООО "ТИТАН ТЕХНОЛОДЖИ ПАЙПЛАЙН"'
manufacturer.loc[manufacturer.str.contains('НИКИМТ-Атомстрой', na=False, case=False)] = 'АО "НИКИМТ-Атомстрой"'
manufacturer.loc[manufacturer.str.contains('Атомспецсервис', na=False, case=False)] = 'ООО "Атомспецсервис"'
manufacturer.loc[manufacturer.str.contains('Контур', na=False, case=False)] = 'ПАО "Контур"'
manufacturer.loc[manufacturer.str.contains('САЗ', na=False, case=False)] = 'ЗАО "САЗ"'
manufacturer.loc[manufacturer.str.contains('Schwingungsisolierungen', na=False, case=False)] = 'GERB Schwingungsisolierungen GmbH & Co KG'
manufacturer.loc[manufacturer.str.contains('Энергопоток', na=False, case=False)] = 'АО "ЗЭО Энергопоток"'
manufacturer.loc[manufacturer.str.contains('Энергепоток', na=False, case=False)] = 'АО "ЗЭО Энергопоток"'
manufacturer.loc[manufacturer.str.contains('ЭНEPГOПOTOK', na=False, case=False)] = 'АО "ЗЭО Энергопоток"'
manufacturer.loc[manufacturer.str.contains('Бекар', na=False, case=False)] = 'ООО НПК "Бекар"'
manufacturer.loc[manufacturer.str.contains('Купол', na=False, case=False)] = 'АО "ИЭМЗ "Купол"'
manufacturer.loc[manufacturer.str.contains('КЦКБА', na=False, case=False)] = 'ЧАО "КЦКБА"'
manufacturer.loc[manufacturer.str.contains('АКТАН', na=False, case=False)] = 'ООО "Актан"'
manufacturer.loc[manufacturer.str.contains('нпо спец', na=False, case=False)] = 'АО "НПО Спецматериалов"'
manufacturer.loc[manufacturer.str.contains('нпо см', na=False, case=False)] = 'АО "НПО Спецматериалов"'

manufacturer.unique()

In [None]:
from fuzzywuzzy import fuzz
def find_similar_values(series, threshold=80):
    similar_values = []

    for i in range(len(series)):
        for j in range(i+1, len(series)):
            similarity = fuzz.ratio(series.iloc[i], series.iloc[j])
            if similarity >= threshold:
                similar_values.append((series.iloc[i], series.iloc[j], similarity))

    return similar_values

# Find similar values
similar_values = find_similar_values(manufacturer)

# Print similar values
for val1, val2, similarity in similar_values:
    if similarity < 100:
        print(f"Similarity: {similarity}%\n{val1}\n{val2}\n")


In [None]:
filtered_df.loc[filtered_df['itt'].apply(lambda x: len(str(x)) if pd.notna(x) else 0) > 20, ['itt', 'asup']]

In [None]:
test_df = filtered_df.loc[filtered_df['set_code'].apply(lambda x: len(str(x)) if pd.notna(x) else 0) > 20, ['set_code', 'asup']]

In [None]:
#output
col_name = ['asup', 'no', 'lot', 'type', 'set_code', 'kks_code', 'eqp_name', 'itt', 'unit', 'building', 'building_2', 'safety_class', 'qty', 'qty_unit', 'mass_kg', 
            'mass_net', 'work_id', 'contract_no', 'contract_date', 'supplier', 'manufacturer', 'supervisor', 'division', 'unit_cost', 'unit_cost_wat',
            'installation_cost', 'adjustment_service_cost', 'total_cost', 'delivery_fca', 'delivery_npp', 'delivery_expected','delivery_expected_status', 'delivery_fact', 'delivery_fact_status', 'incoming', 'pp2022' ]
filtered_df = filtered_df[col_name]

filtered_df.to_excel(r'C:\Users\yuriy\Desktop\db_tables\EQPDB\clean_excel\equipment_clean.xlsx', index=False, float_format="%.4f")

In [None]:
filtered_df['kks_len'] = filtered_df['kks_code'].apply(lambda x: len(str(x)))


In [None]:
filtered_df['kks_len'].value_counts()

In [None]:
filtered_df.loc[filtered_df['kks_len'] == 12, 'kks_code']