Clean dataset

In [13]:
import pandas as pd
import openpyxl  # noqa: F401

def is_unnamed(header):
    return str(header).startswith("Unnamed")

def validate_first_sheet(df):
    return (
        is_unnamed(df.columns[1])
        and is_unnamed(df.columns[2])
        and is_unnamed(df.columns[3])
        and is_unnamed(df.columns[4])
        and is_unnamed(df.columns[5])
        and not pd.isna(df.columns[0])
        and not pd.isna(df.columns[6])
    )

def validate_other_sheet(df):
    return (
        df.shape[1] == 6
        and not is_unnamed(df.columns[1])
        and not is_unnamed(df.columns[2])
        and not is_unnamed(df.columns[3])
        and not is_unnamed(df.columns[4])
        and is_unnamed(df.columns[0])
    )

def process_files(articles_file_path, output_file_path):
    # Load the Excel file
    xls = pd.ExcelFile(articles_file_path)

    # Validate and load relevant sheets
    relevant_sheets = []
    first_sheet_validated = False

    for sheet_name in xls.sheet_names:
        df = pd.read_excel(xls, sheet_name=sheet_name, header=0)
        if not first_sheet_validated:
            if validate_first_sheet(df):
                relevant_sheets.append((sheet_name, df))
                first_sheet_validated = True
            else:
                raise ValueError("First sheet is not valid")
        else:
            if validate_other_sheet(df):
                relevant_sheets.append((sheet_name, df))

    if not relevant_sheets:
        raise ValueError("No relevant sheets found in the Excel file")

    # Process the first sheet
    first_sheet_df = relevant_sheets[0][1].drop(columns=["mgs210"])

    # Align and merge other relevant sheets
    aligned_sheets = [first_sheet_df]
    for sheet_name, df in relevant_sheets[1:]:
        df.columns = first_sheet_df.columns
        aligned_sheets.append(df)

    # Concatenate all aligned sheets
    df_combined = pd.concat(aligned_sheets, ignore_index=True)

    # Drop rows where the third column (index 2) is empty or "."
    df_combined = df_combined[~df_combined.iloc[:, 2].isin(["", "."])]
    
    # Drop the "STAMPA LISTINI" column
    df_combined = df_combined.drop(columns=["STAMPA LISTINI"])
    
    # Print the number of columns for debugging
    print("Number of columns in df_combined:", len(df_combined.columns))
    print("Column names in df_combined:", df_combined.columns.tolist())

    # Rename the columns
    df_combined.columns = [
        "CODICE PRODOTTO",
        "BRAND",
        "DESCRIZIONE",
        "GIACENZA",
        "PRZ. ULT. ACQ.",
    ]

    # Add new columns
    df_combined["CODICE OE"] = pd.NA
    df_combined["CODICI CROSS"] = pd.NA
    df_combined["LINK IMMAGINE"] = pd.NA
    df_combined["CATEGORIA"] = pd.NA
    df_combined["SCHEDA TECNICA"] = pd.NA
    df_combined["SCHEDA DI SICUREZZA"] = pd.NA
    df_combined["CONFEZIONE"] = pd.NA
    df_combined["QUANTITÀ MINIMA"] = pd.NA
    df_combined["META.LUNGHEZZA"] = pd.NA
    df_combined["META.LARGHEZZA"] = pd.NA
    df_combined["META.PROFONDITA'"] = pd.NA
    df_combined["META. ..."] = pd.NA

    # Reorder the columns
    df_combined = df_combined[
        [
            "CODICE PRODOTTO",
            "CODICE OE",
            "CODICI CROSS",
            "BRAND",
            "DESCRIZIONE",
            "LINK IMMAGINE",
            "CATEGORIA",
            "PRZ. ULT. ACQ.",
            "GIACENZA",
            "SCHEDA TECNICA",
            "SCHEDA DI SICUREZZA",
            "CONFEZIONE",
            "QUANTITÀ MINIMA",
            "META.LUNGHEZZA",
            "META.LARGHEZZA",
            "META.PROFONDITA'",
            "META. ...",
        ]
    ]

    # Convert 'GIACENZA' and 'PRZ. ULT. ACQ.' columns to numeric types
    df_combined["GIACENZA"] = pd.to_numeric(
        df_combined["GIACENZA"].str.replace(",", "."), errors="coerce"
    )
    df_combined["PRZ. ULT. ACQ."] = pd.to_numeric(
        df_combined["PRZ. ULT. ACQ."].str.replace(",", "."), errors="coerce"
    )

    # Clean the data
    df_combined = df_combined[df_combined["GIACENZA"] > 0]
    df_combined = df_combined[df_combined["PRZ. ULT. ACQ."].notna()]

    # Save the cleaned data to a CSV file
    df_combined.to_csv(output_file_path, index=False)

# Placeholders for file paths
articles_file_path = "Z:/My Drive/rcs/oem cross/Articles26062024.xls"
output_file_path = "Z:/My Drive/rcs/oem cross/cleaned_Articles26062024.csv"

process_files(articles_file_path, output_file_path)


Number of columns in df_combined: 5
Column names in df_combined: ['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5']


#MERGE OE CODES, BY APPENDING THEM#

In [20]:
import pandas as pd

def aggregate_oem_numbers(oems_df):
    # Ensure 'oem_number' and 'article_alt' columns are treated as strings
    oems_df['oem_number'] = oems_df['oem_number'].astype(str)
    oems_df['article_alt'] = oems_df['article_alt'].astype(str)
    
    # Aggregate OEM numbers for each article_alt
    oems_agg = oems_df.groupby('article_alt')['oem_number'].apply(lambda x: ' | '.join(x)).reset_index()
    return oems_agg

def append_oem_numbers(df_cleaned, oems_agg):
    # Ensure 'CODICE PRODOTTO' is treated as a string
    df_cleaned['CODICE PRODOTTO'] = df_cleaned['CODICE PRODOTTO'].astype(str)

    # Strip leading/trailing spaces and convert to uppercase for both columns
    df_cleaned['CODICE PRODOTTO'] = df_cleaned['CODICE PRODOTTO'].str.strip().str.upper()
    oems_agg['article_alt'] = oems_agg['article_alt'].str.strip().str.upper()

    # Merge the cleaned dataframe with the aggregated OEM numbers
    merged_df = pd.merge(df_cleaned, oems_agg, left_on='CODICE PRODOTTO', right_on='article_alt', how='left')

    # Rename the column for clarity
    merged_df.rename(columns={'oem_number': 'CODICE OE'}, inplace=True)

    # Drop the article_alt column as it's no longer needed
    merged_df.drop(columns=['article_alt'], inplace=True)

    return merged_df

def process_and_merge_files(cleaned_csv_path, oems_file_path, output_file_path):
    # Load the cleaned articles CSV file
    df_cleaned = pd.read_csv(cleaned_csv_path)

    # Load the OEM CSV file with specified delimiter and as string type
    oems_df = pd.read_csv(oems_file_path, delimiter=';', dtype=str)

    # Aggregate the OEM numbers
    oems_agg = aggregate_oem_numbers(oems_df)

    # Append OEM numbers to the cleaned dataframe
    df_result = append_oem_numbers(df_cleaned, oems_agg)

    # Save the final dataframe to a CSV file
    df_result.to_csv(output_file_path, index=False)

    # Print some sample values for debugging
    print("Sample CODICE PRODOTTO values:")
    print(df_cleaned['CODICE PRODOTTO'].head(10))  # Print more rows for inspection
    print("Sample article_alt values:")
    print(oems_agg['article_alt'].head(10))  # Print more rows for inspection
    print("Sample merged CODICE OE values:")
    print(df_result[['CODICE PRODOTTO', 'CODICE OE']].head(10))  # Print more rows for inspection

    # Print unique values in both columns for further inspection
    print("Unique CODICE PRODOTTO values:")
    print(df_cleaned['CODICE PRODOTTO'].unique()[:10])  # Print more unique values
    print("Unique article_alt values:")
    print(oems_agg['article_alt'].unique()[:10])  # Print more unique values

# Placeholders for file paths
articles_file_path = "Z:/My Drive/rcs/oem cross/cleaned_Articles26062024.csv"
oems_file_path = "Z:/My Drive/rcs/oem cross/oems.csv"
output_file_path = "Z:/My Drive/rcs/oem cross/OEMERGED_cleaned_Articles26062024.csv"

# Run the process
process_and_merge_files(articles_file_path, oems_file_path, output_file_path)

Sample CODICE PRODOTTO values:
0    C1E017ABE
1    C2E001ABE
2    C3E003ABE
3    C3E004ABE
4    C3E006ABE
5    C4E000ABE
6        K2012
7        K2013
8        K2014
9        K2015
Name: CODICE PRODOTTO, dtype: object
Sample article_alt values:
0     1268110M
1     1268650M
2     1277A740
3     1327A081
4     1327A501
5    66021626A
6    66021627A
7      880712Z
8      880725Z
9    8HC3022FS
Name: article_alt, dtype: object
Sample merged CODICE OE values:
  CODICE PRODOTTO CODICE OE                                          CODICE OE
0       C1E017ABE       NaN                                                NaN
1       C2E001ABE       NaN                                                NaN
2       C3E003ABE       NaN                                                NaN
3       C3E004ABE       NaN                                                NaN
4       C3E006ABE       NaN                                                NaN
5       C4E000ABE       NaN                                       

In [25]:
import pandas as pd

def split_csv_to_excel_sheets(input_csv_path, output_excel_path, rows_per_sheet=500000):
    # Create a Pandas Excel writer using openpyxl as the engine
    writer = pd.ExcelWriter(output_excel_path, engine='openpyxl')

    # Read the large CSV file in chunks
    chunk_iter = pd.read_csv(input_csv_path, chunksize=rows_per_sheet, delimiter=';', dtype=str )

    # Initialize sheet number
    sheet_number = 1

    for chunk in chunk_iter:
        # Write each chunk to a new sheet
        sheet_name = f'Sheet{sheet_number}'
        chunk.to_excel(writer, sheet_name=sheet_name, index=False)
        sheet_number += 1

    # Save the Excel file
    writer.save()

# Placeholders for file paths
input_csv_path = "Z:/My Drive/rcs/oem cross/oems_id.csv"
output_excel_path = "Z:/My Drive/rcs/oem cross/oems_id.xlsx"

# Run the split function
split_csv_to_excel_sheets(input_csv_path, output_excel_path, rows_per_sheet=500000)


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_large_csv_file.csv'

In [27]:
import pandas as pd
from tqdm import tqdm

def split_csv_to_excel_sheets(input_csv_path, output_excel_path, rows_per_sheet=500000):
    # Create a Pandas Excel writer using xlsxwriter as the engine
    writer = pd.ExcelWriter(output_excel_path, engine='xlsxwriter')

    # Read the large CSV file in chunks
    chunk_iter = pd.read_csv(input_csv_path, chunksize=rows_per_sheet, delimiter=';', dtype=str )

    # Initialize sheet number
    sheet_number = 1

    for chunk in tqdm(chunk_iter, desc="Processing chunks"):
        # Write each chunk to a new sheet
        sheet_name = f'Sheet{sheet_number}'
        chunk.to_excel(writer, sheet_name=sheet_name, index=False)
        sheet_number += 1

    # Close the Excel writer to save the file
    writer.close()

# Placeholders for file paths
input_csv_path = "Z:/My Drive/rcs/oem cross/oems_id.csv"
output_excel_path = "Z:/My Drive/rcs/oem cross/oems_id.xlsx"

# Run the split function
split_csv_to_excel_sheets(input_csv_path, output_excel_path, rows_per_sheet=1000000)



Processing chunks: 0it [00:00, ?it/s]