In [None]:
# Install necessary libraries if not already installed
!pip install pandas python-docx openpyxl xlrd  pdf2docx
!pip install pdfplumber pdfminer


**Extract text from Word documents**

In [None]:
from docx import Document

# Function to extract text from a Word document
def extract_text_from_docx(docx_path):
    """Return the non-blank paragraphs of a Word document.

    Args:
        docx_path: Path of the .docx file to open.

    Returns:
        List of paragraph texts in document order, each stripped of leading
        and trailing whitespace; paragraphs that are empty after stripping
        are left out.
    """
    stripped_paragraphs = (
        paragraph.text.strip() for paragraph in Document(docx_path).paragraphs
    )
    return [content for content in stripped_paragraphs if content]

def show_first_three_lines_preview(text_data):
    """Print at most the first three entries of ``text_data``, one per line.

    Nothing is printed when ``text_data`` is empty.
    """
    preview = text_data[:3]
    if preview:
        print('\n'.join(map(str, preview)))
# Word document paths (relative to the notebook's working directory)
important_for_beg_doc = '../data/raw/ImportantKnowledgePointsForBeginnersOfInternationalTrade.docx'
bank_account_doc = '../data/raw/BankAccount.docx'
motorcycle_doc = '../data/raw/motorcycle.docx'
motorcycle_2_doc = '../data/raw/motorcycle2.docx'
motorcycle_3_doc = '../data/raw/motorcycle3.docx'
motorcycle_4_doc = '../data/raw/motorcycle4.docx'
motorcycle_5_doc = '../data/raw/motorcycle5.docx'
sales_terms_doc = '../data/raw/SalesTerms.docx'

# PDF document paths
motorcycle_brochure_pdf = '../data/raw/MotorcycleBrochure.pdf'
ideals_company_pdf = '../data/raw/IdealsCompanyNewItems.pdf'

# Extract text data (one list of non-empty paragraph strings per document)
important_for_beg_data = extract_text_from_docx(important_for_beg_doc)
bank_account_data = extract_text_from_docx(bank_account_doc)
motorcycle_data = extract_text_from_docx(motorcycle_doc)
motorcycle_2_data = extract_text_from_docx(motorcycle_2_doc)
motorcycle_3_data = extract_text_from_docx(motorcycle_3_doc)
motorcycle_4_data = extract_text_from_docx(motorcycle_4_doc)
motorcycle_5_data = extract_text_from_docx(motorcycle_5_doc)
sales_terms_data = extract_text_from_docx(sales_terms_doc)

# Preview every extracted document. Driving the previews from a single list
# keeps them in step with the extractions above (the first motorcycle document
# was previously extracted but never shown).
document_previews = [
    ('Important for beginners', important_for_beg_data),
    ('Bank account', bank_account_data),
    ('Motorcycle', motorcycle_data),
    ('Motorcycle 2', motorcycle_2_data),
    ('Motorcycle 3', motorcycle_3_data),
    ('Motorcycle 4', motorcycle_4_data),
    ('Motorcycle 5', motorcycle_5_data),
    ('Sales terms', sales_terms_data),
]

for preview_label, preview_lines in document_previews:
    print(f'\n{preview_label}:')
    show_first_three_lines_preview(preview_lines)



**Extract tables from PDF to CSV**

In [None]:
import pandas as pd
import pdfplumber

# CSV file that receives the first table extracted from the motorcycle brochure PDF
motorcycle_brochure_csv_path = '../data/formatted/MotorcycleBrochure.csv'

def _blank_to_none(cell):
    """Map empty or whitespace-only cell text to None so pandas treats it as missing."""
    if isinstance(cell, str) and not cell.strip():
        return None
    return cell


def extract_tables_from_pdf(pdf_path, table_settings=None):
    """
    Extract all tables from a PDF file

    Args:
        pdf_path: Path to the PDF file
        table_settings: Optional dict of pdfplumber table-finding settings,
            passed unchanged to ``page.extract_tables``. When omitted, a
            text-based strategy with x/y intersection tolerances of 10 is used.

    Returns:
        List of pandas DataFrames, one for each non-empty table found, in
        page order. The first row of each table becomes the column header.
        Blank cells are stored as missing values, and rows/columns that are
        entirely blank are removed (row index labels are not renumbered).
    """
    if table_settings is None:
        table_settings = {
            "vertical_strategy": "text",  # Use text for vertical lines
            "horizontal_strategy": "text",  # Use text for horizontal lines
            "intersection_y_tolerance": 10,  # Adjust if rows are misaligned
            "intersection_x_tolerance": 10   # Adjust if columns are misaligned
        }

    tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables(table_settings=table_settings):
                if not table:  # Skip tables with no rows at all
                    continue

                # First row is the header, the rest is data
                header, *rows = table

                # Blank cells may arrive as '' (or whitespace) rather than None,
                # which dropna would not recognise as missing; normalise them
                # so the clean-up below really removes empty rows and columns.
                rows = [[_blank_to_none(cell) for cell in row] for row in rows]

                df = pd.DataFrame(rows, columns=header)
                # Clean up: remove empty rows and columns
                df = df.dropna(how='all').dropna(axis=1, how='all')
                tables.append(df)

    return tables

# Example usage:
print("Extracting tables from PDF:")
tables = extract_tables_from_pdf(motorcycle_brochure_pdf)

# Print each table
for i, df in enumerate(tables, 1):
    print(f"\nTable {i}:")
    print(df)
    print("-" * 80)

# Save the first table as CSV; a later cell reads this file back with pandas.
if tables:
    first_table = tables[0]
    first_table.to_csv(motorcycle_brochure_csv_path, index=False)
    # Report what was actually done: the CSV is written to disk, not printed.
    print(f"\nFirst table saved as CSV to {motorcycle_brochure_csv_path}")
else:
    # Make the no-output case visible instead of silently skipping the write.
    print(f"\nNo tables found; nothing was written to {motorcycle_brochure_csv_path}")

**Load Excel and CSV files into DataFrames**

In [None]:
import pandas as pd

# Example file paths
product_price_list_file_path = '../data/raw/ProductPriceList.xlsx'
cdk_file_path = '../data/raw/CDKsets.xlsx'
exhibition_brocure_csv_path = '../data/formatted/ExhibitionBrochure.csv'

# Load the first worksheet of each Excel workbook
product_price_list_df = pd.read_excel(product_price_list_file_path, sheet_name=0)  # Specify sheet_name if multiple sheets
cdk_df = pd.read_excel(cdk_file_path, sheet_name=0)

# Load the CSV files. CSV has no worksheets: read_csv does not accept
# sheet_name and raises TypeError when it is passed.
# motorcycle_brochure_csv_path is defined in the PDF table extraction cell,
# which must have been run first.
exhibition_brocure = pd.read_csv(exhibition_brocure_csv_path)
motorcycle_brochure = pd.read_csv(motorcycle_brochure_csv_path)

# Display the first few rows of each DataFrame to inspect the data
print('Product price list:')
print(product_price_list_df.head())

print('CDKsets:')
print(cdk_df.head())

print('Exhibition Brochure:')
print(exhibition_brocure.head())

print('Motorcycle Brochure:')
print(motorcycle_brochure.head())