In [24]:
# Install necessary libraries if not already installed
!pip install pandas python-docx openpyxl xlrd  pdf2docx
!pip install pdfplumber pdfminer




**Extract documents**

In [25]:
from docx import Document

# Function to extract text from a Word document
def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    text_data = []
    for para in doc.paragraphs:
        text = para.text.strip()
        if text:  # Only add non-empty lines
            text_data.append(text)
    return text_data

def show_first_three_lines_preview(text_data):
    for line in text_data[:3]:  # Show first 3 lines for a preview
        print(line)

# Example Word document path
important_for_beg_doc = '../data/raw/ImportantKnowledgePointsForBeginnersOfInternationalTrade.docx'
bank_account_doc = '../data/raw/BankAccount.docx'
motorcycle_doc = '../data/raw/motorcycle.docx'
motorcycle_2_doc = '../data/raw/motorcycle2.docx'
motorcycle_3_doc = '../data/raw/motorcycle3.docx'
motorcycle_4_doc = '../data/raw/motorcycle4.docx'
motorcycle_5_doc = '../data/raw/motorcycle5.docx'
sales_terms_doc = '../data/raw/SalesTerms.docx'

motorcycle_brochure_pdf = '../data/raw/MotorcycleBrochure.pdf'
ideals_company_pdf = '../data/raw/IdealsCompanyNewItems.pdf'

# Extract text data
important_for_beg_data = extract_text_from_docx(important_for_beg_doc)
bank_account_data = extract_text_from_docx(bank_account_doc)
motorcycle_data = extract_text_from_docx(motorcycle_doc)
motorcycle_2_data = extract_text_from_docx(motorcycle_2_doc)
motorcycle_3_data = extract_text_from_docx(motorcycle_3_doc)
motorcycle_4_data = extract_text_from_docx(motorcycle_4_doc)
motorcycle_5_data = extract_text_from_docx(motorcycle_5_doc)
sales_terms_data = extract_text_from_docx(sales_terms_doc)

# Display the extracted text data
print('\nImportant for beginners:', end='\n')
show_first_three_lines_preview(important_for_beg_data)

print('\nBank account:', end='\n')
show_first_three_lines_preview(bank_account_data)

print('\nMotorcycle 2:', end='\n')
show_first_three_lines_preview(motorcycle_2_data)

print('\nMotorcycle 3:', end='\n')
show_first_three_lines_preview(motorcycle_3_data)

print('\nMotorcycle 4:', end='\n')
show_first_three_lines_preview(motorcycle_4_data)

print('\nMotorcycle 5:', end='\n')
show_first_three_lines_preview(motorcycle_5_data)

print('\nSales terms:', end='\n')
show_first_three_lines_preview(sales_terms_data)




Important for beginners:
Import business: 
inquiry, offer, counter-offer, acceptance (market research/finding and investigating upstream and downstream customers) → contract signing (determining transaction method and payment method) → opening letter of credit → purchasing insurance, chartering a ship → import inspection and customs declaration → import payment (bank review and payment). 

Export business: 
inquiry, offer, counter-offer, acceptance → contract signing → stock preparation, inspection → certificate review → container/cabin booking → export customs clearance (customs declaration) → export collection (bank delivery and settlement) → export tax rebate. 

Several important points of knowledge that must be understood when engaging in international trade: 
1. Subject matter of international trade: 
Commodity name (Commodity) 
Commodity details (quality, quantity, specifications, etc.) 

2. The most commonly used international trade terms (price terms): 
CFR (Cost and Freight) 

Convert text rows of specifications to CSV

In [26]:
import re
import pandas as pd
from docx import Document

def parse_motorcycle_text(doc_path):
    # Define all possible fields to ensure consistency across entries
    all_fields = [
        "Model Name",
        "Reference Price (Yuan)", "Brand", "Model", "Production Method", 
        "Origin", "Engine", "Max Horsepower (Ps)", "Max Power/Speed (kW/rpm)", 
        "Max Torque/Speed (N·m/rpm)", "Transmission", "ABS", "CBS", 
        "Dimensions (LxWxH mm)", "Seat Height (mm)", "Curb Weight (kg)", 
        "Fuel Tank Capacity (L)", "Max Speed (km/h)", 
        "Official Average Fuel Consumption (L/100km)", 
        "Measured Average Fuel Consumption (L/100km)", "Range (km)", 
        "Warranty", "Available Colors"
    ]

    # Patterns to identify each field with generic regex patterns
    patterns = {
        "Model Name": re.compile(r'^\s*([^\n]+?)\s*(?=基本参数)', re.MULTILINE),
        "Reference Price (Yuan)": re.compile(r'Reference price \(yuan\)\s*([\d]+|Currently no quotation available)', re.IGNORECASE),
        "Brand": re.compile(r'Brand\s*(\w+)', re.IGNORECASE),
        "Model": re.compile(r'Model(?:,\s*([^,]+),\s*Production Method,|[:\s]+([^,\n]+?)(?:\s+Production Method|$))', re.IGNORECASE),
        "Production Method": re.compile(r'Production Method(?:,\s*([^,\n]+)|[:\s]+([^\n,]+))', re.IGNORECASE),
        "Origin": re.compile(r'Origin:\s*(.+?)(?:,|$)', re.IGNORECASE),
        "Engine": re.compile(r'Engine\s*(.+?)\s*maximum horsepower', re.IGNORECASE),
        "Max Horsepower (Ps)": re.compile(r'maximum horsepower \(Ps\)\s*([\d.]+)', re.IGNORECASE),
        "Max Power/Speed (kW/rpm)": re.compile(r'Maximum power/speed \(kW/rpm\)\s*([\d./]+)', re.IGNORECASE),
        "Max Torque/Speed (N·m/rpm)": re.compile(r'Maximum torque/speed \(N·m/rpm\)\s*([\d./]+)', re.IGNORECASE),
        "Transmission": re.compile(r'Transmission\s*(.+?)\s*ABS', re.IGNORECASE),
        "ABS": re.compile(r'ABS\s(.+?)\s', re.IGNORECASE),
        "CBS": re.compile(r'CBS\s(.+?)\s', re.IGNORECASE),
        "Dimensions (LxWxH mm)": re.compile(r'Length x Width x Height \(mm\)\s([\d.x]+)', re.IGNORECASE),
        "Seat Height (mm)": re.compile(r'Seat height \(mm\)\s([\d.]+)', re.IGNORECASE),
        "Curb Weight (kg)": re.compile(r'curb weight \(kg\)\s([\d.]+)', re.IGNORECASE),
        "Fuel Tank Capacity (L)": re.compile(r'Fuel tank capacity \(L\)\s([\d.]+)', re.IGNORECASE),
        "Max Speed (km/h)": re.compile(r'maximum speed \(km/h\)\s([\d.]+)', re.IGNORECASE),
        "Official Average Fuel Consumption (L/100km)": re.compile(r'Official average fuel consumption \(L/100km\)\s([\d.]+|-)'),
        "Measured Average Fuel Consumption (L/100km)": re.compile(r'measured average fuel consumption \(L/100km\)\s([\d.]+|-)'),
        "Range (km)": re.compile(r'Range \(km\)\s([\d.]+|-)', re.IGNORECASE),
        "Warranty": re.compile(r'Vehicle Warranty\s(.+?)\s', re.IGNORECASE),
        "Available Colors": re.compile(r'Available colors:\s*(.+)', re.IGNORECASE)
    }

    # Load the document
    doc = Document(doc_path)
    text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])

    # Initialize list to hold each motorcycle's data
    motorcycles = []

    # Find all model names to use as entry separators
    model_starts = list(patterns["Model Name"].finditer(text))
    
    # Process each entry based on model name positions
    for i in range(len(model_starts)):
        start = model_starts[i].start()
        # If this is the last entry, go to end of text, otherwise go to start of next entry
        end = model_starts[i + 1].start() if i < len(model_starts) - 1 else len(text)
        entry = text[start:end]

        print(f"\nProcessing entry {i}:")
        print("Raw entry:", entry[:200])  # Print first 200 chars of each entry
        
        entry_data = {field: None for field in all_fields}

        # Extract model name
        model_name_match = patterns["Model Name"].search(entry)
        if model_name_match:
            entry_data["Model Name"] = model_name_match.group(1).strip()
            # Remove the matched model name portion from the entry text
            entry = entry[model_name_match.end():]
        
        # Remove non-ASCII characters
        entry = ''.join([char for char in entry if ord(char) < 128])
        
        # Apply each regex pattern to find corresponding data
        for field, pattern in patterns.items():
            match = pattern.search(entry)
            if match:
                # Safely handle the groups
                matched_groups = [g for g in match.groups() if g is not None]
                if matched_groups:  # Make sure we have at least one match
                    value = matched_groups[0].strip()
                    entry_data[field] = value
                    print(f"Found {field}: '{value}'")  # Debug print
                else:
                    print(f"No value found for {field} despite pattern match")
            else:
                print(f"No match for {field}")

        # Only add entry if there's at least one value to avoid empty rows
        if any(entry_data.values()):
            motorcycles.append(entry_data)

    # Convert to DataFrame
    df = pd.DataFrame(motorcycles)
    return df

# Parse the sample text
file_path = '../data/raw/motorcycle.docx'
df = parse_motorcycle_text(file_path)
output_csv_path = '../data/formatted/motorcycle_specs.csv'
df.to_csv(output_csv_path, index=False)
print(f"Data saved to {output_csv_path}")

file_path = '../data/raw/motorcycle2.docx'
df = parse_motorcycle_text(file_path)
output_csv_path = '../data/formatted/motorcycle2_specs.csv'
df.to_csv(output_csv_path, index=False)
print(f"Data saved to {output_csv_path}")

file_path = '../data/raw/motorcycle3.docx'
df = parse_motorcycle_text(file_path)
output_csv_path = '../data/formatted/motorcycle3_specs.csv'
df.to_csv(output_csv_path, index=False)
print(f"Data saved to {output_csv_path}")

file_path = '../data/raw/motorcycle4.docx'
df = parse_motorcycle_text(file_path)
output_csv_path = '../data/formatted/motorcycle4_specs.csv'
df.to_csv(output_csv_path, index=False)
print(f"Data saved to {output_csv_path}")

file_path = '../data/raw/motorcycle5.docx'
df = parse_motorcycle_text(file_path)
output_csv_path = '../data/formatted/motorcycle5_specs.csv'
df.to_csv(output_csv_path, index=False)
print(f"Data saved to {output_csv_path}")




Processing entry 0:
Raw entry: QJMOTOR 赛150
基本参数
查看详细参数
参考价(元) 13999 品牌 QJMOTOR
车型 跑车 生产方式 国产
产地 中国 厂家 浙江钱江摩托股份有限公司
发动机 单缸四冲程 水冷·150cc 最大马力(Ps) 17.0
最大功率/转速(kW/rpm) 12.5/9250 最大扭矩/转速(N·m/rpm) 13.5/7250
变速器 国际6挡 ABS 标配前后
CBS - 长x宽x高
No match for Model Name
Found Reference Price (Yuan): '13999'
Found Brand: 'QJMOTOR'
Found Model: 'sports car'
Found Production Method: 'domestic production'
Found Origin: 'Zhejiang Qianjiang Motorcycle Co.'
Found Engine: 'single cylinder four stroke water-cooled  150cc'
Found Max Horsepower (Ps): '17.0'
Found Max Power/Speed (kW/rpm): '12.5/9250'
No match for Max Torque/Speed (N·m/rpm)
Found Transmission: 'international 6-speed'
Found ABS: 'standard'
Found CBS: '-'
Found Dimensions (LxWxH mm): '2010x770x1125'
Found Seat Height (mm): '765'
Found Curb Weight (kg): '145'
Found Fuel Tank Capacity (L): '12.5'
Found Max Speed (km/h): '99'
Found Official Average Fuel Consumption (L/100km): '-'
No match for Measured Average Fuel Consumption (L/100km)
Found Range (

**Extract tables from PDF to CSV**

Extract tables from PDFs and convert to dataframes

In [27]:
import pdfplumber

motorcycle_brochure_csv_path = '../data/formatted/brochure_motorcycle.csv'

def extract_tables_from_pdf(pdf_path):
    """
    Extract all tables from a PDF file
    
    Args:
        pdf_path: Path to the PDF file
    
    Returns:
        List of pandas DataFrames, one for each table found
    """
    tables = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract tables from the page
            page_tables = page.extract_tables(
                table_settings={
                    "vertical_strategy": "text",  # Use text for vertical lines
                    "horizontal_strategy": "text",  # Use text for horizontal lines
                    "intersection_y_tolerance": 10,  # Adjust if rows are misaligned
                    "intersection_x_tolerance": 10   # Adjust if columns are misaligned
                }
            )
            
            # Convert each table to a pandas DataFrame
            for table in page_tables:
                if table:  # Check if table is not empty
                    # Transpose the table to make first column the header
                    table_transposed = list(map(list, zip(*table)))
                    # Get header from first column
                    header = table_transposed[0]
                    # Get data from remaining columns
                    data = table_transposed[1:]
                    # Create DataFrame with transposed orientation
                    df = pd.DataFrame(data, columns=header).T
                    # Clean up: remove empty rows and columns
                    df = df.dropna(how='all').dropna(axis=1, how='all')
                    tables.append(df)

    # Reorient the dataframes to have proper header and row orientation
    reoriented_tables = []
    for df in tables:
        # Get the first column as the index (features)
        df = df.set_index(df.columns[0])
        # Transpose so features become columns and items become rows
        df = df.T
        # Reset index to make the item names a regular column
        df = df.reset_index()
        # Clean up column names
        df.columns = df.columns.str.strip()
        reoriented_tables.append(df)
    
    tables = reoriented_tables
    return tables

# Example usage:
print("Extracting tables from PDF:")
tables = extract_tables_from_pdf(motorcycle_brochure_pdf)

# Print each table
for i, df in enumerate(tables, 1):
    print(f"\nTable {i}:")
    print(df)
    print("-" * 80)

# If you need to extract a specific table, you can access it by index
if tables:
    first_table = tables[0]
    print("\nFirst table as CSV:")
    first_table.to_csv(motorcycle_brochure_csv_path, index=False)

Extracting tables from PDF:

Table 1:
0  index  Model   Product Dimensions(mm)   Wheel Base(mm)   Net Weight(kg)    \
0      1  MT150            1960×820×1220             1310              107     
1      2  MT150            1960×820×1220             1310              107     

0 Max.loading capacity(kg)  ... parts）(PCS)   Container loading （SKD      \
0                      150  ...                                            
1                      150  ...                                            

0 whole vehicle   packaging）(SET)            Price  
0     126/40’HQ                          RMB:3550元  
1     126/40’HQ                      RMB:3550 yuan  

[2 rows x 89 columns]
--------------------------------------------------------------------------------

First table as CSV:


**Extract dataframes**

[Download the GPT formatted ExhibitionBrochure parsed for CSV save to ../data/formatted/](https://www.dropbox.com/scl/fi/wxvqtbd9pdt88b4xe8vu5/ExhibitionBrochure.csv?rlkey=9sqplrh0wunzetjxv4n4bbx94&st=wsgahe5i&dl=1)

[Download the GPT formatted keywords_sales parsed for CSV save to ../data/checkpoints/](https://www.dropbox.com/scl/fi/336j34ocykkbzsatboqfu/keywords_sales.csv?rlkey=cz3axf32ccf9k8ldjfc5tbcsm&st=qnxx96pl&dl=1)

In [28]:
import pandas as pd

# Example file paths
product_price_list_file_path = '../data/raw/ProductPriceList.xlsx'
cdk_file_path = '../data/raw/CDKsets.xlsx'
exhibition_brocure_csv_path = '../data/formatted/brochure_exhibition.csv'

# Load Excel and CSV files into DataFrames
product_price_list_df = pd.read_excel(product_price_list_file_path, sheet_name=0)  # Specify sheet_name if multiple sheets
cdk_df = pd.read_excel(cdk_file_path, sheet_name=0)
exhibition_brocure = pd.read_csv(exhibition_brocure_csv_path)
motorcycle_brochure = pd.read_csv(motorcycle_brochure_csv_path)

# Display the first few rows of each DataFrame to inspect the data
print('Product price list:')
print(product_price_list_df.head())

print('CDKsets:')
print(cdk_df.head())

print('Exhibition Brochure:')
print(exhibition_brocure.head())

print('Motorcycle Brochure:')
print(motorcycle_brochure.head())

Product price list:
   Unnamed: 0       Unnamed: 1 Unnamed: 2          Unnamed: 3 Unnamed: 4  \
0         NaN     company name        NaN  Product Price List        NaN   
1         NaN  company address        NaN                 NaN        NaN   
2         NaN      P: telphone        NaN                 NaN        NaN   
3         NaN          website        NaN                 NaN        NaN   
4         NaN              NaN        NaN                 NaN        NaN   

  Unnamed: 5  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  
CDKsets:
  装        箱         单                             Unnamed: 1 Unnamed: 2  \
0        合同号：0711B2126                                    NaN        NaN   
1                 品  名            摩       托       车  (CKD)           NaN   
2                型   号                                   箱  号         数量   
3                CG125                             0001-1715        800套   
4                电器件组件                      

  warn("""Cannot parse header or footer so it will be ignored""")
