In [27]:
from bs4 import BeautifulSoup
import os
import re
import csv
import pandas as pd

In [28]:
# Output CSV file; passed to process_folder() below, which appends rows to it.
csv_file_path = r'C:\Users\hp\OneDrive\Desktop\assM\car_information.csv'

In [29]:
# Column names for the output CSV, in the same order as the row returned by
# process_text_file().
csv_header = [
    'Model',
    'Year',
    'Price',
    'Additions',
    'Previous Owners',
    'Offered',
    'Payment Method',
    'Number of Passengers',
    'Car Mileage',
    'Engine Power',
    'Glass',
    'Transmission Type',
    'Car License',
    'Car Origin',
    'Fuel Type',
    'Car Color',
    'Third Party',
    'Comprehensive (Vehicle Body)',
    'Total Insurance',
]


In [30]:
# Input folder; passed to process_folder() below, which reads its .txt files.
folder_path = r'C:\Users\hp\OneDrive\Desktop\data'

#### Create a CSV file with the specified header if it doesn't exist.

In [31]:
def create_csv(csv_file_path, csv_header):
    """
    Write a CSV file containing only the header row, unless the path already exists.

    Parameters:
    - csv_file_path (str): The path to the CSV file.
    - csv_header (list): The header to be written to the CSV file.
    """
    if os.path.exists(csv_file_path):
        # An existing file is left untouched so earlier rows are not overwritten.
        return

    # newline='' lets the csv module control line endings itself.
    with open(csv_file_path, 'w', encoding='utf-8', newline='') as header_file:
        csv.writer(header_file).writerow(csv_header)

#### Extract insurance-related data from the BeautifulSoup object.

In [32]:
def extract_insurance_data(soup):
    """
    Read the three insurance amounts from the first element with class 'row'.

    NOTE: a second definition with the same name follows in the next cell and
    takes precedence at runtime.

    Parameters:
    - soup (BeautifulSoup): The BeautifulSoup object representing the HTML content.

    Returns:
    - tuple: A tuple containing third_party_value, supplementary_value, and
      total_insurance_value. Each entry is an int, or None when the label cell,
      the value cell, or a parsable integer is missing.
    """
    insurance_table = soup.find(class_='row')
    if insurance_table is None:
        return None, None, None

    def amount_for(label):
        # Locate the <td> holding the label, then take the next <td> as its value.
        label_cell = insurance_table.find('td', string=label)
        value_cell = label_cell.find_next('td') if label_cell is not None else None
        if value_cell is None:
            return None
        # Drop thousands separators; anything still non-numeric yields None
        # instead of aborting the whole extraction with a ValueError.
        text = value_cell.get_text(strip=True).replace(',', '')
        try:
            return int(text)
        except ValueError:
            return None

    third_party_value = amount_for('طرف ثالث')
    supplementary_value = amount_for('التكميلي (جسم المركبة)')
    total_insurance_value = amount_for('المجموع')
    return third_party_value, supplementary_value, total_insurance_value


#### Extract insurance-related data from the BeautifulSoup object (repeated cell; this definition overrides the one above).

In [33]:
def extract_insurance_data(soup):
    """
    Read the three insurance amounts from the first element with class 'row'.

    This redefines the function of the same name from the previous cell.

    Parameters:
    - soup (BeautifulSoup): The BeautifulSoup object representing the HTML content.

    Returns:
    - tuple: A tuple containing third_party_value, supplementary_value, and
      total_insurance_value. Each entry is an int, or None when the label cell,
      the value cell, or a parsable integer is missing.
    """
    insurance_table = soup.find(class_='row')
    if insurance_table is None:
        return None, None, None

    def amount_for(label):
        # Locate the <td> holding the label, then take the next <td> as its value.
        label_cell = insurance_table.find('td', string=label)
        value_cell = label_cell.find_next('td') if label_cell is not None else None
        if value_cell is None:
            return None
        # Drop thousands separators; anything still non-numeric yields None
        # instead of aborting the whole extraction with a ValueError.
        text = value_cell.get_text(strip=True).replace(',', '')
        try:
            return int(text)
        except ValueError:
            return None

    third_party_value = amount_for('طرف ثالث')
    supplementary_value = amount_for('التكميلي (جسم المركبة)')
    total_insurance_value = amount_for('المجموع')
    return third_party_value, supplementary_value, total_insurance_value

#### Extract car details from the table rows of the HTML content.

In [34]:
def extract_car_details(table_rows, soup):
    """
    Build an attribute -> value mapping from two-column table rows.

    Parameters:
    - table_rows (ResultSet): ResultSet of table rows from BeautifulSoup.
    - soup (BeautifulSoup): The BeautifulSoup object representing the HTML content.
      Not used by this function.

    Returns:
    - dict: A dictionary containing car details. The first <td> of each row is
      the key and the second <td> is the value, both stripped of whitespace.
    """
    cells_per_row = (row.find_all('td') for row in table_rows)
    # Rows with fewer than two cells carry no attribute/value pair and are skipped.
    return {
        cells[0].text.strip(): cells[1].text.strip()
        for cells in cells_per_row
        if len(cells) >= 2
    }

#### Extract the list of additions from the BeautifulSoup object.

In [35]:
def extract_additions_list(soup):
    """
    Collect the text of every <li> in the additions cell.

    Parameters:
    - soup (BeautifulSoup): The BeautifulSoup object representing the HTML content.

    Returns:
    - list: A list containing additions, or None if either the label cell or
      the following 'list-additions' cell is not found.
    """
    additions_section = soup.find('td', string='إضافات')
    if additions_section is None:
        return None

    # The label may be present without a following 'list-additions' cell;
    # treat that as "not found" instead of failing on None.find_all().
    additions_cell = additions_section.find_next('td', class_='list-additions')
    if additions_cell is None:
        return None

    return [li.text.strip() for li in additions_cell.find_all('li')]


#### Process a text file, extract relevant information, and return a list of data.

In [36]:
def process_text_file(file_path):
    """
    Parse one saved HTML page and return its car data as a flat row.

    Missing page elements (no <h3>, no <h5>, empty <h5>, price text without
    digits) produce None for the affected field instead of raising.

    Parameters:
    - file_path (str): The path to the text file.

    Returns:
    - list: A list containing extracted data, in this order: name, last word of
      the <h5> heading, price, additions, previous owners, offered, payment
      method, passengers, mileage, engine power, glass, transmission, license,
      origin, fuel type, color, third party, comprehensive, total insurance.
    """
    # Read the whole file first so the handle is closed before parsing starts.
    with open(file_path, 'r', encoding='utf-8') as txt_file:
        txt_content = txt_file.read()
    soup = BeautifulSoup(txt_content, 'html.parser')

    heading = soup.h3
    car_name = heading.get_text(strip=True) if heading is not None else None

    # Only the last whitespace-separated word of the <h5> is kept
    # (presumably the model year -- verify against the page layout).
    subheading = soup.h5
    subheading_words = subheading.get_text(strip=True).split() if subheading is not None else []
    car_model = subheading_words[-1] if subheading_words else None

    # Price is the first run of digits in the 'post-price' element, kept as a string.
    price_element = soup.find(class_='post-price')
    price_match = re.search(r'\d+', price_element.get_text(strip=True)) if price_element else None
    car_price = price_match.group() if price_match else None

    third_party_value, supplementary_value, total_insurance_value = extract_insurance_data(soup)

    table_rows = soup.find_all('tr', class_='list-row')
    car_details = extract_car_details(table_rows, soup)

    additions_list = extract_additions_list(soup)

    # Look up each attribute by its label on the page; absent ones become ''.
    car_color = car_details.get('لون السيارة', '')
    fuel_type = car_details.get('نوع الوقود', '')
    car_origin = car_details.get('أصل السيارة', '')
    license_plate = car_details.get('رخصة السيارة', '')
    transmission_type = car_details.get('نوع الجير', '')
    glass_type = car_details.get('الزجاج', '')
    engine_power = car_details.get('قوة الماتور', '')
    mileage = car_details.get('عداد السيارة', '')
    passenger_capacity = car_details.get('عدد الركاب', '')
    payment_method = car_details.get('وسيلة الدفع', '')
    for_sale_status = car_details.get('معروضة', '')
    previous_owners = car_details.get('أصحاب سابقون', '')

    # Additions are flattened into one comma-separated cell; None when not found.
    additions_str = ', '.join(additions_list) if additions_list is not None else None

    return [
        car_name, car_model, car_price,
        additions_str,
        previous_owners, for_sale_status,
        payment_method, passenger_capacity, mileage,
        engine_power, glass_type, transmission_type,
        license_plate, car_origin, fuel_type, car_color,
        third_party_value, supplementary_value, total_insurance_value,
    ]


#### Process all text files in a folder, extract relevant information, and append to a CSV file.


In [37]:
def process_folder(folder_path, csv_file_path, csv_header):
    """
    Extract one row from every '.txt' file in a folder and append it to the CSV.

    The CSV is created with its header first if it does not exist yet. Files
    are processed in sorted name order so repeated runs produce rows in the
    same order.

    Parameters:
    - folder_path (str): The path to the folder containing text files.
    - csv_file_path (str): The path to the CSV file.
    - csv_header (list): The header to be written to the CSV file.
    """
    create_csv(csv_file_path, csv_header)

    txt_filenames = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )

    # Open the CSV once for the whole batch instead of once per input file.
    with open(csv_file_path, 'a', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for filename in txt_filenames:
            file_path = os.path.join(folder_path, filename)
            writer.writerow(process_text_file(file_path))


In [38]:
# Run the extraction: parse every .txt file in folder_path and append the
# resulting rows to csv_file_path.
process_folder(folder_path, csv_file_path, csv_header)

In [40]:
# Load the CSV written above into a DataFrame.
df = pd.read_csv(csv_file_path, encoding='utf-8')

In [41]:
# Show the first five rows.
df.head(5)

Unnamed: 0,Model,Year,Price,Additions,Previous Owners,Offered,Payment Method,Number of Passengers,Car Mileage,Engine Power,Glass,Transmission Type,Car License,Car Origin,Fuel Type,Car Color,Third Party,Comprehensive (Vehicle Body),Total Insurance
0,كيا اوبتيما,2014,100000.0,"مُكيّف, إغلاق مركزي, جهاز إنذار, مسجل CD, فتحة...",يد اولى,للبيع فقط,نقدا فقط,4+1,75000.0,2000.0,الكتروني,اوتوماتيك,فلسطينية,خصوصي,بنزين,أبيض عاجي,1340.0,1751.0,3090.0
1,معرض السيارات,فلسطين,,,,,,,,,,,,,,,,,
2,معرض السيارات,فلسطين,,,,,,,,,,,,,,,,,
3,كيا سورينتو,2007,60000.0,"مُكيّف, إغلاق مركزي, جهاز إنذار, مسجل CD, فتحة...",2,للبيع أو التبديل,إمكانية التقسيط,7+1,130000.0,2500.0,الكتروني,نصف اوتوماتيك,فلسطينية,خصوصي,ديزل,سكني,1690.0,1050.0,2740.0
4,هونداي افانتي,2006,43500.0,"مُكيّف, إغلاق مركزي, جهاز إنذار, مسجل CD, جنطا...",,للبيع فقط,نقدا فقط,,,1600.0,الكتروني,اوتوماتيك,فلسطينية,خصوصي,بنزين,سكني,1340.0,1000.0,2340.0
