In [2]:
# Build a service that takes information about a car and returns the predicted price for that car.
import os
import re

import pandas as pd
from bs4 import BeautifulSoup

# Directory holding the HTML files that the main loop parses (one car record per file).
path = "./content/data"
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the extracted table (and the CSV written from it) reproducible across runs.
docs = sorted(os.listdir(path))
# Column order of the feature table; it matches the list returned by get_data.
df_cols = ['name', 'model', 'year', 'price', 'color', 'fuelType', 'carOrigin', 'carInsurance', 'gearType', 'mirrorType', 'motorPower', 'drivenKm', 'passengers', 'paymentMethod', 'saleType', 'secondHandStatus']

In [304]:
# Extract the data files from the tar archive. One-time setup: already done, so the code below stays commented out.
# import tarfile

# tar = tarfile.open("./data.tar.gz")
# tar.extractall()
# tar.close()

In [3]:
def convert_arabic(text):
  """Return `text` with each Arabic-Indic digit replaced by its ASCII digit.

  Every other character is passed through unchanged.
  """
  digit_map = {
    '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
    '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
  }
  # Look every character up in the table, falling back to the character itself.
  return ''.join(digit_map.get(ch, ch) for ch in text)

def convert_passengers(text):
  """Parse the passengers cell into a total seat count.

  The text is normalised with `convert_arabic` and stripped of spaces, then:
    * a single number ("5", "10") is returned as that number;
    * when several numbers are present ("4+1"), the first two are added;
    * text without any digit yields None.

  Args:
    text: raw cell text.

  Returns:
    int, or None when no number can be read.
  """
  cont = convert_arabic(text).replace(" ", "")
  # Read whole digit runs rather than single characters at fixed positions,
  # so multi-digit values such as "10" are not cut down to their first digit.
  numbers = [int(run) for run in re.findall(r"\d+", cont)]
  if not numbers:
    # Nothing numeric: report a missing value instead of raising, which would
    # make the caller discard the whole record.
    return None
  if len(numbers) == 1:
    return numbers[0]
  return numbers[0] + numbers[1]

def convert_drivenKm(text, motorPower):
  """Parse the mileage cell into kilometres, or None when it is not credible.

  Args:
    text: raw cell text; digits are normalised with `convert_arabic`.
    motorPower: the car's motor-power value, as an int or as cell text.
      A mileage equal to it is rejected.

  Returns:
    int kilometres, or None.

  Rules:
    * A bare integer is used directly.
    * Otherwise the text must contain one of the markers "لف", "كم", "km"
      or ","; its leading number is read with "," separators removed, and
      scaled by 1000 when the "لف" marker is present and the number is
      below 1000.
    * Values equal to motorPower, not positive, or above 1,000,000 give None.
    * Values below 100 are multiplied by 1000.
  """
  data = convert_arabic(text)

  # get_data passes motorPower as cell text, so it has to be turned into an
  # int before it can be compared with the parsed mileage.
  try:
    power = int(convert_arabic(str(motorPower)).replace(",", ""))
  except ValueError:
    power = None

  try:
    num = int(data)
  except ValueError:
    if not any(marker in data for marker in ("لف", "كم", "km", ",")):
      return None
    # Take the whole leading number (e.g. "120,000" -> 120000), not just its
    # first character.
    leading = re.match(r"\s*(\d[\d,]*)", data)
    if leading is None:
      return None
    num = int(leading.group(1).replace(",", ""))
    if "لف" in data and num < 1000:
      num *= 1000

  if num == power or num <= 0 or num > 1000000:
    return None
  if num < 100:
    return num * 1000
  return num

def second_hand_status(data):
  """Map the second-hand-status cell to a rank from 1 to 10, or None.

  The text is normalised with `convert_arabic` and matched by substring
  against Arabic words and digits, in this order:
    * 1  - contains "ول" or "صفر", or contains "0" or "1" but not "10";
    * 2-9 - contains the matching ordinal word or digit;
    * 10 - contains "عاشر" or "10".

  Args:
    data: raw cell text.

  Returns:
    int rank, or None when nothing matches.
  """
  text = convert_arabic(data)

  # "10" contains both "1" and "0", so it must be excluded from BOTH digit
  # checks; otherwise the tenth-hand rule below can never match a numeric "10".
  has_ten = "10" in text
  if "ول" in text or "صفر" in text or (("0" in text or "1" in text) and not has_ten):
    return 1

  # (ordinal word, digit, rank), checked in ascending order.
  ordinals = (
    ("ثاني", "2", 2),
    ("ثالث", "3", 3),
    ("رابع", "4", 4),
    ("خامس", "5", 5),
    ("سادس", "6", 6),
    ("سابع", "7", 7),
    ("ثامن", "8", 8),
    ("تاسع", "9", 9),
    ("عاشر", "10", 10),
  )
  for word, digit, rank in ordinals:
    if word in text or digit in text:
      return rank
  return None

def payment_method(data):
  """Return the payment-method text unchanged, or None when it is rejected.

  Text shorter than 6 characters, or containing "دفع" or "كثر", is rejected.
  """
  too_short = len(data) < 6
  if too_short:
    return None
  for marker in ("دفع", "كثر"):
    if marker in data:
      return None
  return data

def get_data(cont):
  """Extract one car record from a parsed listing page.

  Args:
    cont: parsed page (the main loop passes a BeautifulSoup document) holding
      a table with class "driving-table" and a table with class "list_ads".

  Returns:
    A list of 16 values in `df_cols` order (name, model, year, price, color,
    fuelType, carOrigin, carInsurance, gearType, mirrorType, motorPower,
    drivenKm, passengers, paymentMethod, saleType, secondHandStatus), or None
    when the page cannot be read with the expected layout.
  """
  def cell_text(row):
    # The value of a "list_ads" row is taken from its second <td>.
    return row.find_all("td")[1].text

  try:
    header_row = cont.find("table", {"class": "driving-table"}).tr
    spec_rows = cont.find("table", {"class": "list_ads"}).find_all("tr")

    title_words = header_row.td.h3.text.split(" ")
    name = title_words[0]
    model = title_words[-1]
    year = header_row.td.h5.text.split(" ")[2]
    price = header_row.find_all("td")[1].h5.text.split("\n")[0].split(" ")[0]

    # Rows 1-7 are read at fixed positions, with spaces stripped from each value.
    (color, fuelType, carOrigin, carInsurance,
     gearType, mirrorType, motorPower) = [
      cell_text(spec_rows[i]).replace(" ", "") for i in range(1, 8)
    ]

    # A 13-row table is read with the remaining indexes shifted down by one.
    # NOTE(review): with the shift, drivenKm is read from row 7, the same row
    # as motorPower; presumably that is why motorPower is handed to
    # convert_drivenKm - confirm against the page layout.
    shift = -1 if len(spec_rows) == 13 else 0
    drivenKm = convert_drivenKm(cell_text(spec_rows[8 + shift]), motorPower)
    passengers = convert_passengers(cell_text(spec_rows[9 + shift]))
    paymentMethod = payment_method(cell_text(spec_rows[10 + shift]))
    saleType = cell_text(spec_rows[11 + shift])
    secondHandStatus = second_hand_status(cell_text(spec_rows[12 + shift]))

    return [
      name, model, year, price, color, fuelType, carOrigin,
      carInsurance, gearType, mirrorType, motorPower, drivenKm,
      passengers, paymentMethod, saleType, secondHandStatus
    ]
  except Exception:
    # Best effort: a page that does not match the expected layout is skipped.
    # Catching Exception (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate so a long extraction run can still be stopped.
    return None

In [None]:
## Main Code ##
## Extract the features from every HTML file under `path` and save them as CSV.
cars = set()  # not read anywhere in this cell; kept in case other cells rely on it

# Collect one record per successfully parsed page and build the frame once;
# concatenating inside the loop copies the whole frame on every iteration.
records = []
for doc in docs:
  # The with-block closes the file, so no explicit close() is needed.
  with open(os.path.join(path, doc), 'r', encoding="utf8") as file:
    cont = BeautifulSoup(file.read(), 'html.parser')
  car = get_data(cont)  # None when the page could not be parsed
  if car is not None:
    records.append(car)

df = pd.DataFrame(records, columns=df_cols).astype({
  # Parsed counts can be missing; the nullable integer dtype keeps them as
  # integers (not floats) in the frame and in the CSV.
  "drivenKm": "Int64",
  "passengers": "Int64",
  "secondHandStatus": "Int64",
})
df.to_csv('./content/cars_v1.csv', index=False)

# Last expression of the cell, so the notebook renders a preview.
df.head()

In [None]:
# Conduct an exploratory data analysis (EDA) of the data
# Conduct data cleaning and transformation if needed


In [None]:
# - conduct feature engineering
# - select the best model for the problem between Polynomial Regression, kNN, and Decision Tree

In [None]:
# - create a microservice that predicts the car price given certain input data

# - the code should be shared in a private GitHub repo