# Final Project - Scraping

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import statistics

In [2]:
n_pages = 17  # Change this to the desired number of pages

base_url = "http://zarmedee.mn/zar/1201/"  # Base URL of the website

base_url_2 = "http://zarmedee.mn"

apartment_urls = []  

for page_num in range(1, n_pages + 1):
    url = f"{base_url}/p/{page_num}"

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    url_divs = soup.find_all('div', class_='col-md-9 col-xs-12')
    apartment_urls += [f"{base_url_2}{url.find('a', href=True)['href']}" for url in url_divs]

#### I wanted to work on a project from the ground up having no prepared data or no outlined work but purely on my own, 
#### thus I decided to scrape my own data from zarmedee.mn, I initially wanted to scrape from Unegui, but could not, since it was too hard with beautiful soup.

#### It was quite a challenge to get the data from the website, I had to use a dictionary to plug in the extracted data into a df

In [3]:
desired_columns = ["Дүүрэг", "Байршил", "Хотхон", "Талбай (м.кв)", "Ашиглалтад орсон",
                   "Барилгын давхар", "Хэдэн давхарт", "Цонхны тоо", "Цонх", "Хаалга",
                   "Шал", "Гараж", "Үнэ"]

# Create an empty list to store dictionaries
data_list = []

for apartment_url in apartment_urls:
    # Send an HTTP request to the apartment URL
    response = requests.get(apartment_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the <div> with id="sidebar"
    sidebar_div = soup.find('div', {'id': 'sidebar', 'class': 'hidden-xs hidden-sm'})

    if sidebar_div:
        # Find all "row" elements inside the sidebar
        label_rows = sidebar_div.find_all('div', class_='row')

        # Create a dictionary to store the extracted data for this URL
        data_dict = {}

        for row in label_rows:
            label_element = row.find('label', class_='col-sm-7 control-label')
            p_element = row.find('p', class_='form-control-static text-primary')

            if label_element and label_element.text.strip() in desired_columns:
                label = label_element.text.strip()

                if p_element:
                    text_content = p_element.text.strip()
                    data_dict[label] = text_content
        # Find the price of the apartment and extract it
        money_buttons = sidebar_div.find_all('button', class_='btn-u btn-u-red btn-u-lg btn-block rounded-2x')

        if len(money_buttons) >= 2:
            # Extract the text content from the second button
            money_text = money_buttons[1].text.strip()
            data_dict["Үнэ"] = money_text
        else:
            print(f"Second money button not found in {apartment_url}")

        # Append the data dictionary to the list
        data_list.append(data_dict)
    else:
        print(f"Sidebar div not found in {apartment_url}")

# Create the DataFrame from the list of dictionaries
df = pd.DataFrame(data_list)

# Display the DataFrame in a tabular format with aligned columns
with pd.option_context('display.colheader_justify', 'center'):
    print(df.to_string(index=False))

    Дүүрэг               Байршил                                  Хотхон                       Талбай (м.кв)  Ашиглалтад орсон Барилгын давхар Хэдэн давхарт Цонхны тоо     Цонх       Хаалга     Шал    Гараж         Үнэ        
       Хан-Уул                ХУД :: Зайсан                                      Энхжин хотхон          104м2       2017               9              6           4           Вакум      Бүргэд  Паркет Байхгүй          2.9 сая ₮
           NaN                          NaN                                                NaN            NaN        NaN             NaN            NaN         NaN             NaN         NaN     NaN     NaN         Үнэ: Асуух
       Баянгол         БГД :: 10-р хороолол                                                NaN            NaN        NaN             NaN            NaN         NaN             NaN         NaN     NaN     NaN           15 сая ₮
      Баянзүрх            БЗД :: Зүүн 4 зам                                        Парк хаус

In [4]:
df

Unnamed: 0,Дүүрэг,Байршил,Хотхон,Талбай (м.кв),Ашиглалтад орсон,Барилгын давхар,Хэдэн давхарт,Цонхны тоо,Цонх,Хаалга,Шал,Гараж,Үнэ
0,Хан-Уул,ХУД :: Зайсан,Энхжин хотхон,104м2,2017,9,6,4,Вакум,Бүргэд,Паркет,Байхгүй,2.9 сая ₮
1,,,,,,,,,,,,,Үнэ: Асуух
2,Баянгол,БГД :: 10-р хороолол,,,,,,,,,,,15 сая ₮
3,Баянзүрх,БЗД :: Зүүн 4 зам,Парк хаус-3,60,2013,12,12,2,Вакум,Бүргэд,Паркет,Байхгүй,2.8 сая ₮
4,Сүхбаатар,Бага тойруу,,,,,,,,,,,500 сая ₮
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1616,Баянзүрх,БЗД :: Цайз зах,ТОК ТАУН хотхон,54.94,2023,16,8,2,Вакум,Бүргэд,Паркет,Байгаа,1.8 сая ₮
1617,Сонгинохайрхан,СХД :: Орбит,,,,,,,,,,,400 сая ₮
1618,Сонгинохайрхан,СХД :: 21-р хороолол,38-р байр,47.19 м²,2014,9,6,3,Вакум,Бүргэд,Паркет,Байхгүй,120 сая ₮
1619,Сүхбаатар,СБД :: 100 айл,38-р байр,50,1995,5,2,3,Вакум,Бүргэд,Паркет,Байхгүй,130 сая ₮


#### There were just too many rows with nan values, which indicates that the site has low efforts in its datakeeping, but nonetheless I dropped all the nan values and was left with 547 rows, hope its good enough for a ML model

In [5]:
columns_to_delete = ["Цонх", "Хаалга", "Шал", "Хотхон", "Байршил"]

# Drop specified columns
df_cleaned = df.drop(columns=columns_to_delete)

# Drop rows with NaN values
df_cleaned = df_cleaned.dropna()

In [6]:
df = df_cleaned

In [7]:
df

Unnamed: 0,Дүүрэг,Талбай (м.кв),Ашиглалтад орсон,Барилгын давхар,Хэдэн давхарт,Цонхны тоо,Гараж,Үнэ
0,Хан-Уул,104м2,2017,9,6,4,Байхгүй,2.9 сая ₮
3,Баянзүрх,60,2013,12,12,2,Байхгүй,2.8 сая ₮
5,Сүхбаатар,10665,2023,4,4,4,Байгаа,4.2 сая ₮
12,Сүхбаатар,74,2005,5,5,4,Байхгүй,185 сая ₮
13,Баянгол,80 m2,1988,9,8,4,Байхгүй,185 сая ₮
...,...,...,...,...,...,...,...,...
1615,Баянзүрх,61.54,2017,16,14,2,Байгаа,160 сая ₮
1616,Баянзүрх,54.94,2023,16,8,2,Байгаа,1.8 сая ₮
1618,Сонгинохайрхан,47.19 м²,2014,9,6,3,Байхгүй,120 сая ₮
1619,Сүхбаатар,50,1995,5,2,3,Байхгүй,130 сая ₮


In [8]:
df['Үнэ'].unique()

array(['2.9 сая ₮', '2.8 сая ₮', '4.2 сая ₮', '185 сая ₮', '3.4 сая ₮',
       '130 сая ₮', '160 сая ₮', '295 сая ₮', '2.5 сая ₮', '200 сая ₮',
       '147 сая ₮', '3.1 сая ₮', '115 сая ₮', '170 сая ₮', '105 сая ₮',
       '85 сая ₮', '150 сая ₮', '360 сая ₮', '2.3 сая ₮', '95 сая ₮',
       '910 сая ₮', '388 сая ₮', '93 сая ₮', '122 сая ₮', '3.2 сая ₮',
       '180 сая ₮', '385 сая ₮', '4 сая ₮', '75 сая ₮', '110 сая ₮',
       '80 сая ₮', '56 сая ₮', '76 сая ₮', '2.6 сая ₮', '155 сая ₮',
       '136 сая ₮', '135 сая ₮', '220 сая ₮', '118 сая ₮', '140 сая ₮',
       '100 сая ₮', '154 сая ₮', '3 сая ₮', '210 сая ₮', '263 сая ₮',
       '125 сая ₮', '175 сая ₮', '0.5 сая ₮', '97 сая ₮', '225 сая ₮',
       '138 сая ₮', '98 сая ₮', '4.5 сая ₮', '339 сая ₮', '3.8 сая ₮',
       '4.1 сая ₮', '190 сая ₮', '87 сая ₮', '94 сая ₮', '217 сая ₮',
       '1.4 сая ₮', '112 сая ₮', '224 сая ₮', '1.9 сая ₮', '96 сая ₮',
       '2.7 сая ₮', '4.6 сая ₮', '120 сая ₮', '3.3 сая ₮', '240 сая ₮',
       '

In [9]:
df['Үнэ'] = df['Үнэ'].str.replace(' сая ₮', '')  # Remove the unwanted text
df['Үнэ'] = pd.to_numeric(df['Үнэ'], errors='coerce')  # Convert to numbers

# Convert to millions
df['Үнэ'] = df['Үнэ'] * 1000000

In [10]:
df['Талбай (м.кв)'] = df['Талбай (м.кв)'].astype(str)  # Convert the column to string type

# Use regular expressions to extract only the numeric part
df['Талбай (м.кв)'] = df['Талбай (м.кв)'].str.extract(r'(\d+)', expand=False)

# Convert the extracted numeric part to numeric type
df['Талбай (м.кв)'] = pd.to_numeric(df['Талбай (м.кв)'], errors='coerce')

In [11]:
df['Талбай (м.кв)'].isnull().sum()

0

In [12]:
df = df.dropna()

In [13]:
columns_to_convert = ['Барилгын давхар', 'Хэдэн давхарт', 'Цонхны тоо','Ашиглалтад орсон']

# Convert each specified column to numeric
df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

df['Ашиглалтад орсон'] = 2023 - df['Ашиглалтад орсон']
df.rename(columns={'Үнэ':'price','Дүүрэг':'district','Талбай (м.кв)':'area_m2','Ашиглалтад орсон':'age_building','Барилгын давхар':'building_floor','Хэдэн давхарт':'apartment_floor','Цонхны тоо':'number_windows','Гараж':'garage'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ашиглалтад орсон'] = 2023 - df['Ашиглалтад орсон']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'Үнэ':'price','Дүүрэг':'district','Талбай (м.кв)':'area_m2','Ашиглалтад орсон':'age_building','Барилгы

#### Made some changes to the data, converting the strings to integers and such, also changing column names for the ML model work

In [14]:
df

Unnamed: 0,district,area_m2,age_building,building_floor,apartment_floor,number_windows,garage,price
0,Хан-Уул,104,6,9,6,4.0,Байхгүй,2900000.0
3,Баянзүрх,60,10,12,12,2.0,Байхгүй,2800000.0
5,Сүхбаатар,106,0,4,4,4.0,Байгаа,4200000.0
12,Сүхбаатар,74,18,5,5,4.0,Байхгүй,185000000.0
13,Баянгол,80,35,9,8,4.0,Байхгүй,185000000.0
...,...,...,...,...,...,...,...,...
1615,Баянзүрх,61,6,16,14,2.0,Байгаа,160000000.0
1616,Баянзүрх,54,0,16,8,2.0,Байгаа,1800000.0
1618,Сонгинохайрхан,47,9,9,6,3.0,Байхгүй,120000000.0
1619,Сүхбаатар,50,28,5,2,3.0,Байхгүй,130000000.0


In [15]:
df.isnull().sum()

district           0
area_m2            0
age_building       0
building_floor     0
apartment_floor    0
number_windows     7
garage             0
price              0
dtype: int64

In [16]:
df = df.dropna()

In [17]:
#Turn the df into a csv file
df.to_csv('apartment_data.csv', index=False)