In [2]:
# Dependencies
import os              # For environment variables and portable file paths
import time            # For handling time-related functionalities

import numpy as np     # For numerical operations
import pandas as pd    # For data manipulation and analysis
import requests        # For making HTTP requests
from bs4 import BeautifulSoup  # For web scraping


In [3]:
# One independent accumulator list per scraped field; the scraping loop appends to these.
(
    cost_list,
    title_list,
    describe_list,
    location_list,
    bedrooms_list,
    bathrooms_list,
    furnished_list,
    space_list,
) = ([] for _ in range(8))


This code is a web scraping script designed to extract information about houses and apartments for sale from the website 'jiji.co.ke'. The script iterates over a range of page numbers to navigate through different pages of the website. For each page, it constructs the URL, sends an HTTP GET request with specific headers, and retrieves the HTML content of the page.

The script then uses the BeautifulSoup library to parse the HTML and extract information such as the cost, title, description, location, and various attributes of each house or apartment listing. The extracted information is stored in separate lists (cost_list, title_list, describe_list, location_list, bedrooms_list, bathrooms_list, furnished_list, space_list), with each list corresponding to a specific piece of information.

Listings do not all provide the same number of attributes, so the script counts the attribute entries found for each listing. The entries are assigned by position to the bedrooms, bathrooms, furnished, and space lists, and any position a listing does not provide is filled with the placeholder string 'nan'.

If the HTTP request is successful (status code 200), the script extracts the relevant information from the page's HTML content. If there's an issue with the request (e.g., non-200 status code), the script pauses execution for 10 seconds before continuing, likely to avoid overloading the server or to handle potential network issues.

It's important to note that web scraping may be subject to the terms of service of the website, and the script's behavior should comply with those terms. Additionally, the structure of the website may change over time, requiring updates to the script.

In [4]:
# --- Scraper settings (edit these rather than the loop body) ---
BASE_URL = 'https://jiji.co.ke/houses-apartments-for-sale'
FIRST_PAGE = 1
LAST_PAGE = 1499       # Update the range as needed
REQUEST_TIMEOUT = 30   # seconds; requests waits indefinitely when no timeout is given
MAX_ATTEMPTS = 3       # tries per page before the page is skipped
RETRY_DELAY = 10       # seconds to pause before retrying a failed page
MISSING = 'nan'        # placeholder stored when a listing lacks an attribute
ATTR_COLUMNS = 4       # attribute slots per listing: bedrooms, bathrooms, furnished, space

# The request headers are the same for every page, so they are built once.
headers = {
    'authority': 'jiji.co.ke',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://jiji.co.ke/parklands-highridge/houses-apartments-for-sale/4bdrm-apartment-in-parklands-for-sale-2JRMt2oYHwjInSVJRep6aWcC.html?page=1&pos=12&cur_pos=12&ads_per_page=20&ads_count=8205&lid=AlhDhjmUMMScdgij&indexPosition=11',
    'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}

# The session cookie and CSRF token belong to one browser session and should not be
# stored in the notebook. They are sent only when supplied through the environment.
# NOTE(review): confirm the listing pages still load without them.
for header_name, env_name in (('cookie', 'JIJI_COOKIE'), ('x-csrf-token', 'JIJI_CSRF_TOKEN')):
    if os.environ.get(env_name):
        headers[header_name] = os.environ[env_name]


def _fetch_listing_page(page_number):
    """Return the HTML bytes of one results page, or None if every attempt fails."""
    url = f'{BASE_URL}?page={page_number}'
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # Connection errors and timeouts are retried instead of aborting the whole run
            print(f'page {page_number}: attempt {attempt} failed ({error})')
        else:
            if response.status_code == 200:
                return response.content
            print(f'page {page_number}: attempt {attempt} returned HTTP {response.status_code}')
        if attempt < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY)
    return None


def _texts(elements):
    """Return the stripped text of each element."""
    return [element.text.strip() for element in elements]


def _attr_row(attrs):
    """Return exactly ATTR_COLUMNS attribute texts, padded with MISSING or truncated."""
    values = [attr.get_text(strip=True) for attr in attrs][:ATTR_COLUMNS]
    return values + [MISSING] * (ATTR_COLUMNS - len(values))


for page in range(FIRST_PAGE, LAST_PAGE + 1):
    print(page)

    html = _fetch_listing_page(page)
    if html is None:
        print(f'page {page}: skipped after {MAX_ATTEMPTS} failed attempts')
        continue

    # Parse the HTML content of the page
    soup = BeautifulSoup(html, "html.parser")

    # Extract each field into page-local lists first so their sizes can be compared
    page_costs = _texts(soup.find_all('div', class_="qa-advert-price"))
    page_titles = _texts(soup.find_all('div', class_="b-advert-title-inner qa-advert-title b-advert-title-inner--div"))
    page_descriptions = _texts(soup.find_all('div', class_="b-list-advert-base__description-text"))
    page_locations = _texts(soup.find_all('span', class_='b-list-advert__region__text'))

    # NOTE(review): attributes are assigned by position (bedrooms, bathrooms, furnished,
    # space). A listing that omits an earlier attribute would have its remaining values
    # shifted into the wrong columns -- confirm against the page markup.
    page_attr_rows = []
    for attr_box in soup.find_all('div', class_='b-list-advert-base__attrs'):
        attrs = attr_box.find_all('div', class_='b-list-advert-base__item-attr')
        if not 1 <= len(attrs) <= ATTR_COLUMNS:
            # Unexpected layout: report it, but still record a row so that every
            # attribute block contributes exactly one entry to each attribute list.
            print(page, attrs)
        page_attr_rows.append(_attr_row(attrs))

    # The fields are collected independently; unequal counts mean the rows of the
    # final table will not line up for this page.
    field_counts = {
        'cost': len(page_costs),
        'title': len(page_titles),
        'description': len(page_descriptions),
        'location': len(page_locations),
        'attributes': len(page_attr_rows),
    }
    if len(set(field_counts.values())) > 1:
        print(f'page {page}: field counts differ {field_counts}; rows from this page may be misaligned')

    cost_list.extend(page_costs)
    title_list.extend(page_titles)
    describe_list.extend(page_descriptions)
    location_list.extend(page_locations)
    for bedrooms, bathrooms, furnished, space in page_attr_rows:
        bedrooms_list.append(bedrooms)
        bathrooms_list.append(bathrooms)
        furnished_list.append(furnished)
        space_list.append(space)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148


Explanation:

data: It's a Python dictionary where each key represents a column name, and the corresponding value is a pandas Series (a one-dimensional labeled array) containing the data for that column.

Keys and Values:

'Cost', 'Title', 'Description', ..., 'Space': These are the column names of the DataFrame.
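pd.Series(cost_list), pd.Series(title_list), ..., pd.Series(space_list): These are pandas Series built from the scraped lists, one per column. Wrapping each list in a Series lets lists of different lengths be combined; the shorter columns are padded with NaN at the end.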
pd.DataFrame(data): This line creates a DataFrame by passing the dictionary data to the pd.DataFrame() constructor. Each key in the dictionary becomes a column in the DataFrame, and the corresponding values become the data in those columns.

df: This is the resulting DataFrame, containing columns such as 'Cost', 'Title', 'Description', etc., with the data provided in the respective lists.

In summary, this code snippet is a convenient way to organize lists of data into a structured tabular format, creating a pandas DataFrame for further analysis or manipulation.

In [None]:
# Create a DataFrame
# Each scraped list becomes one column. The lists are filled independently, so they
# can end up with different lengths; wrapping them in pd.Series lets pandas pad the
# shorter ones with NaN at the bottom instead of raising an error.
scraped_columns = {
    'Cost': cost_list,
    'Title': title_list,
    'Description': describe_list,
    'Location': location_list,
    'Bedrooms': bedrooms_list,
    'Bathrooms': bathrooms_list,
    'Furnished': furnished_list,
    'Space': space_list,
}

# NaN padding hides a length mismatch, after which the values in one row may come
# from different listings. Report the sizes so the problem is visible before cleaning.
column_lengths = {name: len(values) for name, values in scraped_columns.items()}
if len(set(column_lengths.values())) > 1:
    print(f'Warning: columns have different lengths {column_lengths}; rows may be misaligned')

data = {name: pd.Series(values) for name, values in scraped_columns.items()}

df = pd.DataFrame(data)

df

NameError: name 'pd' is not defined

In [None]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,Cost,Title,Description,Location,Bedrooms,Bathrooms,Furnished,Space
0,"KSh 7,800,000",2bdrm Apartment in Kileleshwa for sale,"Newly built two bedroom apartment for sale, Lo...","Nairobi, Kileleshwa",2 bedrooms,2 bathrooms,Unfurnished,98sqm
1,"KSh 12,000,000",2bdrm Apartment in Valley Arcade for sale,FEATURES AND AMENITIES\n● Fully equipped gym\n...,"Nairobi, Lavington",2 bedrooms,2 bathrooms,Unfurnished,133sqm
2,"KSh 15,200,000","3bdrm Apartment in Urban Oasis, Maziwa for sale",You will instantly love being part of this lux...,"Nairobi, Lavington",3 bedrooms,3 bathrooms,Unfurnished,140sqm
3,"KSh 6,000,000",2bdrm Apartment in Mtwapa Gardens for sale,Spacious Two bedroom for sale,"Kilifi, Mtwapa",2 bedrooms,1 bathroom,Unfurnished,90sqm
4,"KSh 7,800,000","3bdrm Bungalow in Matangi, Ruiru for sale",Residential... \nThis is a 3 bedroom all ensui...,"Kiambu, Ruiru",3 bedrooms,4 bathrooms,Unfurnished,2400sqm
...,...,...,...,...,...,...,...,...
9966,"KSh 50,000,000","5bdrm Maisonette in Garden Estate, Thome for sale",5 Bedrooms House (all ensuite) Sitting on a 0....,"Nairobi, Thome / Nairobi",,,,
9967,"KSh 80,000,000",5bdrm Maisonette in Karen for sale,A 5 Bedrooms House sitting on a 1.1 Acre Plot ...,"Nairobi, Karen",,,,
9968,"KSh 16,500,000",Furnished 4bdrm Villa in Malindi for sale,SPECIAL OFFER For sale 4bdm villa in Malindi s...,"Kilifi, Malindi",,,,
9969,"KSh 11,000,000",3bdrm Apartment in Nyali Mkomani for sale,Home in the heart of Nyali!\n- 2 min Walking d...,"Mombasa, Nyali",,,,


In [None]:
def clean_dataframe(df, missing_markers=('nan',)):
    """Return a copy of ``df`` without incomplete or duplicated rows.

    The scraper records an absent attribute as the literal string ``'nan'``,
    which pandas does not regard as null, so those placeholders are converted
    to real missing values before ``dropna`` runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.
    missing_markers : iterable, default ``('nan',)``
        Cell values to treat as missing. Pass ``()`` to drop only genuine nulls.

    Returns
    -------
    pandas.DataFrame
        Rows with no missing value in any column, de-duplicated, with a fresh
        0..n-1 index.
    """
    # Turn placeholder values into NaN so that dropna below catches them
    placeholder_cells = df.isin(list(missing_markers))
    df_cleaned = df.mask(placeholder_cells)

    # Drop rows with null values
    df_cleaned = df_cleaned.dropna()

    # Drop duplicate rows
    df_cleaned = df_cleaned.drop_duplicates()

    # Reset the index after dropping rows
    df_cleaned = df_cleaned.reset_index(drop=True)

    return df_cleaned

In [None]:
# Run the scraped DataFrame through clean_dataframe and keep the result under a new name
cleaned_df = clean_dataframe(df)

In [None]:
# Show the (rows, columns) size of the cleaned DataFrame
cleaned_df.shape

In [None]:
# Save the cleaned data. The path is built with os.path.join because a literal
# backslash is a path separator only on Windows, and the output folder is created
# first because to_csv does not create missing directories.
output_dir = 'DATA'
os.makedirs(output_dir, exist_ok=True)
cleaned_df.to_csv(os.path.join(output_dir, 'finalData.csv'), index=False)