# Libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import csv
import re

# Naheed Scraping

In [35]:
naheed_data= []
for i in range(1,6):
    
    url = "https://www.naheed.pk/phones-tablets/smartphones?p="+str(i)
    r = requests.get(url)

    soup = BeautifulSoup(r.text,"lxml")
    
    names = soup.find_all("a",class_ = "product-item-link")
    titles = [a['title'] for a in names]
    prices = soup.find_all("span", class_="price")
    
    for name, price in zip(names, prices):
        product_name = name['title']
        price_text = price.text
        print(f"Product Name: {product_name}")
        print(f"Product Price: {price_text}")
        naheed_data.append({"Product Name": product_name, "Product Price": price_text})


Product Name: Xiaomi Redmi Note 12S 8GB/256GB Smart Phone, Onyx Black
Product Price: Rs. 74,999.00
Product Name: Samsung Galaxy A34 5G 8/256GB Smart Phone, Awesome Silver
Product Price: Rs. 114,999.00
Product Name: Samsung Galaxy A24 8/128GB, Light Green, Mobile Set
Product Price: Rs. 79,999.00
Product Name: Xiaomi Redmi Note 12 8/128GB Smart Phone, Ice Blue
Product Price: Rs. 54,999.00
Product Name: Samsung Galaxy A34 5G 8/256GB Smart Phone, Awesome Graphite
Product Price: Rs. 114,999.00
Product Name: Samsung Galaxy A54 5G 8/256GB, Awesome White, Mobile Set
Product Price: Rs. 140,999.00
Product Name: Xiaomi Redmi Note 12 8/128GB, Onyx Gray, Mobile Set
Product Price: Rs. 54,999.00
Product Name: Xiaomi Redmi 12C 4/128GB, Graphite Gray, Mobile Set
Product Price: Rs. 27,999.00
Product Name: Samsung Galaxy A04S 4/128GB Smart Phone, Copper
Product Price: Rs. 43,999.00
Product Name: Xiaomi Redmi 12C 4/128GB, Ocean Blue, Mobile Set
Product Price: Rs. 27,999.00
Product Name: Xiaomi Redmi 12C 4

# Saving Naheed Data in excel

In [36]:
df_n = pd.DataFrame(naheed_data, columns = ["Product Name", "Product Price"])
print(df_n)

                                         Product Name   Product Price
0   Xiaomi Redmi Note 12S 8GB/256GB Smart Phone, O...   Rs. 74,999.00
1   Samsung Galaxy A34 5G 8/256GB Smart Phone, Awe...  Rs. 114,999.00
2   Samsung Galaxy A24 8/128GB, Light Green, Mobil...   Rs. 79,999.00
3   Xiaomi Redmi Note 12 8/128GB Smart Phone, Ice ...   Rs. 54,999.00
4   Samsung Galaxy A34 5G 8/256GB Smart Phone, Awe...  Rs. 114,999.00
..                                                ...             ...
90  Samsung Galaxy Z Flip 4 8GB/256GB Smartphone, ...  Rs. 333,000.00
91  Samsung Galaxy S22 Ultra 12GB/256GB Smartphone...  Rs. 425,000.00
92  Samsung Galaxy Z Flip 4 8GB/512GB Smartphone, ...  Rs. 367,000.00
93  Samsung Galaxy Z Flip 4 8GB/256GB Smartphone, ...  Rs. 333,000.00
94  Samsung Galaxy Z Fold 4 12GB/512GB Smartphone,...  Rs. 575,000.00

[95 rows x 2 columns]


In [37]:
writer_n = pd.ExcelWriter('naheed_data.xlsx')
df_n.to_excel(writer_n,'Sheet1')
writer_n.save()

# Naheed Data Cleaning

In [5]:
df_n.info

<bound method DataFrame.info of                                          Product Name   Product Price
0   Xiaomi Redmi Note 12S 8GB/256GB Smart Phone, O...   Rs. 74,999.00
1   Samsung Galaxy A34 5G 8/256GB Smart Phone, Awe...  Rs. 114,999.00
2   Samsung Galaxy A24 8/128GB, Light Green, Mobil...   Rs. 79,999.00
3   Xiaomi Redmi Note 12 8/128GB Smart Phone, Ice ...   Rs. 54,999.00
4   Samsung Galaxy A34 5G 8/256GB Smart Phone, Awe...  Rs. 114,999.00
..                                                ...             ...
90  Samsung Galaxy Z Flip 4 8GB/256GB Smartphone, ...  Rs. 333,000.00
91  Samsung Galaxy S22 Ultra 12GB/256GB Smartphone...  Rs. 425,000.00
92  Samsung Galaxy Z Flip 4 8GB/512GB Smartphone, ...  Rs. 367,000.00
93  Samsung Galaxy Z Flip 4 8GB/256GB Smartphone, ...  Rs. 333,000.00
94  Samsung Galaxy Z Fold 4 12GB/512GB Smartphone,...  Rs. 575,000.00

[95 rows x 2 columns]>

In [6]:
df_n.head()

Unnamed: 0,Product Name,Product Price
0,"Xiaomi Redmi Note 12S 8GB/256GB Smart Phone, O...","Rs. 74,999.00"
1,"Samsung Galaxy A34 5G 8/256GB Smart Phone, Awe...","Rs. 114,999.00"
2,"Samsung Galaxy A24 8/128GB, Light Green, Mobil...","Rs. 79,999.00"
3,"Xiaomi Redmi Note 12 8/128GB Smart Phone, Ice ...","Rs. 54,999.00"
4,"Samsung Galaxy A34 5G 8/256GB Smart Phone, Awe...","Rs. 114,999.00"


In [7]:
df_n["Product Price"].dtype

dtype('O')

In [8]:
column_name = "Product Price"
df_n[column_name] = df_n[column_name].str.replace('Rs.', '').str.replace(',', '', regex=True).astype(float)

  df_n[column_name] = df_n[column_name].str.replace('Rs.', '').str.replace(',', '', regex=True).astype(float)


In [9]:
df_n["Product Price"].dtype

dtype('float64')

In [10]:
df_n.head()

Unnamed: 0,Product Name,Product Price
0,"Xiaomi Redmi Note 12S 8GB/256GB Smart Phone, O...",74999.0
1,"Samsung Galaxy A34 5G 8/256GB Smart Phone, Awe...",114999.0
2,"Samsung Galaxy A24 8/128GB, Light Green, Mobil...",79999.0
3,"Xiaomi Redmi Note 12 8/128GB Smart Phone, Ice ...",54999.0
4,"Samsung Galaxy A34 5G 8/256GB Smart Phone, Awe...",114999.0


# Function for product name

In [11]:
data = df_n['Product Name']

# Define a function to apply the regex transformation
def clean_text(text):
    text = text.replace("GB/", "/")
    processed_str = re.sub(r'[^a-zA-Z0-9/ ]', '', text)
    processed_str = processed_str.replace('/','GB ')
    # Convert to lowercase
    processed_str = processed_str.lower()
    gb_indices = [i for i in range(len(processed_str)) if processed_str.startswith('gb', i)]
    if len(gb_indices) >= 2:
        second_gb_index = gb_indices[1]
        processed_str = processed_str[:second_gb_index + 2]
        
    # Remove extra spaces
    processed_str = ' '.join(processed_str.split())

    return processed_str
        

# Apply the function to the entire DataFrame
df_n['Product Name'] = data.apply(clean_text)

# modified DataFrame
print(df_n)

                           Product Name  Product Price
0       xiaomi redmi note 12s 8gb 256gb        74999.0
1       samsung galaxy a34 5g 8gb 256gb       114999.0
2          samsung galaxy a24 8gb 128gb        79999.0
3        xiaomi redmi note 12 8gb 128gb        54999.0
4       samsung galaxy a34 5g 8gb 256gb       114999.0
..                                  ...            ...
90    samsung galaxy z flip 4 8gb 256gb       333000.0
91  samsung galaxy s22 ultra 12gb 256gb       425000.0
92    samsung galaxy z flip 4 8gb 512gb       367000.0
93    samsung galaxy z flip 4 8gb 256gb       333000.0
94   samsung galaxy z fold 4 12gb 512gb       575000.0

[95 rows x 2 columns]


In [12]:
df_n.head()

Unnamed: 0,Product Name,Product Price
0,xiaomi redmi note 12s 8gb 256gb,74999.0
1,samsung galaxy a34 5g 8gb 256gb,114999.0
2,samsung galaxy a24 8gb 128gb,79999.0
3,xiaomi redmi note 12 8gb 128gb,54999.0
4,samsung galaxy a34 5g 8gb 256gb,114999.0


In [13]:
df_n.tail()

Unnamed: 0,Product Name,Product Price
90,samsung galaxy z flip 4 8gb 256gb,333000.0
91,samsung galaxy s22 ultra 12gb 256gb,425000.0
92,samsung galaxy z flip 4 8gb 512gb,367000.0
93,samsung galaxy z flip 4 8gb 256gb,333000.0
94,samsung galaxy z fold 4 12gb 512gb,575000.0


# Shophive Scraping

In [14]:
shophive_data = []
for i in range(1,3):
    url = "https://www.shophive.com/mobile-phones?p="+str(i)
    r = requests.get(url)

    soup = BeautifulSoup(r.text,"lxml")
    
    names = soup.find_all("a",class_ = "product-item-link")
    titles = [a['title'] for a in names]
    prices = soup.find_all("span", class_="price")
    
    for name, price in zip(names, prices):
        product_name_s = name['title']
        price_text_s = price.text
        print(f"Product Name: {product_name_s}")
        print(f"Product Price: {price_text_s}")
        shophive_data.append({"Product Name": product_name_s, "Product Price": price_text_s})


Product Name: Infinix Note 30 VIP 8GB 256GB
Product Price: Rs 33,999.00
Product Name: iTel S23 8GB 256GB
Product Price: Rs 29,999.00
Product Name: iTel S23 8GB 128GB
Product Price: Rs 30,999.00
Product Name: ZTE Blade A51 2GB 64GB
Product Price: Rs 21,999.00
Product Name: Honor X6 4GB 64GB
Product Price: Rs 34,999.00
Product Name: Sparx S6 2GB 32GB
Product Price: Rs 40,999.00
Product Name: Tecno Spark 10c 4GB 128GB
Product Price: Rs 16,299.00
Product Name: Infinix Smart 7 4GB 64GB
Product Price: Rs 29,999.00
Product Name: iTel A26 2GB 32GB
Product Price: Rs 25,999.00
Product Name: Sparx Neo 7 3GB 64GB
Product Price: Rs 22,499.00
Product Name: Sparx Neo 5 2GB 32GB
Product Price: Rs 24,499.00
Product Name: VNUS Sky 1 2GB 32GB
Product Price: Rs 21,499.00
Product Name: Digit Glory 1 2GB 32GB
Product Price: Rs 23,499.00
Product Name: VNUS SE22 3GB 64GB
Product Price: Rs 16,999.00
Product Name: Itel A60S 4GB 128GB
Product Price: Rs 17,499.00
Product Name: iTel It9010
Product Price: Rs 27,999

# Saving in csv

In [15]:
dfs = pd.DataFrame(shophive_data, columns = ["Product Name", "Product Price"])
print(dfs)

                     Product Name  Product Price
0   Infinix Note 30 VIP 8GB 256GB   Rs 33,999.00
1              iTel S23 8GB 256GB   Rs 29,999.00
2              iTel S23 8GB 128GB   Rs 30,999.00
3          ZTE Blade A51 2GB 64GB   Rs 21,999.00
4               Honor X6 4GB 64GB   Rs 34,999.00
5               Sparx S6 2GB 32GB   Rs 40,999.00
6       Tecno Spark 10c 4GB 128GB   Rs 16,299.00
7        Infinix Smart 7 4GB 64GB   Rs 29,999.00
8               iTel A26 2GB 32GB   Rs 25,999.00
9            Sparx Neo 7 3GB 64GB   Rs 22,499.00
10           Sparx Neo 5 2GB 32GB   Rs 24,499.00
11            VNUS Sky 1 2GB 32GB   Rs 21,499.00
12         Digit Glory 1 2GB 32GB   Rs 23,499.00
13             VNUS SE22 3GB 64GB   Rs 16,999.00
14            Itel A60S 4GB 128GB   Rs 17,499.00
15                    iTel It9010   Rs 27,999.00
16                 iTel Value 110    Rs 2,999.00
17   Oneplus 10 Pro 5G 12GB 256GB  Rs 189,999.00
18   Oneplus 10 Pro 5G 12GB 512GB  Rs 198,999.00
19       Google Pixe

In [16]:
writers = pd.ExcelWriter('shophive_data.xlsx')
dfs.to_excel(writers,'Sheet1')
writers.save()

# Shophive Data cleaning

In [17]:
dfs.info

<bound method DataFrame.info of                      Product Name  Product Price
0   Infinix Note 30 VIP 8GB 256GB   Rs 33,999.00
1              iTel S23 8GB 256GB   Rs 29,999.00
2              iTel S23 8GB 128GB   Rs 30,999.00
3          ZTE Blade A51 2GB 64GB   Rs 21,999.00
4               Honor X6 4GB 64GB   Rs 34,999.00
5               Sparx S6 2GB 32GB   Rs 40,999.00
6       Tecno Spark 10c 4GB 128GB   Rs 16,299.00
7        Infinix Smart 7 4GB 64GB   Rs 29,999.00
8               iTel A26 2GB 32GB   Rs 25,999.00
9            Sparx Neo 7 3GB 64GB   Rs 22,499.00
10           Sparx Neo 5 2GB 32GB   Rs 24,499.00
11            VNUS Sky 1 2GB 32GB   Rs 21,499.00
12         Digit Glory 1 2GB 32GB   Rs 23,499.00
13             VNUS SE22 3GB 64GB   Rs 16,999.00
14            Itel A60S 4GB 128GB   Rs 17,499.00
15                    iTel It9010   Rs 27,999.00
16                 iTel Value 110    Rs 2,999.00
17   Oneplus 10 Pro 5G 12GB 256GB  Rs 189,999.00
18   Oneplus 10 Pro 5G 12GB 512GB  Rs

In [18]:
dfs.head()

Unnamed: 0,Product Name,Product Price
0,Infinix Note 30 VIP 8GB 256GB,"Rs 33,999.00"
1,iTel S23 8GB 256GB,"Rs 29,999.00"
2,iTel S23 8GB 128GB,"Rs 30,999.00"
3,ZTE Blade A51 2GB 64GB,"Rs 21,999.00"
4,Honor X6 4GB 64GB,"Rs 34,999.00"


In [19]:
dfs["Product Price"].dtype

dtype('O')

In [20]:
column_name = "Product Price"
dfs[column_name] = dfs[column_name].str.replace('Rs.', '').str.replace(',', '', regex=True).astype(float)

  dfs[column_name] = dfs[column_name].str.replace('Rs.', '').str.replace(',', '', regex=True).astype(float)


In [21]:
dfs["Product Price"].dtype

dtype('float64')

In [22]:
dfs.head()

Unnamed: 0,Product Name,Product Price
0,Infinix Note 30 VIP 8GB 256GB,33999.0
1,iTel S23 8GB 256GB,29999.0
2,iTel S23 8GB 128GB,30999.0
3,ZTE Blade A51 2GB 64GB,21999.0
4,Honor X6 4GB 64GB,34999.0


# Function for product name

In [23]:
data_s = dfs['Product Name']

# Define a function to apply the regex transformation
def process_string(input_str):
    input_str = input_str.replace("GB/", "/")
    process_str = re.sub(r'[^a-zA-Z0-9/ ]', '', input_str)
    process_str = process_str.replace('/','GB ')
    # Convert to lowercase
    process_str = process_str.lower()
    gb_indices = [i for i in range(len(process_str)) if process_str.startswith('gb', i)]
    if len(gb_indices) >= 2:
        second_gb_index = gb_indices[1]
        process_str = process_str[:second_gb_index + 2]
        
    # Remove extra spaces
    process_str = ' '.join(process_str.split())

    return process_str
        

# Apply the function to the entire DataFrame
dfs['Product Name'] = data_s.apply(clean_text)

# Display the modified DataFrame
print(dfs)

                     Product Name  Product Price
0   infinix note 30 vip 8gb 256gb        33999.0
1              itel s23 8gb 256gb        29999.0
2              itel s23 8gb 128gb        30999.0
3          zte blade a51 2gb 64gb        21999.0
4               honor x6 4gb 64gb        34999.0
5               sparx s6 2gb 32gb        40999.0
6       tecno spark 10c 4gb 128gb        16299.0
7        infinix smart 7 4gb 64gb        29999.0
8               itel a26 2gb 32gb        25999.0
9            sparx neo 7 3gb 64gb        22499.0
10           sparx neo 5 2gb 32gb        24499.0
11            vnus sky 1 2gb 32gb        21499.0
12         digit glory 1 2gb 32gb        23499.0
13             vnus se22 3gb 64gb        16999.0
14            itel a60s 4gb 128gb        17499.0
15                    itel it9010        27999.0
16                 itel value 110         2999.0
17   oneplus 10 pro 5g 12gb 256gb       189999.0
18   oneplus 10 pro 5g 12gb 512gb       198999.0
19       google pixe

# Comparison

In [28]:
#import pandas as pd

df1 = df_n.drop_duplicates(subset='Product Name')
df2 = dfs.drop_duplicates(subset='Product Name')

# Merge DataFrames on 'Product Name'
merged_df = pd.merge(df1, df2, how='inner', on='Product Name', suffixes=('_Naheed', '_Shophive'))

# Create a new column 'min_price' with the minimum price
merged_df['min_price'] = merged_df[['Product Price_Naheed', 'Product Price_Shophive']].min(axis=1)

# print DataFrame
print(merged_df[['Product Name', 'Product Price_Naheed', 'Product Price_Shophive', 'min_price']])


                    Product Name  Product Price_Naheed  \
0     xiaomi redmi 12c 4gb 128gb               27999.0   
1  samsung galaxy a04s 4gb 128gb               43999.0   

   Product Price_Shophive  min_price  
0                 33999.0    27999.0  
1                 22999.0    22999.0  


In [29]:
merged_df.head()

Unnamed: 0,Product Name,Product Price_Naheed,Product Price_Shophive,min_price
0,xiaomi redmi 12c 4gb 128gb,27999.0,33999.0,27999.0
1,samsung galaxy a04s 4gb 128gb,43999.0,22999.0,22999.0


In [32]:
merged_df['recommended_website'] = merged_df.apply(lambda row: 'https://www.naheed.pk/phones-tablets/smartphones' if row['min_price'] == row['Product Price_Naheed']
                                                  else 'https://www.shophive.com/mobile-phones', axis=1)


print(merged_df[['Product Name', 'Product Price_Naheed', 'Product Price_Shophive', 'min_price', 'recommended_website']])

                    Product Name  Product Price_Naheed  \
0     xiaomi redmi 12c 4gb 128gb               27999.0   
1  samsung galaxy a04s 4gb 128gb               43999.0   

   Product Price_Shophive  min_price  \
0                 33999.0    27999.0   
1                 22999.0    22999.0   

                                recommended_website  
0  https://www.naheed.pk/phones-tablets/smartphones  
1            https://www.shophive.com/mobile-phones  


In [33]:
merged_df.head()

Unnamed: 0,Product Name,Product Price_Naheed,Product Price_Shophive,min_price,recommended_website
0,xiaomi redmi 12c 4gb 128gb,27999.0,33999.0,27999.0,https://www.naheed.pk/phones-tablets/smartphones
1,samsung galaxy a04s 4gb 128gb,43999.0,22999.0,22999.0,https://www.shophive.com/mobile-phones
