In [1]:
# Library
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import concurrent.futures

In [2]:
# Read the Excel database
# NOTE(review): "Your_File.xlsx" looks like a placeholder name — point excel_file at the real workbook before running.
excel_file = "Your_File.xlsx"
# Loaded with read_excel defaults (no sheet or column options are passed).
# Later cells read the 'Website', 'Rating' and 'RatingCount' columns from this frame.
df = pd.read_excel(excel_file)

In [3]:
# Classify Websites
# Define a function to classify the websites
def classify_website(website):
    """Label a website URL by the kind of online presence it points to.

    The check is a case-insensitive substring match on the URL text.

    Returns:
        'Social Media Presence' when the URL mentions facebook, instagram or twitter;
        'Listed on Specific Platform' when it mentions planity or yelp;
        'Regular Website' for any other string;
        None when ``website`` is not a string (for example a missing value).
    """
    # Anything that is not text (NaN, None, numbers) is left unclassified.
    if not isinstance(website, str):
        return None

    url_text = website.lower()
    social_networks = ('facebook', 'instagram', 'twitter')
    listing_platforms = ('planity', 'yelp')

    # Social networks are tested first, so they win when both groups match.
    if any(keyword in url_text for keyword in social_networks):
        return 'Social Media Presence'
    if any(keyword in url_text for keyword in listing_platforms):
        return 'Listed on Specific Platform'
    return 'Regular Website'

# Add a new column to the DataFrame to store the classification
# classify_website is called once per value of the 'Website' column; it returns
# nothing for non-string values, so rows without a website stay unclassified.
df['Website Classification'] = df['Website'].apply(classify_website)

In [4]:
# Data Cleaning and Formatting
# Convert 'RatingCount' (e.g. "1,234") and 'Rating' (e.g. "4.4 out of 5") to floats.
# Each column is converted through a text round-trip so that the cell:
#   * also works when Excel already delivered numbers (or a mix of numbers and text),
#     where a bare `.str` accessor would raise or silently turn the numbers into NaN;
#   * can be re-run safely after the columns have become floats.
for column, noise in (("RatingCount", ","), ("Rating", " out of 5")):
    raw = df[column]
    as_text = raw.astype(str).str.replace(noise, "", regex=False).str.strip()
    # Restore the original missing values (astype(str) would have made them the text "nan"),
    # then parse strictly: an unexpected value still raises instead of being hidden.
    df[column] = pd.to_numeric(as_text.where(raw.notna())).astype("float")

In [None]:
# Check the data type: print each column's dtype and non-null count
df.info()

In [6]:
# Check data: preview the first rows of df (last expression, so the notebook renders it)
df.head()

Unnamed: 0,Keyword,Provider,Rating,RatingCount,ServiceType,Experience,ServeArea,Provides,OpenHours,Phone,Website,Services,Address,Testimonial1,Testimonial2,Testimonial3,Website Classification
0,"Restaurants, Alpes Maritimes",Lou Castelet Restaurant,4.4,1234.0,French restaurant,,"Carros, France",,,33493290000.0,www.loucastelet.com,,"Les Plans de, 728 Rte Départementale, 06510 Ca...",,,"""Super service, very helpful staff. Great food.""",Regular Website
1,"Bar, Alpes Maritimes","FOAM Nice, Port Lympia : Bar à Bière - Craft B...",4.1,431.0,Beer hall,,"Nice, France",,,33493900000.0,www.foamnice.com,,"3 Quai des Deux Emmanuels, 06300 Nice, France",,,"""Wide variety of beers and amazing food""",Regular Website
2,"Pub, Alpes Maritimes",The Dukes Pub,3.9,534.0,Pub,,"Cannes, France",,,33493380000.0,,,"59 Rue Félix Faure, 06400 Cannes, France",,,"""Lovely atmosphere, service and drinks here wi...",
3,"Hotel , Alpes Maritimes",Hôtel Vendôme Nice,4.1,630.0,Hotel,,"Nice, France",,,33493620000.0,www.hotel-vendome-nice.com,,"26 Rue Pastorelli, 06000 Nice, France",,,"""Central to stay, near to restaurant Little Ha...",Regular Website
4,"Superette , Alpes Maritimes",Vival,3.9,11.0,Convenience store,,"Cannes, France",,,33494000000.0,magasins.vival.fr,,"140 Av. de Grasse, 06400 Cannes, France",,,,Regular Website


In [7]:
# Give Rating and RatingCount the same weight so businesses can be compared on a single score
# Define functions for normalization and for calculating the weighted score
def normalize(value, min_val, max_val):
    """Min-max scale ``value`` relative to the interval [min_val, max_val].

    Args:
        value: Number (or array-like supporting arithmetic) to scale.
        min_val: Lower bound of the reference interval.
        max_val: Upper bound of the reference interval.

    Returns:
        ``(value - min_val) / (max_val - min_val)``. When the interval has zero
        width (all reference values identical) the result is 0.0 instead of a
        division by zero; a NaN ``value`` still yields NaN.
    """
    span = max_val - min_val
    if span == 0:
        # Degenerate interval: there is no spread to scale against.
        # Multiplying by 0.0 gives 0.0 for real inputs while keeping NaN as NaN.
        return (value - min_val) * 0.0
    return (value - min_val) / span

def calculate_weighted_score(row, min_rating, max_rating, min_rating_count, max_rating_count):
    """Combine a row's rating and rating count into one equally weighted score.

    Args:
        row: Mapping-like record providing 'Rating' and 'RatingCount'.
        min_rating, max_rating: Bounds used to scale 'Rating'.
        min_rating_count, max_rating_count: Bounds used to scale 'RatingCount'.

    Returns:
        The mean of the two values after each has been passed through ``normalize``.
    """
    # Bring both metrics onto the same scale before mixing them.
    scaled_rating = normalize(row['Rating'], min_rating, max_rating)
    scaled_count = normalize(row['RatingCount'], min_rating_count, max_rating_count)

    # Equal weights: plain average of the two scaled values.
    return (scaled_rating + scaled_count) / 2

# Dataset-wide bounds used to scale the rating and the rating count
min_rating, max_rating = df['Rating'].min(), df['Rating'].max()
min_rating_count, max_rating_count = df['RatingCount'].min(), df['RatingCount'].max()

# Score every row; the bounds are forwarded to calculate_weighted_score as extra positional arguments
score_bounds = (min_rating, max_rating, min_rating_count, max_rating_count)
df['WeightedScore'] = df.apply(calculate_weighted_score, axis=1, args=score_bounds)


In [None]:
# Check data: highest weighted score on the first line, lowest on the second
print(df["WeightedScore"].max(), df["WeightedScore"].min(), sep="\n")

In [9]:
# Check data: preview the first rows, which now include the WeightedScore column
df.head()

Unnamed: 0,Keyword,Provider,Rating,RatingCount,ServiceType,Experience,ServeArea,Provides,OpenHours,Phone,Website,Services,Address,Testimonial1,Testimonial2,Testimonial3,Website Classification,WeightedScore
0,"Restaurants, Alpes Maritimes",Lou Castelet Restaurant,4.4,1234.0,French restaurant,,"Carros, France",,,33493290000.0,www.loucastelet.com,,"Les Plans de, 728 Rte Départementale, 06510 Ca...",,,"""Super service, very helpful staff. Great food.""",Regular Website,0.496148
1,"Bar, Alpes Maritimes","FOAM Nice, Port Lympia : Bar à Bière - Craft B...",4.1,431.0,Beer hall,,"Nice, France",,,33493900000.0,www.foamnice.com,,"3 Quai des Deux Emmanuels, 06300 Nice, France",,,"""Wide variety of beers and amazing food""",Regular Website,0.412312
2,"Pub, Alpes Maritimes",The Dukes Pub,3.9,534.0,Pub,,"Cannes, France",,,33493380000.0,,,"59 Rue Félix Faure, 06400 Cannes, France",,,"""Lovely atmosphere, service and drinks here wi...",,0.393256
3,"Hotel , Alpes Maritimes",Hôtel Vendôme Nice,4.1,630.0,Hotel,,"Nice, France",,,33493620000.0,www.hotel-vendome-nice.com,,"26 Rue Pastorelli, 06000 Nice, France",,,"""Central to stay, near to restaurant Little Ha...",Regular Website,0.423795
4,"Superette , Alpes Maritimes",Vival,3.9,11.0,Convenience store,,"Cannes, France",,,33494000000.0,magasins.vival.fr,,"140 Av. de Grasse, 06400 Cannes, France",,,,Regular Website,0.363077


In [10]:
# Data Cleaning and Formatting
def _with_https_scheme(website):
    """Return ``website`` as a URL with a scheme, or "N/A" when there is no website.

    * Missing / non-text / blank values, and the "N/A" marker itself, map to "N/A".
    * Values that already start with http:// or https:// are returned unchanged,
      so the prefix is never added twice.
    * Everything else gets "https://" prepended.
    """
    if not isinstance(website, str):
        return "N/A"
    website = website.strip()
    if not website or website == "N/A":
        return "N/A"
    if website.lower().startswith(("http://", "https://")):
        return website
    return "https://" + website

# Applying the helper to already-prefixed values is a no-op, which makes this cell safe
# to re-run (a blind "https://" + column would produce "https://https://..." and "https://N/A").
df['Website'] = df['Website'].map(_with_https_scheme)

In [None]:
## Slow cell: one HTTP request per distinct website (originally noted as taking about 7 minutes)
REQUEST_TIMEOUT_SECONDS = 2
OUTPUT_FILE = "classified_database.xlsx"
METRIC_COLUMNS = ("hyperlinks", "web_text", "web_img")
# Websites with known problems that should be considered individually:
# when fetching one of them fails, its 'hyperlinks' value is flagged as "S/A".
MANUAL_REVIEW_URLS = {"https://magasins.vival.fr", "https://www.intermarche.com", "https://kfc.fr"}


def _fetch_page_metrics(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download ``url`` and measure its landing page.

    Returns a dict with the number of <a> tags ('hyperlinks'), the length of the
    visible text ('web_text') and the number of <img> tags ('web_img').
    Raises on network errors and on HTTP error statuses, so an error page is
    never measured as if it were the real site.
    """
    response = requests.get(url, verify=True, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return {
        "hyperlinks": len(soup.find_all("a")),
        "web_text": len(soup.get_text()),
        "web_img": len(soup.find_all("img")),
    }


# Start from empty metric columns so a re-run never mixes fresh and stale values.
for column in METRIC_COLUMNS:
    df[column] = np.nan
# 'hyperlinks' holds counts and the "S/A" text marker, so it must be an object column:
# writing a string into a float column is deprecated or rejected by newer pandas versions.
df["hyperlinks"] = df["hyperlinks"].astype(object)

# URLs fetched successfully (kept under its original name)
visited_websites = set()
# Result per URL, successes and failures alike: every website is requested once and
# rows sharing a website all receive the same values instead of being left empty.
metrics_by_url = {}

# TODO: detect e-commerce features (online payment, checkout, add to cart, delivery
# options, order tracking, return policy, ...) by scanning the text of linked pages.

try:
    # .items() yields the real index labels, which is what df.loc expects
    # (a positional counter is only correct for a default 0..n-1 index).
    for row_label, url in df["Website"].items():
        # Skip rows without a usable website
        if not isinstance(url, str) or url == "N/A":
            continue

        if url not in metrics_by_url:
            try:
                metrics_by_url[url] = _fetch_page_metrics(url)
                visited_websites.add(url)
                found = metrics_by_url[url]
                print(f"{url}: {found['hyperlinks']} links, {found['web_text']} characters, {found['web_img']} images")
            except Exception as e:
                # Best effort: one unreachable or malformed site must not stop the whole crawl.
                metrics_by_url[url] = {"hyperlinks": "S/A"} if url in MANUAL_REVIEW_URLS else {}
                print(f"Error processing URL at index {row_label}: {e}")

        for column, value in metrics_by_url[url].items():
            df.loc[row_label, column] = value
finally:
    # Write the workbook once (also when the crawl is interrupted) instead of after every site;
    # index=False matches the final export of the notebook.
    df.to_excel(OUTPUT_FILE, index=False)


In [12]:
# Check data: preview the first rows with the scraped hyperlinks / web_text / web_img columns
df.head()

Unnamed: 0,Keyword,Provider,Rating,RatingCount,ServiceType,Experience,ServeArea,Provides,OpenHours,Phone,...,Services,Address,Testimonial1,Testimonial2,Testimonial3,Website Classification,WeightedScore,hyperlinks,web_text,web_img
0,"Restaurants, Alpes Maritimes",Lou Castelet Restaurant,4.4,1234.0,French restaurant,,"Carros, France",,,33493290000.0,...,,"Les Plans de, 728 Rte Départementale, 06510 Ca...",,,"""Super service, very helpful staff. Great food.""",Regular Website,0.496148,94.0,3674.0,14.0
1,"Bar, Alpes Maritimes","FOAM Nice, Port Lympia : Bar à Bière - Craft B...",4.1,431.0,Beer hall,,"Nice, France",,,33493900000.0,...,,"3 Quai des Deux Emmanuels, 06300 Nice, France",,,"""Wide variety of beers and amazing food""",Regular Website,0.412312,35.0,3991.0,59.0
2,"Pub, Alpes Maritimes",The Dukes Pub,3.9,534.0,Pub,,"Cannes, France",,,33493380000.0,...,,"59 Rue Félix Faure, 06400 Cannes, France",,,"""Lovely atmosphere, service and drinks here wi...",,0.393256,,,
3,"Hotel , Alpes Maritimes",Hôtel Vendôme Nice,4.1,630.0,Hotel,,"Nice, France",,,33493620000.0,...,,"26 Rue Pastorelli, 06000 Nice, France",,,"""Central to stay, near to restaurant Little Ha...",Regular Website,0.423795,0.0,0.0,0.0
4,"Superette , Alpes Maritimes",Vival,3.9,11.0,Convenience store,,"Cannes, France",,,33494000000.0,...,,"140 Av. de Grasse, 06400 Cannes, France",,,,Regular Website,0.363077,S/A,,


In [None]:
# Simple or complex website
# A site whose landing page has more links than this is labelled 'Complex'.
COMPLEX_LINK_THRESHOLD = 30

# Filter out rows with 'N/A' or 'S/A' values in the 'hyperlinks' column.
# .copy() makes filtered_df an independent frame, so the assignments below neither
# raise SettingWithCopyWarning nor depend on whether pandas returned a view of df.
filtered_df = df.loc[~df['hyperlinks'].isin(["N/A", "S/A"])].copy()

# Convert the link counts to integers, using 0 as the placeholder for missing values.
# to_numeric first turns an object-typed column into a numeric one; it raises on
# unexpected text rather than hiding it.
# NOTE(review): rows that were never scraped (no website, or the request failed) also
# become 0 here and are therefore labelled 'Simple' — confirm this is intended.
filtered_df['hyperlinks'] = pd.to_numeric(filtered_df['hyperlinks']).fillna(0).astype(int)

# Classify websites as simple or complex based on the number of hyperlinks found
filtered_df['Structure'] = filtered_df['hyperlinks'].apply(
    lambda link_count: 'Complex' if link_count > COMPLEX_LINK_THRESHOLD else 'Simple'
)

# Save the updated DataFrame to an Excel file
filtered_df.to_excel("classified_database.xlsx", index=False)
