# Introduction and Setup

This project involves generating link lists, scraping data from Amazon and Alibaba, and creating datasets. The focus is on constructing a correlation matrix and subsequently ranking arbitrage opportunities based on correlations (primarily the same brand).

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Use this link to change to your custom header:https://httpbin.org/get (Grab your User-Agent)
custom_headers ={'###Your User agent###'}

#Creating Link Lists

In [None]:
#ALIBABA
from googlesearch import search

# Define the query for Google search. inurl:dp for products.
query = "bathroom scale site:alibaba.com inurl:product-detail"

# Perform a Google search and get the URLs
alibaba_links = list(search(query, num=50, stop=100, pause=2))

# Display the links
for i, link in enumerate(alibaba_links, start=1):
    print(f"{i}. {link}")

In [None]:
#AMAZON
from googlesearch import search
# Define the query for Google search. inurl:dp for products.
# Changing this query to anything else is the simpliest way to use this product.
query = "bathroom scale site:amazon.ca inurl:dp"

# Perform a Google search and get the URLs
amazon_links = list(search(query, num=50, stop=300, pause=2))

# Display the links
for i, link in enumerate(amazon_links, start=1):
    print(f"{i}. {link}")

In [5]:
#LINK TESTING - DO THEY WORK
# Explanation: Since many links may not work, we need to filter and retain only the valid ones.

import requests
from googlesearch import search

# Function to filter valid links
def filter_valid_links(links):
    valid_links = []

     # Iterate through the links and check their validity
    for i, link in enumerate(links, start=1):
        try:
            response = requests.get(link, timeout=5)

            # Check if the response status code is 200 (OK)
            if response.status_code == 200:
                valid_links.append(link)
                #print(f"{i}. {link} - Status Code: {response.status_code}")
                pass
            else:
                #print(f"{i}. {link} - Status Code: {response.status_code}. Skipping.")
                pass

        except requests.RequestException as e:
            #print(f"{i}. {link} - Error: {e}. Skipping.")
            pass

    return valid_links


# Filter valid links using the defined function
valid_links_az = filter_valid_links(amazon_links)


# Create a Pandas DataFrame from the valid links
validlinks_df_az = pd.DataFrame({"Valid Links": valid_links_az})
validlinks_df_az

Unnamed: 0,Valid Links
0,https://www.amazon.ca/Precision-Bathroom-Measu...
1,https://www.amazon.ca/GymCope-Capacity-Precisi...
2,https://www.amazon.ca/Active-Era-Bathroom-Prec...
3,https://www.amazon.ca/Malama-Precision-Bathroo...
4,https://www.amazon.ca/Battery-Tempered-Bathroo...
...,...
61,https://www.amazon.ca/Starfrit-093826-Electron...
62,https://www.amazon.ca/Bluetooth-Composition-Tr...
63,https://www.amazon.ca/arboleaf-Bluetooth-Compo...
64,https://www.amazon.ca/WyzeCam-Accurate-Bathroo...


# Scraping Amazon

In [6]:
def scrape_amazon_product(url):
    try:
      # Send a request to the provided URL with custom headers
        page = requests.get(url, headers=custom_headers)
        soup = BeautifulSoup(page.content, "html.parser")

        # Extract product information from the page
        title_element = soup.find(id='productTitle')
        # Extract the product title or set to "N/A" if not found
        title = title_element.get_text().strip() if title_element else "N/A"

        # Extract the product price or set to "N/A" if not found
        price_element = soup.find(class_='a-offscreen')
        price = price_element.get_text().strip()[1:] if price_element else "N/A"

        # Extracting the reviews
        review_element = soup.find(id='averageCustomerReviews')
        review_element

        # Extracting the number of reviews
        num_reviews_element = review_element.find('span', {'id': 'acrCustomerReviewText'})
        num_reviews = num_reviews_element.text.strip() if num_reviews_element else "N/A"

        # Extracting the average rating
        avg_rating_element = review_element.find('span', {'class': 'a-icon-alt'})
        avg_rating = avg_rating_element.text.strip() if avg_rating_element else "N/A"

        # Extracting the special features
        special_features_element = soup.find(class_='a-section a-spacing-small a-spacing-top-small')

        # Initialize variables for brand, color, and special features
        brand = color = special_features = None

        # Check if the special features element is found
        if special_features_element:
            # Find all the rows within the table
            rows = special_features_element.find_all('tr')

            # Loop through each row and extract the information
            for row in rows:
                header_cell = row.find('td', class_='a-span3')
                data_cell = row.find('td', class_='a-span9')

                if header_cell and data_cell:
                    header_text = header_cell.text.strip()
                    data_text = data_cell.text.strip()

                    if header_text == 'Brand':
                        brand = data_text
                    elif header_text == 'Colour':
                        color = data_text
                    elif header_text == 'Special feature':
                        special_features = data_text

        vendor_element = soup.find(id='merchantInfoFeature_feature_div')
        # Initialize vendor variable
        vendor = "N/A"

        # Check if the vendor element is found
        if vendor_element:
            # Find the span with class "a-size-small" within the vendor element
            vendor_span = vendor_element.find('span', class_='a-size-small offer-display-feature-text-message')

            # Check if the span is found
            if vendor_span:
                # Get the text content of the span
                vendor = vendor_span.get_text(strip=True)


        # Create a Pandas DataFrame to store the extracted data
        data = {
            'Title': [title],
            'Price': [price],
            'Brand': [brand],
            'Vendor': [vendor],
            'Colour': [color],
            'Special Features': [special_features],
            'Ratings': [avg_rating],
            'Number of reviews': [num_reviews],
            'URL': [url]
        }
        df = pd.DataFrame(data)

        return df

    except Exception as e:
      # Handle exceptions and print an error message
        print(f"Error processing URL {url}: {e}")
        return pd.DataFrame()

# Example usage:
test = scrape_amazon_product('https://www.amazon.ca/Malama-Precision-Bathroom-Technology-Measurements/dp/B07MFTT87J?th=1')

# Display the DataFrame
test.head()

Unnamed: 0,Title,Price,Brand,Vendor,Colour,Special Features,Ratings,Number of reviews,URL
0,Malama Precision Digital Body Weight Bathroom ...,19.95,Malama & Maple Leaf Design,ECHELLO Wholesale,Black,"Backlit Display, Auto Shut Off, Bodyweight",4.6 out of 5 stars,"3,064 ratings",https://www.amazon.ca/Malama-Precision-Bathroo...


# Scraping AliBaba

In [7]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def extract_attributes_and_more_from_url(url):
    # Make a request to the URL and create BeautifulSoup object
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")

    # Extract title
    title = soup.find(class_='product-title-container')
    title = title.h1.text.strip() if title else "N/A"

    # Extract price
    price_container = soup.find(class_='price')
    price = price_container.getText().strip() if price_container else "Price not found"

    # Find the attribute-layout container
    attribute_layout_container = soup.find(class_='attribute-layout')

    # Initialize extracted_attributes with None for all attributes
    extracted_attributes = {attribute: None for attribute in ["Place of Origin", "Product", "Supply Ability"]}

    # Check if the container is found
    if attribute_layout_container:
        # Loop through each attribute item and extract the information
        for attribute_item in attribute_layout_container.find_all(class_='attribute-item'):
            left_text = attribute_item.find(class_='left')
            right_text = attribute_item.find(class_='right')

            # Check if both left_text and right_text are not None
            if left_text and right_text:
                left_text = left_text.text.strip()
                right_text = right_text.text.strip()

                # Check if the left_text matches any of the specified attributes
                for attribute in extracted_attributes:
                    if attribute in left_text:
                        extracted_attributes[attribute] = right_text

    else:
        print("Attribute-layout container not found" + url)

    # Extract supplier information
    suppliers_container = soup.find(class_='company-name')
    suppliers = suppliers_container.getText().strip() if suppliers_container else "N/A"
    supplier_href_container = suppliers_container.a if suppliers_container else None
    supplier_href = supplier_href_container.get('href') if supplier_href_container else "N/A"

    supplier_rating_container = soup.find(class_='company-basicCapacity')
    store_rating_value = "N/A"
    if supplier_rating_container:
        store_rating_div = supplier_rating_container.find('div', {'class': 'attr-title'}, string='Store rating')
        store_rating_value = store_rating_div.find_next_sibling('div', {'class': 'attr-content'}).get('title') if store_rating_div else "N/A"

    data = {
    'Title': title,
    'Price': price,
    'Link': url,
    'Place of Origin': extracted_attributes["Place of Origin"],
    'Product': extracted_attributes["Product"],
    'Supplier': suppliers,
    'Supply Ability': extracted_attributes["Supply Ability"],
    'Supplier Link': supplier_href,
    'Store Rating': store_rating_value
    }

    # Convert the dictionary to a Pandas DataFrame
    df = pd.DataFrame([data])

    return df


# Example usage with one of the Alibaba links
url1 = "https://www.alibaba.com/product-detail/Commercial-Coffee-Grinder-Electric-Coffee-Grinder_62551610449.html"
result_df_ali = extract_attributes_and_more_from_url(url1)

# Print the resulting DataFrame
result_df_ali

Unnamed: 0,Title,Price,Link,Place of Origin,Product,Supplier,Supply Ability,Supplier Link,Store Rating
0,Commercial Coffee Grinder Electric Coffee Grin...,$100.00/piece,https://www.alibaba.com/product-detail/Commerc...,"Guangdong, China",Coffee Grinder,"Jiangmen OuHuiTe Hardware Products Co., Ltd",10000 Set/Sets per Month,https://wanhuimanufacturing.en.alibaba.com/min...,4.4/5


In [8]:
def ali_baba_baby_scraper(url):
    if not url:
        return pd.Series({'Title': 'N/A', 'Price_ali': 'N/A', 'Product': 'N/A', 'Brand_Name': 'N/A', 'StoreRating_ali': 'N/A'})

    # Make a request to the URL and create BeautifulSoup object
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")

    # Extract title
    title = soup.find(class_='product-title-container')
    title = title.h1.text.strip() if title else "N/A"

    # Extract price
    price_container = soup.find(class_='price-list')
    price = price_container.find(class_='price').getText().strip() if (price_container and price_container.find(class_='price')) else "N/A"

    # Find the attribute-layout container
    attribute_layout_container = soup.find(class_='attribute-layout')

    # Initialize extracted_attributes with None for all attributes
    extracted_attributes = {"Product name": None, "Brand Name": None}

    # Check if the container is found
    if attribute_layout_container:
        # Loop through each attribute item and extract the information
        for attribute_item in attribute_layout_container.find_all(class_='attribute-item'):
            left_text = attribute_item.find(class_='left')
            right_text = attribute_item.find(class_='right')

            # Check if both left_text and right_text are not None
            if left_text and right_text:
                left_text = left_text.text.strip()
                right_text = right_text.text.strip()

                # Check if the left_text matches any of the specified attributes
                for attribute in extracted_attributes:
                    if attribute in left_text:
                        extracted_attributes[attribute] = right_text

    # Extract supplier information
    supplier_rating_container = soup.find(class_='company-basicCapacity')
    store_rating_value = "N/A"
    if supplier_rating_container:
        store_rating_div = supplier_rating_container.find('div', {'class': 'attr-title'}, string='Store rating')
        store_rating_value = store_rating_div.find_next_sibling('div', {'class': 'attr-content'}).get('title') if store_rating_div else "N/A"

    data = {
        'Title': title,
        'Price_ali': price,
        'Product': extracted_attributes["Product name"],
        'Brand_Name': extracted_attributes["Brand Name"],
        'StoreRating_ali': store_rating_value
    }

    return pd.Series(data)

# Example usage with one of the Alibaba links
url1 = "https://www.alibaba.com/product-detail/180kg-Battery-Body-Weight-Scale-Slim_1600971173298.html"
result_df_ali = ali_baba_baby_scraper(url1)

# Print the resulting DataFrame
result_df_ali

Title              180kg Battery Body Weight Scale Slim Electroni...
Price_ali                                                      $5.30
Product                                     Bathroom weighing scales
Brand_Name                                                    BALACI
StoreRating_ali                                                4.6/5
dtype: object

# Creating Data Sets

In [10]:
#AMAZON
# Initiate the list
scraped_data_list_az = []

# Iterate through each valid link in the DataFrame
for url in validlinks_df_az['Valid Links']:
    # Call the function to scrape Amazon product details from the valid URL's
    scraped_data = scrape_amazon_product(url)

    # Check if the scraping was successful (not None)
    if scraped_data is not None:
        scraped_data_list_az.append(scraped_data)

# Combine the scraped data into a DataFrame
scraped_data_list_az_result = pd.concat(scraped_data_list_az, ignore_index=True)
amazon_scales = scraped_data_list_az_result

Error processing URL https://www.amazon.ca/Etekcity-Digital-Bathroom-Measure-Tempered/dp/B01HI1W1V4: 'NoneType' object has no attribute 'find'
Error processing URL https://www.amazon.ca/Lepulse-Electrodes-Percentage-Precision-Composition/dp/B0CFTKQ7ML: 'NoneType' object has no attribute 'find'
Error processing URL https://www.amazon.ca/TAYLOR-TAP7506-Chrome-Lithium-Digital/dp/B0000E2PEI: 'NoneType' object has no attribute 'find'
Error processing URL https://www.amazon.ca/Starfrit-093826-Electronic-Bathroom-Scale/dp/B00CPKKD3C: 'NoneType' object has no attribute 'find'


In [11]:
# ALIBABA
# Initiate the list
scraped_data_list = []

# Iterate through each valid link in the DataFrame
for url in alibaba_links:
    # Call the function to scrape Alibaba product details from the valid URL's
    scraped_data = ali_baba_baby_scraper(url)

    # Check if the scraping was successful (not None)
    if scraped_data is not None:
        scraped_data_list.append(scraped_data)

# Combine the scraped data into a DataFrame
result_df_ali_scrape = pd.DataFrame(scraped_data_list)

# Print the resulting DataFrame
result_df_ali_scrape

Unnamed: 0,Title,Price_ali,Product,Brand_Name,StoreRating_ali
0,Personal Digital Bathroom Scale,$4.30,BATHROOM SCALE,JUSTOP,4.8/5
1,Bathroom Scale Electronic Small Weight Scale B...,$3.90 - $4.80,,,
2,Smart App Bathroom Scale 180kg/396lb Smart LCD...,$7.50,,OEM,4.8/5
3,EB2056 High quality Tempered glass digital per...,$4.50,Digital Bathroom Scale,Kinlee,-
4,Electronic Bathroom scale-EB2032,$5.00,Electronic Bathroom scale-EB2032,Kinlee,-
...,...,...,...,...,...
95,TSEC Fashion Design Body Fat Scale Digital Bat...,$4.30,,tiansheng,4.9/5
96,Vuelos de baño electrónica escala de peso pant...,$6.00,weight scale,,
97,Bathroom Scale Body Weight Scale 180kg Electro...,$2.20,Bathroom Scale Weight,YIQUAN,5.0/5
98,0.001g 0.1g 100g-3kg New precision digital ele...,$66.00 - $115.00,,HOCHOICE,


In [12]:
result_df_ali_scrape['Link_ali']=alibaba_links
# Replace "N/A" with None in the StoreRating_ali column
result_df_ali_scrape['StoreRating_ali'].replace('N/A', None, inplace=True)

# Drop rows with None values in the StoreRating_ali column
result_df_ali_scrape.dropna(subset=['StoreRating_ali'], inplace=True)

# Reset the index
result_df_ali_scrape.reset_index(drop=True, inplace=True)

result_df_ali_scrape

Unnamed: 0,Title,Price_ali,Product,Brand_Name,StoreRating_ali,Link_ali
0,Personal Digital Bathroom Scale,$4.30,BATHROOM SCALE,JUSTOP,4.8/5,https://www.alibaba.com/product-detail/Persona...
1,Smart App Bathroom Scale 180kg/396lb Smart LCD...,$7.50,,OEM,4.8/5,https://www.alibaba.com/product-detail/Smart-A...
2,EB2056 High quality Tempered glass digital per...,$4.50,Digital Bathroom Scale,Kinlee,-,https://www.alibaba.com/product-detail/EB2056-...
3,Electronic Bathroom scale-EB2032,$5.00,Electronic Bathroom scale-EB2032,Kinlee,-,https://www.alibaba.com/product-detail/Electro...
4,Bathroom Scale TSEC Hot Sale Electronic Bluet...,$7.50,Bathroom scale,Tiansheng with bluetooth scale,4.9/5,https://www.alibaba.com/product-detail/Bathroo...
...,...,...,...,...,...,...
61,Professional Electronic Body Weight Digital Scale,$4.50,,RUIJIAN,4.6/5,https://www.alibaba.com/product-detail/Profess...
62,Electronic Stainless Steel Food Weight Scale W...,$109.00,,whitebird,4.4/5,https://www.alibaba.com/product-detail/Electro...
63,LED Display bathroom weight scale 180kg digita...,$2.50,,huiding,5.0/5,https://www.alibaba.com/product-detail/LED-Dis...
64,TSEC Fashion Design Body Fat Scale Digital Bat...,$4.30,,tiansheng,4.9/5,https://www.alibaba.com/product-detail/TSEC-Fa...


In [13]:
import re

# Function to clean and convert Price_ali to numeric values
def clean_and_convert_price(price_str):
    try:
        # Ensure the input is treated as a string
        price_str = str(price_str)

        # Extract numeric values from the string
        prices = [float(val) for val in re.findall(r'\d+\.\d+', price_str) if '.' in val]

        # Check if there are any numeric values
        if not prices:
            return None

        # Check if it's a range and calculate the mean
        if len(prices) > 1:
            return sum(prices) / len(prices)
        else:
            return prices[0]
    except Exception as e:
        print(f"Error: {e}, Price_str: {price_str}")
        return None

In [14]:
result_df_ali_scrape['Price_ali'] = result_df_ali_scrape['Price_ali'].apply(clean_and_convert_price)
result_df_ali_scrape

Unnamed: 0,Title,Price_ali,Product,Brand_Name,StoreRating_ali,Link_ali
0,Personal Digital Bathroom Scale,4.3,BATHROOM SCALE,JUSTOP,4.8/5,https://www.alibaba.com/product-detail/Persona...
1,Smart App Bathroom Scale 180kg/396lb Smart LCD...,7.5,,OEM,4.8/5,https://www.alibaba.com/product-detail/Smart-A...
2,EB2056 High quality Tempered glass digital per...,4.5,Digital Bathroom Scale,Kinlee,-,https://www.alibaba.com/product-detail/EB2056-...
3,Electronic Bathroom scale-EB2032,5.0,Electronic Bathroom scale-EB2032,Kinlee,-,https://www.alibaba.com/product-detail/Electro...
4,Bathroom Scale TSEC Hot Sale Electronic Bluet...,7.5,Bathroom scale,Tiansheng with bluetooth scale,4.9/5,https://www.alibaba.com/product-detail/Bathroo...
...,...,...,...,...,...,...
61,Professional Electronic Body Weight Digital Scale,4.5,,RUIJIAN,4.6/5,https://www.alibaba.com/product-detail/Profess...
62,Electronic Stainless Steel Food Weight Scale W...,109.0,,whitebird,4.4/5,https://www.alibaba.com/product-detail/Electro...
63,LED Display bathroom weight scale 180kg digita...,2.5,,huiding,5.0/5,https://www.alibaba.com/product-detail/LED-Dis...
64,TSEC Fashion Design Body Fat Scale Digital Bat...,4.3,,tiansheng,4.9/5,https://www.alibaba.com/product-detail/TSEC-Fa...


In [15]:
exchange_rate_usd_to_cad = 1.27  # Replace this with the actual exchange rate

# Convert 'Price_ali' from USD to CAD
result_df_ali_scrape['Price_ali_cad'] = result_df_ali_scrape['Price_ali'] * exchange_rate_usd_to_cad

# Correlation Matrix

Creating data sets

In [16]:
# Creation the data tables for the correlation Matrix
amazon_scales_corr_mini = amazon_scales[['Brand', 'Title']]
ali_scrape_corr_mini = result_df_ali_scrape[['Brand_Name', 'Title']]

In [None]:
# Replace "N/A" with None in the Brand and Title columns
amazon_scales_corr_mini.loc[:, 'Brand'] = amazon_scales_corr_mini['Brand'].replace('N/A', None)
amazon_scales_corr_mini.loc[:, 'Title'] = amazon_scales_corr_mini['Title'].replace('N/A', None)

# Drop rows with None values in the Brand and Title columns
amazon_scales_corr_mini = amazon_scales_corr_mini.dropna(subset=['Brand', 'Title'])

# Reset the index if needed
amazon_scales_corr_mini.reset_index(drop=True, inplace=True)


In [None]:
# Replace "N/A" with None in the Brand and Title columns
ali_scrape_corr_mini.loc[:, 'Brand_Name'] = ali_scrape_corr_mini['Brand_Name'].replace('N/A', None)
ali_scrape_corr_mini.loc[:, 'Title'] = ali_scrape_corr_mini['Title'].replace('N/A', None)

# Drop rows with None values in the Brand_Name and Title columns
ali_scrape_corr_mini = ali_scrape_corr_mini.dropna(subset=['Brand_Name', 'Title'])

# Reset the index if needed
ali_scrape_corr_mini.reset_index(drop=True, inplace=True)

Performing correlations on the data sets

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def create_correlation_matrix(df_ali, df_amazon, brand_weight=0.5, title_weight=0.5):
    # Convert the text from the Brand_Name and Title columns to matrices of word counts
    ali_brand_vectorizer = CountVectorizer().fit(df_ali['Brand_Name'])
    ali_title_vectorizer = CountVectorizer().fit(df_ali['Title'])

    amazon_brand_vectorizer = CountVectorizer(vocabulary=ali_brand_vectorizer.vocabulary_).fit(df_amazon['Brand'])
    amazon_title_vectorizer = CountVectorizer(vocabulary=ali_title_vectorizer.vocabulary_).fit(df_amazon['Title'])

    ali_brand_matrix = ali_brand_vectorizer.transform(df_ali['Brand_Name'])
    ali_title_matrix = ali_title_vectorizer.transform(df_ali['Title'])

    amazon_brand_matrix = amazon_brand_vectorizer.transform(df_amazon['Brand'])
    amazon_title_matrix = amazon_title_vectorizer.transform(df_amazon['Title'])

    # Get the cosine similarity for brand and title
    brand_similarity = cosine_similarity(ali_brand_matrix, amazon_brand_matrix)
    title_similarity = cosine_similarity(ali_title_matrix, amazon_title_matrix)

    # Combine brand and title similarities with weights
    combined_scores = brand_weight * brand_similarity + title_weight * title_similarity

    # Create a DataFrame with the correlation matrix
    correlation_matrix = pd.DataFrame(combined_scores, index=df_ali.index, columns=df_amazon.index)

    return correlation_matrix



In [None]:
# Create the correlation matrix
correlation_matrix = create_correlation_matrix(ali_scrape_corr_mini, amazon_scales_corr_mini)

# Display the correlation matrix
correlation_matrix

In [None]:
import pandas as pd


max_correlation_data = pd.DataFrame(columns=['Column', 'Max_Correlation_Value', 'Corresponding_Row_Title', 'Corresponding_Column_Title'])

for column in correlation_matrix.columns:
    max_row_index = correlation_matrix[column].idxmax()
    max_correlation_value = correlation_matrix[column].max()

    max_row_title = ali_scrape_corr_mini.loc[max_row_index, 'Title']
    max_column_title = amazon_scales_corr_mini.loc[column, 'Title']

    max_correlation_data = max_correlation_data.append({
        'Column': column,
        'Max_Correlation_Value': max_correlation_value,
        'Corresponding_Row_Title': max_row_title,
        'Corresponding_Column_Title': max_column_title
    }, ignore_index=True)

# Display the DataFrame with maximum correlation values and corresponding titles
max_correlation_data

#Creating Final data set with correlations

In [None]:

# Merge the DataFrames on 'Corresponding_Column_Title' and 'Column'
merged_df = pd.merge(amazon_scales, max_correlation_data, how='left', left_on='Title', right_on='Corresponding_Column_Title')

amz_ali_scales= amazon_scales
# Add the relevant information to amazon_scales as amz_ali_scales
amz_ali_scales['ali_title'] = merged_df['Corresponding_Row_Title']
amz_ali_scales['Correlation'] = merged_df['Max_Correlation_Value']

# Display the modified amazon_scales DataFrame
amz_ali_scales

In [23]:
# Merge the two DataFrames based on ali_title
merged_data = pd.merge(amz_ali_scales, result_df_ali_scrape, left_on='ali_title', right_on='Title', how='left')

amz_ali_scales['Price_ali_cad'] = merged_data['Price_ali_cad']
amz_ali_scales['Link_ali'] = merged_data['Link_ali']

In [None]:
output_data = amz_ali_scales[["Title", "Price","URL", "ali_title", "Price_ali_cad", "Link_ali","Correlation"]]
output_data["Price_Diff"] = output_data["Price"].astype(float) - output_data["Price_ali_cad"].astype(float)
output_data

In [None]:
output_data_ranked = output_data.sort_values(by="Correlation", ascending=False)
output_data_ranked = output_data_ranked.reset_index(drop=True)
output_data_ranked

#Downloading

In [None]:
#Changing the HVAC
from google.colab import drive

drive.mount('/content/drive')

# Save the shrunk dataset to a new CSV file
amz_ali_scales.to_csv('/content/drive/MyDrive/###Your File Path###/amz_ali_scales.csv', index=False, sep=',')

# Provide a download link for the file
from google.colab import files
files.download('/content/drive/MyDrive/Colab Notebooks/data/amz_ali_scales.csv')

# Result

In [None]:
# The most significant correlation was for this alibaba link https://www.alibaba.com/product-detail/Lepulse-P1-Body-Scale-Fat-Percentage_1600503656139.html and this amazon link https://www.amazon.ca/Lepulse-Electrodes-Percentage-Precision-Composition/dp/B0CFTKQ7ML
# This would have an arbitrage opportunity of 65.06 CAD assuming shipping parity
output_data_ranked

Unnamed: 0,Title,Price,URL,ali_title,Price_ali_cad,Link_ali,Correlation,Price_Diff
0,"Lepulse Body Fat Scale, 8 Electrodes Weighing ...",139.99,https://www.amazon.ca/Lepulse-Electrodes-Perce...,Lepulse P1 Body Scale Fat Percentage Analyzer ...,74.93,https://www.alibaba.com/product-detail/Lepulse...,0.888589,65.06
1,"Body Fat Scale, Lepulse Balance Poids Large Di...",79.99,https://www.amazon.ca/Lepulse-Accurate-Bluetoo...,Lepulse P1 Body Scale Fat Percentage Analyzer ...,74.93,https://www.alibaba.com/product-detail/Lepulse...,0.812527,5.06
2,"arboleaf Scale Body Weight, Smart Weight Scale...",49.99,https://www.amazon.ca/arboleaf-Bluetooth-Compo...,High Quality Digital Bathroom Smart Body Fat W...,11.303,https://www.alibaba.com/product-detail/High-Qu...,0.394116,38.687
3,Malama Digital Bathroom Scale Body Weight Scal...,24.97,https://www.amazon.ca/Malama-Bathroom-Weighing...,Canny digital weighing scale household weight...,5.7785,https://www.alibaba.com/product-detail/Canny-d...,0.387836,19.1915
4,"RENPHO Digital Body Weight Bathroom Scale, Hig...",27.99,https://www.amazon.ca/RENPHO-Digital-Bathroom-...,LED Display bathroom weight scale 180kg digita...,3.175,https://www.alibaba.com/product-detail/LED-Dis...,0.379291,24.815
5,"WYZE Scale Digital Bathroom Body Weight Scale,...",59.99,https://www.amazon.ca/Bluetooth-Composition-Tr...,Constant-593S body fat scale Smart APP 180Kg ...,14.17955,https://www.alibaba.com/product-detail/Constan...,0.371742,45.81045
6,RENPHO Elis 1 Bluetooth Body Fat Scale Smart D...,27.29,https://www.amazon.ca/RENPHO-Bluetooth-Body-We...,Constant-593S body fat scale Smart APP 180Kg ...,14.17955,https://www.alibaba.com/product-detail/Constan...,0.371688,13.11045
7,"RENPHO Digital Body Weight Bathroom Scale, Hig...",18.99,https://www.amazon.ca/RENPHO-Digital-Bathroom-...,LED Display bathroom weight scale 180kg digita...,3.175,https://www.alibaba.com/product-detail/LED-Dis...,0.371305,15.815
8,KUNOVA (TM) Digital Bathroom Weight Body Scale...,17.99,https://www.amazon.ca/Dr-Digital-Bathroom-Tech...,LED Display bathroom weight scale 180kg digita...,3.175,https://www.alibaba.com/product-detail/LED-Dis...,0.36997,14.815
9,Active Era Digital Body Weight Scale - Ultra S...,24.99,https://www.amazon.ca/Active-Era-Bathroom-Prec...,Household Tempered Glass Personal Scale 180kg ...,3.6195,https://www.alibaba.com/product-detail/Househo...,0.3669,21.3705
