In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import nltk
import string
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

In [6]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Fungsi untuk membersihkan teks
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # menghapus mentions
    text = re.sub(r'#', '', text)  # menghapus hashtag
    text = re.sub(r'http\S+', '', text)  # menghapus link
    text = re.sub(r'[0-9]+', '', text)  # menghapus angka
    text = text.replace('\n', ' ')  # mengganti newline dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation))  # menghapus tanda baca
    text = text.strip()  # menghapus spasi di awal dan akhir teks
    return text

# Fungsi untuk mengubah teks menjadi huruf kecil
def casefoldingText(text):
    text = text.lower()
    return text

# Fungsi untuk melakukan tokenisasi teks
def tokenizingText(text):
    text = word_tokenize(text)
    return text

# Fungsi untuk menghapus stopwords dalam teks
def filteringText(text):
    # Gabungan stopwords bahasa Indonesia dan Inggris
    listStopwords = set(stopwords.words('indonesian')) | set(stopwords.words('english'))
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    return filtered

In [4]:
#menggabungkan fitur dengan category dan nama
df_products_cleaned = pd.read_excel('products_cleaned.xlsx')

df_products_cleaned['Combined_Features'] = (
    df_products_cleaned['Category'] + ' ' +
    df_products_cleaned['Name']
)

In [7]:
# Membuat list kosong untuk menyimpan hasil tokenisasi
tokenized_features = []

# Looping melalui setiap elemen dalam kolom 'Combined_Features'
for feature in df_products_cleaned['Combined_Features']:
    # Membersihkan teks
    cleaned_text = cleaningText(feature)
    # Mengubah huruf menjadi kecil (case folding)
    casefolded_text = casefoldingText(cleaned_text)
    # Melakukan tokenisasi
    tokenized_text = tokenizingText(casefolded_text)
    # Menghapus stopwords
    filtered_text = filteringText(tokenized_text)

    # Menyimpan hasil filtering ke dalam list
    tokenized_features.append(filtered_text)

# Menambahkan hasil filtering ke DataFrame sebagai kolom baru
df_products_cleaned['Combined_Tokenized'] = tokenized_features

# Menampilkan hasil akhir
df_products_cleaned


Unnamed: 0,Name,Price,Discount,Category,Brand,Min Purchase,Size Dimensions,Seller,Ratings,Combined_Features,Combined_Tokenized
0,LINE REGULATOR TYPE Of GAS COMPRESSED AIR INCL...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR TYPE Of G...","[bahan, kimia, gas, bumi, line, regulator, typ..."
1,LINE REGULATOR Of GAS METHANE INCLUDE JASA PEM...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR Of GAS ME...","[bahan, kimia, gas, bumi, line, regulator, gas..."
2,LINE REGULATOR TYPE Of GAS NITROUS OXIDE INCLU...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR TYPE Of G...","[bahan, kimia, gas, bumi, line, regulator, typ..."
3,LINE REGULATOR TYPE Of GAS AMMONIA INCLUDE JAS...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR TYPE Of G...","[bahan, kimia, gas, bumi, line, regulator, typ..."
4,LINE REGULATOR TYPE Of GAS CARBON MONOXIDE INC...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR TYPE Of G...","[bahan, kimia, gas, bumi, line, regulator, typ..."
...,...,...,...,...,...,...,...,...,...,...,...
2669,Pohon,123456,No discount found,"Wedding, Decoration, Florist",No brand found,No min purchase found,No size dimensions found,ROYAL MULIA GROUP,0,"Wedding, Decoration, Florist Pohon","[wedding, decoration, florist, pohon]"
2670,Hand Gloves Polkadot Putih (Per Lusin),45000,No discount found,"Wedding, Perhiasan & Aksesori, Hiasan & Sarung...",Lokal,1 pcs,10x8x1cm\n(Berat volume: 0.01kg),PT Caturindo Bersama Cemerlang,4.7,"Wedding, Perhiasan & Aksesori, Hiasan & Sarung...","[wedding, perhiasan, aksesori, hiasan, sarung,..."
2671,Polkadot Gloves Putih Bintik Kuning,3500,No discount found,"Wedding, Perhiasan & Aksesori, Hiasan & Sarung...",Lokal,1 pcs,10x7x2cm\n(Berat volume: 0.02kg),PT Caturindo Bersama Cemerlang,4.7,"Wedding, Perhiasan & Aksesori, Hiasan & Sarung...","[wedding, perhiasan, aksesori, hiasan, sarung,..."
2672,GRAND TJOKRO - PLN 2184,858500,No discount found,"Wedding, Venue, Villa & Resort",No brand found,No min purchase found,No size dimensions found,KW travel,5,"Wedding, Venue, Villa & Resort GRAND TJOKRO - ...","[wedding, venue, villa, resort, grand, tjokro,..."


In [8]:
#menghilangkan koma dan membuat nya jadi string
df_products_cleaned['Combined_Tokenized_Str'] = df_products_cleaned['Combined_Tokenized'].apply(lambda x: ' '.join(x))
df_products_cleaned

Unnamed: 0,Name,Price,Discount,Category,Brand,Min Purchase,Size Dimensions,Seller,Ratings,Combined_Features,Combined_Tokenized,Combined_Tokenized_Str
0,LINE REGULATOR TYPE Of GAS COMPRESSED AIR INCL...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR TYPE Of G...","[bahan, kimia, gas, bumi, line, regulator, typ...",bahan kimia gas bumi line regulator type gas c...
1,LINE REGULATOR Of GAS METHANE INCLUDE JASA PEM...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR Of GAS ME...","[bahan, kimia, gas, bumi, line, regulator, gas...",bahan kimia gas bumi line regulator gas methan...
2,LINE REGULATOR TYPE Of GAS NITROUS OXIDE INCLU...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR TYPE Of G...","[bahan, kimia, gas, bumi, line, regulator, typ...",bahan kimia gas bumi line regulator type gas n...
3,LINE REGULATOR TYPE Of GAS AMMONIA INCLUDE JAS...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR TYPE Of G...","[bahan, kimia, gas, bumi, line, regulator, typ...",bahan kimia gas bumi line regulator type gas a...
4,LINE REGULATOR TYPE Of GAS CARBON MONOXIDE INC...,5760000,10%,"Bahan Kimia, Gas Bumi",AFK,1 pcs,25x25x20cm\n(Berat volume: 2.08kg),PELANGI GASINDO NUSANTARA,0,"Bahan Kimia, Gas Bumi LINE REGULATOR TYPE Of G...","[bahan, kimia, gas, bumi, line, regulator, typ...",bahan kimia gas bumi line regulator type gas c...
...,...,...,...,...,...,...,...,...,...,...,...,...
2669,Pohon,123456,No discount found,"Wedding, Decoration, Florist",No brand found,No min purchase found,No size dimensions found,ROYAL MULIA GROUP,0,"Wedding, Decoration, Florist Pohon","[wedding, decoration, florist, pohon]",wedding decoration florist pohon
2670,Hand Gloves Polkadot Putih (Per Lusin),45000,No discount found,"Wedding, Perhiasan & Aksesori, Hiasan & Sarung...",Lokal,1 pcs,10x8x1cm\n(Berat volume: 0.01kg),PT Caturindo Bersama Cemerlang,4.7,"Wedding, Perhiasan & Aksesori, Hiasan & Sarung...","[wedding, perhiasan, aksesori, hiasan, sarung,...",wedding perhiasan aksesori hiasan sarung tanga...
2671,Polkadot Gloves Putih Bintik Kuning,3500,No discount found,"Wedding, Perhiasan & Aksesori, Hiasan & Sarung...",Lokal,1 pcs,10x7x2cm\n(Berat volume: 0.02kg),PT Caturindo Bersama Cemerlang,4.7,"Wedding, Perhiasan & Aksesori, Hiasan & Sarung...","[wedding, perhiasan, aksesori, hiasan, sarung,...",wedding perhiasan aksesori hiasan sarung tanga...
2672,GRAND TJOKRO - PLN 2184,858500,No discount found,"Wedding, Venue, Villa & Resort",No brand found,No min purchase found,No size dimensions found,KW travel,5,"Wedding, Venue, Villa & Resort GRAND TJOKRO - ...","[wedding, venue, villa, resort, grand, tjokro,...",wedding venue villa resort grand tjokro pln


In [9]:
#mengubah feature menjadi matriks vektor menggunakan tf-idf
vectorizer = TfidfVectorizer()
combined_features_matrix = vectorizer.fit_transform(df_products_cleaned['Combined_Tokenized_Str'])

#menghitung cosine similarity pada matriks
cosine_sim = cosine_similarity(combined_features_matrix)

In [17]:
def get_cheaper_recommendations(product_index, cosine_sim, sim_threshold):
    selected_product_price = df_products_cleaned.iloc[product_index]['Price']

    # Mendapatkan skor kesamaan untuk produk yang dipilih
    sim_scores = []
    for i in range(len(cosine_sim[product_index])):
        sim_scores.append((i, cosine_sim[product_index][i]))

    # Mengurutkan skor kesamaan secara menurun (mirip dari yang tertinggi ke yang terendah)
    for i in range(len(sim_scores)):
        for j in range(i + 1, len(sim_scores)):
            if sim_scores[j][1] > sim_scores[i][1]:
                temp = sim_scores[i]
                sim_scores[i] = sim_scores[j]
                sim_scores[j] = temp

    # Menyimpan produk yang memiliki harga lebih rendah dari produk yang dipilih dan sim score di atas threshold
    cheaper_similar_products = []
    for i in range(1, len(sim_scores)):  # Mulai dari 1 untuk melewatkan produk yang dipilih sendiri
        product_idx = sim_scores[i][0]
        product_price = df_products_cleaned.iloc[product_idx]['Price']
        product_sim_score = sim_scores[i][1]

        # Filter berdasarkan harga yang lebih rendah dan sim score yang memenuhi threshold
        if  product_sim_score :
            cheaper_similar_products.append(product_idx)

        if len(cheaper_similar_products) == 5:  # Batasi jumlah rekomendasi hingga 5 produk
            break

    # Mengembalikan 5 produk teratas dengan harga lebih murah dan sim score di atas threshold
    result = []
    for idx in cheaper_similar_products:
        result.append(df_products_cleaned.iloc[idx])

    return result

testing (jumlah dataset dapat mempengaruhi item yang direkomendasikan)

In [18]:
product_title = input("enter title: ")

#cari produk berdasarkan tittle
matching_products = []
for idx in range(len(df_products_cleaned)):
    if product_title.lower() in df_products_cleaned.iloc[idx]['Name'].lower():
        matching_products.append((idx, df_products_cleaned.iloc[idx]))

#menampilkan produk berdasarkan inputan
print("products:")
for idx, product in matching_products:
    print(f"Index: {idx}, Name: {product['Name']}, Price: {product['Price']}")

#memilih index products
if len(matching_products) > 0:
    product_index = int(input("Enter the index of the product you want to select: "))

    cheaper_recommended_products = get_cheaper_recommendations(product_index, cosine_sim, 0.4)
    print("Here are the cheaper products similar to the one you selected:")
    for product in cheaper_recommended_products:
        print(product[['Name', 'Price']])
else:
    print("No products matched your search.")

enter title: batik
products:
Index: 319, Name: Atasan Batik Anak, Price: 60000
Index: 337, Name: Set Kebaya Batik Anak Modern, Price: 250000
Index: 338, Name: Dress Batik Anak, Price: 150000
Index: 385, Name: Peci Kopiah Songkok Batik Kalimantan Varian Hijau, Price: 57000
Index: 386, Name: Peci Kopiah Songkok Batik Kalimantan Varian Merah, Price: 57000
Index: 387, Name: Peci Kopiah Songkok Batik Kalimantan Varian Hitam, Price: 57000
Index: 388, Name: Peci Kopiah Songkok Batik Kalimantan Varian Biru, Price: 57000
Index: 394, Name: Hijab Batik Syar'i 130*130 Ukiran Warna, Price: 135000
Index: 395, Name: Hijab Batik Ornamen Silver/Gold 110*110, Price: 150000
Index: 396, Name: Hijab Batik Ukuran Warna, Price: 125000
Index: 397, Name: Hijab Batik Borneo Ukiran Warna, Price: 125000
Index: 437, Name: Batik, Price: 250000
Enter the index of the product you want to select: 397
Here are the cheaper products similar to the one you selected:
Name     Hijab Batik Syar'i 130*130 Ukiran Warna
Price  