In [1]:
# Mount Google Drive at /content/drive inside the Colab runtime; the dataset
# CSV read later in this notebook lives under that mount point.
# NOTE(review): `google.colab` is only importable on Colab - this cell will
# fail in a plain Jupyter environment.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

# Below libraries are for text processing using NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [3]:
# Load the article dataset from the mounted Google Drive folder.
df = pd.read_csv(
    '/content/drive/MyDrive/BISA AI/Techspace/TeachSpace_FinalDataset.csv'
)

In [4]:
# Drop the leftover 'Unnamed: 0.1' column (presumably an index artefact from an
# earlier CSV export - confirm against the data source).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0.1'], errors='ignore')

In [5]:
# Preview the first five rows of the loaded dataset.
df.head(5)

Unnamed: 0.1,Unnamed: 0,judul,isi,tanggal,penulis,kategori,photo
0,0,Apa Itu Flowchart di Microsoft Word dan Bagaim...,Flowchart adalah diagram yang berfungsi untuk ...,2022-02-01,Soffya Ranti,Apps & OS,https://asset.kompas.com/crops/fcZysSqNqNz5nQs...
1,1,Cara Membayar Pajak Motor dan Mobil Online lew...,Membayar pajak motor atau mobil kini semakin m...,2022-02-01,Soffya Ranti,Apps & OS,https://asset.kompas.com/crops/hwMsAe0LBU6Gvw0...
2,2,Begini Cara Bayar Listrik Prabayar dan Pascaba...,BCA Mobile atau disingkat mBCA merupakan aplik...,2022-02-01,Zulfikar Hardiansyah,e-Business,https://asset.kompas.com/crops/NK49Dn6rTbIfvJe...
3,3,Cara Daftar Grab Bike Online dan Link Pendafta...,Menjadi mitra pengemudi Grab Bike bisa menjadi...,2022-02-01,Soffya Ranti,e-Business,https://asset.kompas.com/crops/Mh0olD9SsfYKrm-...
4,4,Cara Memperbesar Ukuran File PDF Halaman all,"Dalam beberapa hal tertentu, bisa saja membutu...",2022-02-01,Soffya Ranti,Apps & OS,https://asset.kompas.com/crops/Q69W4WdRMzj5Cdf...


In [None]:
# Summary counts: distinct article titles, distinct categories, and the
# category labels themselves. dropna=False mirrors len(unique()), which
# counts a missing value as one distinct entry.
print('Banyak data article: ', df['judul'].nunique(dropna=False))
print('Banyak kategori: ', df['kategori'].nunique(dropna=False))
print('Jenis kategori: ', df['kategori'].unique())

Banyak data article:  4197
Banyak kategori:  9
Jenis kategori:  ['Apps & OS' 'e-Business' 'Gadget' 'Internet' 'Hardware' 'Telco' 'Games'
 'Elektronik' 'Advertorial Tekno']


In [None]:
# Count articles per category once and reuse the result for both bar axes
# (the counts were previously computed twice).
category_counts = df["kategori"].value_counts()

fig = go.Figure([go.Bar(x=category_counts.index, y=category_counts.values)])

# Apply title, axis labels and figure size in a single layout update.
fig.update_layout(
    title={
        "text": 'Distribution of articles category-wise',
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title="Category name",
    yaxis_title="Number of articles",
    width=700,
    height=500,
)
fig

# Content Based Filtering

In [None]:
# Remove duplicate articles, identified by title ('judul'). The first
# occurrence of each title is kept and the original row order is preserved.
# The earlier `preparation.sort_values('judul')` call was removed: its result
# was never assigned, so it only sorted a throwaway copy and had no effect on
# `preparation`. Dropping it leaves the resulting frame unchanged.
preparation = df.drop_duplicates('judul')
preparation

Unnamed: 0.1,Unnamed: 0,judul,isi,tanggal,penulis,kategori,photo
0,0,Apa Itu Flowchart di Microsoft Word dan Bagaim...,Flowchart adalah diagram yang berfungsi untuk ...,2022-02-01,Soffya Ranti,Apps & OS,https://asset.kompas.com/crops/fcZysSqNqNz5nQs...
1,1,Cara Membayar Pajak Motor dan Mobil Online lew...,Membayar pajak motor atau mobil kini semakin m...,2022-02-01,Soffya Ranti,Apps & OS,https://asset.kompas.com/crops/hwMsAe0LBU6Gvw0...
2,2,Begini Cara Bayar Listrik Prabayar dan Pascaba...,BCA Mobile atau disingkat mBCA merupakan aplik...,2022-02-01,Zulfikar Hardiansyah,e-Business,https://asset.kompas.com/crops/NK49Dn6rTbIfvJe...
3,3,Cara Daftar Grab Bike Online dan Link Pendafta...,Menjadi mitra pengemudi Grab Bike bisa menjadi...,2022-02-01,Soffya Ranti,e-Business,https://asset.kompas.com/crops/Mh0olD9SsfYKrm-...
4,4,Cara Memperbesar Ukuran File PDF Halaman all,"Dalam beberapa hal tertentu, bisa saja membutu...",2022-02-01,Soffya Ranti,Apps & OS,https://asset.kompas.com/crops/Q69W4WdRMzj5Cdf...
...,...,...,...,...,...,...,...
4193,4193,Apple Dilaporkan Bikin Fitur Berbayar Model NFC,CUPERTINO Apple dilaporkan sedang membangun...,2022-01-31,Intan Rakhmayanti Dewi,Telco,https://pict-c.sindonews.net/dyn/620/pena/news...
4194,4194,Waspada! Jangan Lagi Posting Anak Pakai Seraga...,SYDNEY Memposting foto anak dengan seragam se...,2022-01-31,Yudi Setyowibowo,Telco,https://pict-c.sindonews.net/dyn/620/pena/news...
4195,4195,Cara Memperbarui WhatsApp ke Versi Terbaru 2022,JAKARTA Cara memperbarui WhatsApp ke versi t...,2022-01-31,Intan Rakhmayanti Dewi,Telco,https://pict-c.sindonews.net/dyn/620/pena/news...
4196,4196,Terapkan 7 Kebiasaan ini untuk Proteksi Penggu...,"JAKARTA Di era digital seperti saat ini, ana...",2022-01-31,Intan Rakhmayanti Dewi,Telco,https://pict-c.sindonews.net/dyn/620/pena/news...


In [None]:
# Extract the title, body, category and publication-date columns of
# `preparation` as plain Python lists.
judul, isi, kategori, tanggal = (
    preparation[column].tolist()
    for column in ('judul', 'isi', 'kategori', 'tanggal')
)

# Sanity check: print the length of each list, one per line.
print(*map(len, (judul, isi, kategori, tanggal)), sep='\n')

4197
4197
4197
4197


In [None]:
# Assemble the working article table from the four column lists
# (title, body, category, publication date); the index is a fresh 0..n-1 range.
article_new = pd.DataFrame(
    dict(judul=judul, isi=isi, kategori=kategori, tanggal=tanggal)
)
article_new

Unnamed: 0,judul,isi,kategori,tanggal
0,Apa Itu Flowchart di Microsoft Word dan Bagaim...,Flowchart adalah diagram yang berfungsi untuk ...,Apps & OS,2022-02-01
1,Cara Membayar Pajak Motor dan Mobil Online lew...,Membayar pajak motor atau mobil kini semakin m...,Apps & OS,2022-02-01
2,Begini Cara Bayar Listrik Prabayar dan Pascaba...,BCA Mobile atau disingkat mBCA merupakan aplik...,e-Business,2022-02-01
3,Cara Daftar Grab Bike Online dan Link Pendafta...,Menjadi mitra pengemudi Grab Bike bisa menjadi...,e-Business,2022-02-01
4,Cara Memperbesar Ukuran File PDF Halaman all,"Dalam beberapa hal tertentu, bisa saja membutu...",Apps & OS,2022-02-01
...,...,...,...,...
4192,Apple Dilaporkan Bikin Fitur Berbayar Model NFC,CUPERTINO Apple dilaporkan sedang membangun...,Telco,2022-01-31
4193,Waspada! Jangan Lagi Posting Anak Pakai Seraga...,SYDNEY Memposting foto anak dengan seragam se...,Telco,2022-01-31
4194,Cara Memperbarui WhatsApp ke Versi Terbaru 2022,JAKARTA Cara memperbarui WhatsApp ke versi t...,Telco,2022-01-31
4195,Terapkan 7 Kebiasaan ini untuk Proteksi Penggu...,"JAKARTA Di era digital seperti saat ini, ana...",Telco,2022-01-31


In [None]:
# Fit a TF-IDF model on the article titles and transform them into a sparse
# feature matrix (one row per title).
# TfidfVectorizer is already imported in the notebook's import cell, so it is
# not re-imported here.
# min_df=1 (the library default) keeps every term that occurs in at least one
# title - the same vocabulary the old `min_df = 0` produced - but, unlike the
# integer 0, it is accepted by scikit-learn's parameter validation in recent
# releases.
tfidf_headline_vectorizer = TfidfVectorizer(min_df=1)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(article_new['judul'])

In [None]:
def tfidf_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to one article in TF-IDF space.

    Distances are computed with ``pairwise_distances`` using its default
    metric (Euclidean, per the scikit-learn documentation) between every row
    of ``tfidf_headline_features`` and the queried row. Smaller values mean
    more similar headlines; the output column keeps its historical
    "Euclidean similarity ..." label for backward compatibility even though
    it holds a distance.

    Parameters
    ----------
    row_index : int
        Position of the queried article in ``article_new`` /
        ``tfidf_headline_features``.
    num_similar_items : int
        Size of the neighbourhood *including* the queried article itself, so
        ``num_similar_items - 1`` recommendations are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``publish_date``, ``headline``,
        ``Euclidean similarity with the queried article`` and ``Kategori``;
        nearest first, indexed from 1.
    """
    couple_dist = pairwise_distances(
        tfidf_headline_features, tfidf_headline_features[row_index]
    ).ravel()

    # Resolve a possibly negative index to its non-negative position so it can
    # be compared against the argsort output below.
    query_pos = int(row_index) % len(couple_dist)

    # Rank all articles by distance, then remove the query explicitly instead
    # of assuming it sorts first: another headline with an identical TF-IDF
    # vector also has distance 0 and could otherwise displace it, causing the
    # wrong "queried" headline to be printed and the query itself to be
    # returned as a recommendation. The stable sort keeps ties deterministic.
    ranked = np.argsort(couple_dist, kind='stable')
    neighbours = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]

    # Positional (.iloc) access matches the row order of the feature matrix.
    recommendations = pd.DataFrame(
        {
            'publish_date': article_new['tanggal'].iloc[neighbours].values,
            'headline': article_new['judul'].iloc[neighbours].values,
            'Euclidean similarity with the queried article': couple_dist[neighbours],
            'Kategori': article_new['kategori'].iloc[neighbours].values,
        },
        # Index starts at 1, as before (row 0 used to be the query itself).
        index=pd.RangeIndex(1, len(neighbours) + 1),
    )

    print("="*30,"Queried article details","="*30)
    print('headline : ',article_new['judul'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)

    return recommendations
# Example: recommendations for the article at position 132. The count of 11
# includes the queried article itself, so 10 recommendations are displayed.
tfidf_based_model(132, 11)

headline :  Huawei P50 Pro Resmi di Indonesia, Ponsel Snapdragon 888 4G Harga Rp 15 Juta Halaman all



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article,Kategori
1,2021-11-08,"Unboxing Xiaomi 11T Pro, Ponsel ""Snapdragon 88...",0.915402,Gadget
2,2021-10-07,"Vivo X70 Pro Resmi di Indonesia, Harga Rp 10,9...",1.075834,Gadget
3,2022-03-24,"Oppo Reno 7 4G Meluncur di Indonesia, Harga Rp...",1.081107,Gadget
4,2022-02-12,Huawei P50 Pro Masuk Indonesia Tanpa 5G dan La...,1.114257,Gadget
5,2022-02-02,Mengenal Teknologi Kamera DualMatrix di Huawei...,1.136711,Gadget
6,2022-03-31,"Samsung Galaxy A33 5G Resmi di Indonesia, Harg...",1.146966,Gadget
7,2022-03-01,Asus 8z Resmi Meluncur dengan Chip Snapdragon ...,1.148996,Gadget
8,2022-01-21,"Samsung Galaxy Tab A8 Resmi di Indonesia, Harg...",1.149941,Gadget
9,2021-12-13,Huawei Watch GT Runner Bisa Dipesan di Indones...,1.165633,Gadget
10,2021-12-24,"Ponsel Lipat Huawei P50 Pocket Dirilis, Ini Sp...",1.177102,Gadget


In [None]:
# Inspect the TF-IDF matrix dimensions: one row per title passed to
# fit_transform; columns are presumably the learned vocabulary terms.
tfidf_headline_features.shape

(4197, 6343)

In [None]:
# Compute the pairwise cosine similarity between all rows of the headline
# TF-IDF matrix (cosine_similarity is already imported in the import cell,
# so it is not re-imported here).
cosine_sim = cosine_similarity(tfidf_headline_features)
cosine_sim

array([[1.        , 0.05786172, 0.05419989, ..., 0.02929839, 0.        ,
        0.        ],
       [0.05786172, 1.        , 0.05329399, ..., 0.0288087 , 0.        ,
        0.        ],
       [0.05419989, 0.05329399, 1.        , ..., 0.02698551, 0.        ,
        0.        ],
       ...,
       [0.02929839, 0.0288087 , 0.02698551, ..., 1.        , 0.        ,
        0.0696102 ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.0696102 , 0.        ,
        1.        ]])

In [None]:
# Wrap the similarity matrix in a DataFrame labelled by article title on both
# the rows and the columns.
cosine_sim_df = pd.DataFrame(cosine_sim, index=article_new['judul'], columns=article_new['judul'])
print('Shape:', cosine_sim_df.shape)

# Peek at a random 10-row x 20-column slice of the similarity matrix.
# Fixed seeds make the preview reproducible across kernel restarts.
cosine_sim_df.sample(20, axis=1, random_state=42).sample(10, axis=0, random_state=42)

Shape: (4197, 4197)


judul,PANDI Luncurkan Website Khusus untuk Para Atlet Indonesia,Cara Download Sertifikat Vaksin Booster dengan Mudah,Semarakkan Lebaranmu dengan OPPO A Series,"Kesan Pertama Menjajal Samsung S22 Ultra, Kini dengan Stylus SPen Bawaan Halaman all","Hari Bumi 2022, Google Doodle Peringati Dampak Mengerikan Perubahan Iklim","Video: Review Samsung Galaxy S22 Plus Harga Rp 15 Jutaan, Semenarik Apa? Halaman all",YouTube Music Punya Fitur Rekap Tahunan Serupa Spotify Wrapped Halaman all,Penjara di New Mexico Lumpuh Total Diserang Ransomware,"Mengenal Cara Kerja Wireless Charging, Ternyata Ada Sejak 100 Tahun Lalu","Saingi TikTok, Pengguna Shorts Kini Bisa Comot Video dari YouTube","Penyebab Chat WhatsApp Centang Satu Terus, Begini Cara Mengatasinya!","Vivo Y15s Resmi Meluncur, Ponsel Android Go Harga Rp 1 Jutaan Halaman all","Jual Foto Selfie di NFT, Ghozali Sukses Kantongi Rp13 Miliar dalam Waktu Singkat","Google Assistant Kini Bisa Dibungkam Kalau Lagi ""Ngoceh"" Halaman all","Sejarah Singkat Simbol ""@"", Apa Artinya? Halaman all",Ponsel 5G Semakin Diminati Masyarakat Indonesia Halaman all,"Duo Flagship Xiaomi 12 Series Dijual Mulai Rp10 Juta, Apa Keunggulannya?",Sejumlah Pengguna Twitter Indonesia Keluhkan Munculnya Notifikasi Konten Vulgar Halaman all,Layanan Digital Telkomsel Dipamerkan di Dubai Expo 2020 Halaman all,"MPL ID Season 9 Dimulai 18 Februari, Ini Daftar Tim yang Bertanding Halaman all"
judul,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Cara Mengunci Tulisan di Word agar Tidak Bisa Dicopas Orang Lain Halaman all,0.0,0.023765,0.0,0.013009,0.0,0.014386,0.013059,0.013737,0.017377,0.038995,0.019923,0.015682,0.012132,0.052169,0.017051,0.019032,0.0,0.013223,0.029818,0.014157
Kominfo Pastikan 5G Telkomsel dan XL Axiata Hadir di MotoGP Mandalika Halaman all,0.0,0.0,0.0,0.013635,0.0,0.015078,0.013688,0.014398,0.0,0.0,0.0,0.016437,0.012716,0.014759,0.017872,0.082689,0.0,0.013859,0.097969,0.014839
"Oppo A53 Varian 4/128 GB Resmi Masuk di Indonesia, Ini Harganya Halaman all",0.035997,0.0,0.069544,0.013761,0.0,0.015218,0.013814,0.014531,0.0,0.0,0.0,0.079445,0.012834,0.014895,0.018037,0.064459,0.0,0.044783,0.031543,0.042245
Pendapatan YouTube Lampaui Netflix berkat Iklan Halaman all,0.0,0.0,0.0,0.014411,0.0,0.015937,0.092658,0.0,0.0,0.08632,0.0,0.017373,0.0,0.015599,0.018889,0.021084,0.0,0.014648,0.016218,0.015683
"Warganet Indonesia Serbu OpenSea, Jual NFT Foto KTP hingga Makanan Rp 3,8 Miliar Halaman all",0.030272,0.0,0.0,0.011573,0.0,0.062274,0.011617,0.0,0.0,0.0,0.0,0.067886,0.245824,0.012526,0.015169,0.054207,0.0,0.03766,0.013024,0.012594
Sutradara Andri Cung Buka Rahasia Bikin Web Series lewat Smartphone,0.0,0.0,0.101331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072625,0.0,0.0,0.0
"Spesifikasi Xiaomi Watch S1 dan Watch S1 Active, Smartwatch Rp 2 Jutaan Halaman all",0.0,0.0,0.0,0.010027,0.0,0.124522,0.010065,0.0,0.0,0.0,0.0,0.135742,0.0,0.010853,0.013142,0.014669,0.044124,0.010191,0.011284,0.010912
Facebook Luncurkan Fitur Belanja di Grup dan Live Shopping Halaman all,0.097293,0.0,0.0,0.015151,0.0,0.016754,0.065143,0.015998,0.0,0.0,0.0,0.018264,0.01413,0.016399,0.019859,0.022166,0.0,0.0154,0.034728,0.016488
"4 Cara Pembayaran UTBKSBMPTN 2022 lewat Bank Mandiri, BTN, BRI, BNI Halaman all",0.0,0.021318,0.0,0.011669,0.02754,0.012904,0.011714,0.0,0.015587,0.0,0.017872,0.014067,0.0,0.012631,0.015295,0.017072,0.0,0.011861,0.013133,0.012699
"Diprotes Warga Tangerang, Google Tinjau Ulang Prosedur Pemetaan Street View Halaman all",0.0,0.0,0.0,0.010169,0.030174,0.011245,0.010208,0.0,0.0,0.0,0.0,0.012258,0.0,0.046054,0.013328,0.014877,0.0,0.010336,0.011444,0.011066


In [None]:
def articles_recommendations(judul, similarity_data=cosine_sim_df, items=article_new[['judul', 'kategori']], k=10):
    """Return the ``k`` articles whose titles are most similar to ``judul``.

    Parameters
    ----------
    judul : str
        Title of the queried article; must be a column label of
        ``similarity_data``.
    similarity_data : pandas.DataFrame
        Square similarity matrix whose rows and columns follow the same
        article order, with titles as column labels.
    items : pandas.DataFrame
        Article metadata merged onto the recommended titles.
    k : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``k`` rows of ``items``, most similar first, excluding the
        queried title itself.

    Notes
    -----
    The default values of ``similarity_data`` and ``items`` are evaluated once,
    when this cell runs; re-run the cell after recomputing ``cosine_sim_df``
    or ``article_new`` so the defaults are not stale.
    """
    scores = similarity_data.loc[:, judul].to_numpy()

    # The queried title is normally its own best match, so k + 1 candidates
    # are needed to have k left after it is dropped. Clamp to the number of
    # available items so a large k does not raise.
    top_k = max(k, 0)
    n_candidates = min(top_k + 1, len(scores))

    # Stable descending sort of the similarity scores. The previous
    # argpartition call only placed the top k-1 positions in order while the
    # code read the top k+1, so the last two recommendations were not
    # guaranteed to be the true next-best matches.
    order = (-scores).argsort(kind='stable')[:n_candidates]
    closest = similarity_data.columns[order]

    # Remove the queried title so it does not recommend itself.
    closest = closest.drop(judul, errors='ignore')

    return pd.DataFrame(closest).merge(items).head(top_k)

In [None]:
# Look up the row of article_new whose title matches this headline exactly.
article_new.loc[article_new['judul'] == 'Diprotes Warga Tangerang, Google Tinjau Ulang Prosedur Pemetaan Street View Halaman all']

Unnamed: 0,judul,isi,kategori,tanggal
3403,"Diprotes Warga Tangerang, Google Tinjau Ulang ...",Pihak Google Indonesia mengungkapkan akan mela...,Internet,2021-10-28


In [None]:
# Recommend articles for one headline using the function's default settings
# (k=10), ranked by similarity of the titles - not by category.
data_rekomendasi = articles_recommendations(
    'Saingi TikTok, Pengguna Shorts Kini Bisa Comot Video dari YouTube'
)
data_rekomendasi

Unnamed: 0,judul,kategori
0,"Contek YouTube, Pengguna TikTok Kini Bisa Disl...",Telco
1,Kirim Video YouTube di Snapchat Kini Bisa Paka...,Telco
2,"Instagram Lakukan Perombakan Fitur Video, Sain...",Telco
3,"2 Tahun, Jumlah Tayangan YouTube Shorts Tembus...",Telco
4,"Durasi Video TikTok Diperpanjang, Kini Bisa sa...",Apps & OS
5,YouTube dan TikTok Paling Banyak Kumpulkan Dat...,Telco
6,Cara Upload Video ke YouTube dari Laptop dan H...,Apps & OS
7,Cara Menyimpan Video dari Youtube ke Galeri,Telco
8,Pengguna TikTok di Rusia Tidak Lagi Bisa Lives...,Telco
9,"Makin Populer, YouTube Shorts Capai 5 Triliun ...",Telco
