In [105]:
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine('sqlite:////home/azkaf/newspaper3k_library/berita_database.db',echo=False)

In [106]:
df = pd.read_sql_table('berita',engine, columns=['label','text'])

In [107]:
df.shape

(2293, 2)

In [108]:
df.label.value_counts()

wisata       304
teknologi    283
olahraga     267
otomotif     265
bisnis       259
kuliner      243
kesehatan    237
berita       228
sains        207
Name: label, dtype: int64

In [109]:
df.dropna(inplace=True)

In [110]:
print(df.shape)

(2226, 2)


In [111]:
print(df.label.value_counts())

wisata       304
teknologi    270
otomotif     265
olahraga     253
bisnis       246
kuliner      241
kesehatan    232
berita       211
sains        204
Name: label, dtype: int64


In [112]:
df.head()

Unnamed: 0,label,text
0,berita,"PATI, KOMPAS.com - Kepala Kepolisian Resor Pat..."
1,berita,"JAKARTA, KOMPAS.com - Perayaan Natal Bersama P..."
2,berita,"JAKARTA, KOMPAS.com - Ketua Majelis Permusyawa..."
3,berita,"SUMEDANG, KOMPAS.com - Jumat (28/12/2018) mala..."
4,berita,"JAKARTA, KOMPAS.com - Sebanyak 22 staf PT Mass..."


In [113]:
# convert label to numerical label
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
df['label_num'] = LE.fit_transform(df.label)

In [114]:
LE.classes_

array(['berita', 'bisnis', 'kesehatan', 'kuliner', 'olahraga', 'otomotif',
       'sains', 'teknologi', 'wisata'], dtype=object)

In [115]:
df.head()

Unnamed: 0,label,text,label_num
0,berita,"PATI, KOMPAS.com - Kepala Kepolisian Resor Pat...",0
1,berita,"JAKARTA, KOMPAS.com - Perayaan Natal Bersama P...",0
2,berita,"JAKARTA, KOMPAS.com - Ketua Majelis Permusyawa...",0
3,berita,"SUMEDANG, KOMPAS.com - Jumat (28/12/2018) mala...",0
4,berita,"JAKARTA, KOMPAS.com - Sebanyak 22 staf PT Mass...",0


### remove stopwords & stemming

In [116]:
# import Stemmer Factory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [136]:
print('stemming index : ')
for i in range(300,310):
    print(i,end=', ')
    df.loc[i,'text'] = stemmer.stem(df.loc[i,'text'])

stemming index : 
300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 

In [138]:
df.loc[305:309]

Unnamed: 0,label,text,label_num
305,berita,dongeng bisa rangsang anak untuk suka baca rep...,0
306,berita,darurat sehat rupa isu yang sangat strategis b...,0
307,berita,perlu ada baik sistem informasi bencana serta ...,0
308,berita,mereka tidak ingin tinggal rumah republika co ...,0
309,berita,korban tinggal belum 420 orang republika co id...,0


### split data

In [139]:
# define X and y for use with COUNTVECTORIZER or TfidVectorizer
X = df.text
y = df.label_num
print(X.shape)
print(y.shape)

(2226,)
(2226,)


In [140]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1669,)
(557,)
(1669,)
(557,)


# CountVectorizer

Dengan menggunakan metode CountVectorizer, akurasi yang didapat masing-masing model sebesar = 
    Naive-Bayes         : 85 persen,
    Logistic Regression : 84 persen,
    SVM                 : 76 persen

In [164]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [165]:
# learn training data vocabulary, then use it to create a document-term matrix
X_train_dtm = vect.fit_transform(X_train)

In [166]:
# examine the document-term matrix
X_train_dtm

<1669x30041 sparse matrix of type '<class 'numpy.int64'>'
	with 275983 stored elements in Compressed Sparse Row format>

In [167]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<557x30041 sparse matrix of type '<class 'numpy.int64'>'
	with 86570 stored elements in Compressed Sparse Row format>

## TfidVectorizer

Dengan menggunakan metode TfidVectorizer, akurasi yang didapat masing-masing model sebesar = 
    Naive-Bayes         : 77 persen,
    Logistic Regression : 83 persen,
    SVM                 : 85 persen

In [141]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
X_train_dtm = vect.fit_transform(X_train)
vect.get_feature_names()

['00',
 '000',
 '000015',
 '000dengan',
 '000mah',
 '001',
 '002',
 '005',
 '008',
 '009',
 '00m',
 '01',
 '010',
 '011',
 '012',
 '016',
 '017',
 '019',
 '01am',
 '02',
 '020',
 '021',
 '022',
 '025',
 '026',
 '027',
 '03',
 '032',
 '033',
 '034',
 '039',
 '04',
 '040',
 '041',
 '048',
 '05',
 '056',
 '06',
 '0601',
 '06172491',
 '062',
 '063',
 '064',
 '066',
 '068',
 '07',
 '072',
 '073',
 '079',
 '08',
 '080',
 '082',
 '083',
 '085372642544',
 '09',
 '091',
 '092',
 '093',
 '096',
 '099',
 '0jadwal',
 '0l',
 '0leicester',
 '0mm',
 '10',
 '100',
 '1000',
 '1000volt',
 '1002hpa',
 '1003',
 '1007',
 '100a',
 '100g',
 '101',
 '1011',
 '1012',
 '1013',
 '101jakarta',
 '102',
 '103',
 '1034',
 '104',
 '105',
 '106',
 '107',
 '10710',
 '108',
 '1080',
 '109',
 '10batang',
 '10mb',
 '10mbps',
 '10mp',
 '10ribu',
 '11',
 '110',
 '1106',
 '110cc',
 '110f',
 '110ribu',
 '111',
 '112',
 '113',
 '113peringkat',
 '114',
 '115',
 '116',
 '118',
 '118peringkat',
 '119',
 '11ac',
 '11th',
 '12',
 '

In [143]:
X_train_dtm

<1669x30041 sparse matrix of type '<class 'numpy.float64'>'
	with 275983 stored elements in Compressed Sparse Row format>

In [144]:
X_test_dtm = vect.transform(X_test)

In [145]:
X_test_dtm

<557x30041 sparse matrix of type '<class 'numpy.float64'>'
	with 86570 stored elements in Compressed Sparse Row format>

# naive-bayes classification

In [168]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [169]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

CPU times: user 85 ms, sys: 21.6 ms, total: 107 ms
Wall time: 136 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [170]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [171]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.8473967684021544

In [172]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[36,  5,  0,  0,  2,  1,  5,  0,  7],
       [ 1, 48,  0,  1,  0,  0,  4,  1,  2],
       [ 1,  1, 46,  3,  0,  0,  3,  0,  0],
       [ 0,  0,  1, 55,  0,  0,  1,  0,  4],
       [ 0,  0,  0,  0, 51,  1,  1,  0,  0],
       [ 0,  0,  0,  0,  1, 56,  1,  0,  1],
       [ 0,  0,  3,  0,  0,  0, 58,  0,  4],
       [ 1,  4,  0,  0,  0,  0,  9, 52,  2],
       [ 1,  1,  4,  5,  0,  0,  1,  2, 70]])

In [173]:
# print message text for the false 
X_test[(y_pred_class==8)&(y_test==0)]

720     SEMARANG – Malam pergantian tahun 2018 ke 2019...
1144    Gubernur NTB periode 2018-2023, Zulkieflimansy...
398     Gambar viral di media sosial yang mencantumkan...
305     dongeng bisa rangsang anak untuk suka baca rep...
733     Cuaca buruk yang melanda sejumlah daerah di Ja...
1007    Garut (ANTARA News) - Taman Wisata Alam Gunung...
400     TRIBUNNEWS.COM - Pengacara Hotman Paris Hutape...
Name: text, dtype: object

In [135]:
df.loc[720]

label                                                   berita
text         SEMARANG – Malam pergantian tahun 2018 ke 2019...
label_num                                                    0
Name: 720, dtype: object

# Logistic Regression

In [174]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')

In [175]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)



CPU times: user 8.88 s, sys: 43.6 ms, total: 8.92 s
Wall time: 9.59 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [176]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [177]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.8366247755834829

In [178]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[43,  5,  1,  1,  3,  1,  0,  1,  1],
       [ 3, 48,  2,  0,  0,  0,  2,  1,  1],
       [ 0,  0, 50,  1,  1,  0,  2,  0,  0],
       [ 1,  0,  1, 58,  0,  0,  1,  0,  0],
       [ 3,  0,  1,  0, 47,  2,  0,  0,  0],
       [ 1,  0,  0,  0,  1, 57,  0,  0,  0],
       [ 4,  0,  9,  0,  0,  0, 46,  5,  1],
       [ 3,  3,  1,  0,  0,  1,  2, 55,  3],
       [ 4,  1,  6,  4,  1,  2,  2,  2, 62]])

In [179]:
# print message text for the false 
X_test[(y_pred_class==1)&(y_test==0)]

1144    Gubernur NTB periode 2018-2023, Zulkieflimansy...
305     dongeng bisa rangsang anak untuk suka baca rep...
307     perlu ada baik sistem informasi bencana serta ...
1143    Gubernur Bali, Wayan Koster menyerahkan langsu...
712     JAKARTA – Polda Metro Jaya akan menambah jam o...
Name: text, dtype: object

# SVM

In [180]:
from sklearn import svm
from sklearn.metrics import accuracy_score
clf = svm.SVC(kernel='linear')

In [181]:
%time clf.fit(X_train_dtm, y_train)

CPU times: user 9.29 s, sys: 111 ms, total: 9.4 s
Wall time: 10.1 s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [182]:
print("Predicting...")
prediction = clf.predict(X_test_dtm)

Predicting...


In [183]:
print("Prediction:",prediction)
accuracy = accuracy_score(y_test, prediction)

Prediction: [4 8 6 2 8 1 8 1 8 2 3 3 8 6 8 7 2 4 6 6 4 0 4 4 5 0 8 6 8 8 6 0 7 6 4 4 3
 8 1 3 6 3 0 1 5 2 1 5 3 6 7 2 4 3 4 8 5 7 3 2 6 0 3 3 4 8 7 2 4 6 4 4 7 5
 3 2 8 4 2 3 5 0 3 0 8 3 6 2 3 1 0 8 0 8 5 1 7 5 1 0 8 3 4 2 4 2 3 0 0 6 2
 5 6 8 6 5 4 7 2 1 7 8 6 2 6 2 5 2 7 3 3 0 0 2 0 3 0 3 2 0 7 5 1 0 8 4 0 2
 6 6 3 4 8 7 3 5 7 4 4 6 6 3 7 3 1 2 4 0 5 5 6 3 1 5 7 5 8 8 8 1 6 1 7 0 0
 6 7 4 4 0 7 6 5 0 2 3 3 5 4 4 3 1 6 8 0 3 2 2 2 3 7 8 7 8 1 3 7 5 3 6 6 4
 4 8 1 6 6 5 0 0 5 0 8 5 8 0 3 5 0 4 6 2 6 2 8 7 8 2 0 0 6 7 0 5 1 5 1 5 8
 0 1 4 7 2 6 0 0 1 5 3 1 0 5 6 0 4 0 5 2 1 8 2 6 8 0 2 0 8 2 5 7 5 4 1 6 7
 2 2 3 1 3 2 5 6 3 4 6 8 8 1 1 5 8 0 1 6 2 0 0 7 7 0 1 8 2 1 1 5 1 5 3 4 7
 2 0 5 2 7 3 3 7 1 3 8 4 7 3 3 6 5 2 3 5 0 8 1 3 3 4 8 7 3 6 2 8 7 5 6 8 6
 3 4 1 2 1 3 8 6 0 5 0 2 2 6 7 2 8 3 5 4 6 0 7 5 2 0 4 8 8 5 5 1 5 7 6 4 0
 1 3 2 5 3 0 5 0 2 1 2 6 0 5 5 8 6 8 1 2 7 7 8 8 3 6 3 1 5 7 2 8 3 3 8 8 6
 1 1 4 7 3 4 0 0 0 5 4 2 8 2 0 7 8 8 5 3 0 6 8 8 7 5 5 0 1 7 2 6 4 2 6 1 2
 4 4 5 3 8 8 

In [184]:
metrics.accuracy_score(y_test, y_pred_class)
print('Accuracy:', accuracy)

Accuracy: 0.7612208258527827


## Testing the model

In [42]:
LE.classes_

array(['berita', 'bisnis', 'kesehatan', 'kuliner', 'olahraga', 'otomotif',
       'sains', 'teknologi', 'wisata'], dtype=object)

In [29]:
from requests_html import HTMLSession

session = HTMLSession()

In [43]:
url = 'https://sport.detik.com/sepakbola/liga-inggris/d-4365233/klopp-apa-bedanya-tandang-ke-city-musim-ini-dengan-musim-lalu'
r = session.get(url)

In [44]:
text = r.html.find('div#detikdetailtext',first=True).text

In [45]:
text

'Jakarta - Musim lalu Liverpool dihajar Manchester City di Etihad Stadium. Apakah kali ini situasinya berbeda dengan Liverpool kukuh di puncak klasemen dan belum kalah?\n\nLiverpool akan melawat ke Etihad, Jumat (4/1/2019) dinihari WIB. Laga ini jadi kesempatan besar \'Si Merah\' untuk membuka jarak lebih lebar dengan City di klasemen.\n\n\n\nBaca juga: Tak Ada Waktu untuk Liverpool Besar Kepala\n\n\nAnak-anak Merseyside saat ini memuncaki Premier League dengan 54 poin dari 20 laga, tujuh poin dari City di posisi dua.\n\nPertemuan yang sama musim lalu menghasilkan kekalahan telak untuk Liverpool. Skuat besutan Juergen Klopp dihajar 0-5 oleh City, di mana kala itu Liverpool kehilangan Sadio Mane karena kartu merah di pertengahan babak kedua.\n\nJuergen Klopp pun ditanya apakah ada yang berbeda dengan pertandingan kali ini, mengingat timnya datang dengan catatan mengesankan. Liverpool seperti diketahui belum terkalahkan di 20 laga (17 menang, 3 imbang) dan bisa tampil lebih lepas karena 

In [46]:
test1 = [text]
new_article_vect = vect.transform(test1)
nb.predict(new_article_vect)

array([4])

# saving to csv file

In [185]:
df.to_csv('stemmed_berita_text.csv')