# Import Libraries

In [0]:
import math
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings("ignore", category=UserWarning)

# Read Data

In [0]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('dataset.csv',index_col=0)

In [0]:
# Order the rows by title ('judul') and renumber the index 0..n-1.
df = df.sort_values(by='judul', ignore_index=True)

In [4]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,judul,konten,kategori
0,Air Terjun 7 Bidadari yang Tersembunyi di Lahat,,travel
1,Bima dan Pantai Kolo,,travel
2,"Cha-Am, Tempat Anti Mainstream di Thailand",,travel
3,Obat Rindu ke Museum Maritim,"Pada masa lalu, bangsa Indonesia terkenal akan...",travel
4,Rasanya Kangen Touring Naik Motor ke Gunung M...,,travel


In [5]:
# (number of rows, number of columns)
df.shape

(1333, 3)

In [6]:
# Number of rows per category label.
df['kategori'].value_counts()

finance    796
travel     317
food       220
Name: kategori, dtype: int64

# Preprocessing

## Missing Value

In [7]:
df['kategori'].value_counts() # rows per category before dropping rows with missing values

finance    796
travel     317
food       220
Name: kategori, dtype: int64

In [8]:
# How many rows contain at least one missing value in any column.
df.isna().any(axis='columns').sum()

68

In [0]:
# Drop every row that has at least one missing value and renumber the index
# from 0. ``drop=True`` discards the old index directly instead of first
# inserting it as an 'index' column and then deleting that column again.
df = df.dropna(axis=0).reset_index(drop=True)

In [10]:
df['kategori'].value_counts() # rows per category after dropping rows with missing values

finance    771
travel     295
food       199
Name: kategori, dtype: int64

## Case Folding

In [0]:
# Case folding: lowercase every title.
df['judul'] = df['judul'].str.lower()

In [0]:
# Case folding: lowercase every article body.
df['konten'] = df['konten'].str.lower()

## Tokenization

In [0]:
# Keep only runs of word characters (this strips punctuation) and re-join the
# tokens with single spaces. The tokenizer is built once and reused for every
# document, and the work is mapped over the 'konten' column alone rather than
# over whole rows.
word_tokenizer = RegexpTokenizer(r'\w+')
df['konten'] = df['konten'].map(lambda text: ' '.join(word_tokenizer.tokenize(text)))

## Stopwords

In [14]:
# Fetch the NLTK 'stopwords' corpus, which stopwords.words(...) reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Indonesian stop words, de-duplicated and stored as a list.
# CountVectorizer's ``stop_words`` parameter is validated as a list (or the
# string 'english' / None); recent scikit-learn releases reject a set, so the
# set is only used for de-duplication before converting back to a list.
stop_words = sorted(set(stopwords.words('indonesian')))

In [16]:
# Number of distinct stop words.
len(stop_words)

757

## Split Data

In [0]:
# Positional split: the first ceil(70%) rows form the training set and the
# remaining rows form the test set. Rows are taken in their current order;
# nothing is shuffled here.
n_train = math.ceil(df.shape[0] * 0.7)
df_train = df.iloc[:n_train, :]
df_test = df.iloc[n_train:, :]

In [18]:
# (rows, columns) of the training split.
df_train.shape

(886, 3)

In [19]:
# (rows, columns) of the test split.
df_test.shape

(379, 3)

# Feature Extraction

Count Vectorizer

In [20]:
# Word-level bag-of-words counts, with the vocabulary learned from the
# training texts only and the stop words removed.
count_vect = CountVectorizer(
    analyzer='word',
    stop_words=stop_words,
)
X_train_counts = count_vect.fit_transform(raw_documents=df_train['konten'])
# (number of training documents, vocabulary size)
X_train_counts.shape

(886, 15400)

TF-IDF Transformer

In [21]:
# Re-weight the raw term counts with TF-IDF; the matrix keeps its shape.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)
X_train_tfidf.shape

(886, 15400)

# Modelling

## Naive Bayes

In [0]:
# Naive Bayes pipeline: word counts (stop words removed) fed straight into a
# multinomial Naive Bayes classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='word', stop_words=stop_words)),
    ('clf', MultinomialNB()),
])

In [0]:
# Train the Naive Bayes pipeline on the training texts and their categories.
text_clf = text_clf.fit(df_train['konten'], df_train['kategori'])

In [0]:
# Predict a category for every test text with the Naive Bayes pipeline.
pred = text_clf.predict(df_test['konten'])

In [49]:
# Fraction of test rows whose predicted category matches the true one.
accuracy_score(y_true=df_test['kategori'], y_pred=pred)

0.9313984168865436

## Linear SVM

In [0]:
# Linear SVM pipeline: word counts (stop words removed) -> TF-IDF weighting ->
# a linear model trained by SGD with hinge loss and L2 regularisation.
# random_state is fixed so repeated runs give the same model.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words=stop_words, analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
])

In [0]:
# Train the SVM pipeline on the training texts and their categories.
text_clf_svm = text_clf_svm.fit(df_train['konten'], df_train['kategori'])

In [0]:
# Predict a category for every test text with the SVM pipeline
# (this overwrites the earlier `pred`).
pred = text_clf_svm.predict(df_test['konten'])

In [29]:
# Fraction of test rows whose predicted category matches the true one.
accuracy_score(y_true=df_test['kategori'], y_pred=pred)

0.941952506596306

# Evaluation

## Naive Bayes

In [50]:
# Cross-validate the Naive Bayes pipeline on the training split (default CV
# settings) and report the mean accuracy with a two-standard-deviation spread.
scores = cross_val_score(estimator=text_clf, X=df_train['konten'], y=df_train['kategori'])
mean_accuracy = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_sigma))

Accuracy: 0.95 (+/- 0.03)


### Own Sentence Test

In [51]:
# Spot-check the Naive Bayes pipeline on a hand-written sentence about sightseeing in Denpasar.
text_clf.predict(['jalan jalan ke kota denpasar sangat menyenangkan karena kita dapat mengunjungi berbagai tempat wisata'])

array(['travel'], dtype='<U7')

In [52]:
# Spot-check the Naive Bayes pipeline on a hand-written sentence about the rupiah weakening as foreign investors withdraw.
text_clf.predict(['penurunan mata uang rupiah terjadi karena banyaknya investor asing yang menarik uangnya keluar indonesia'])

array(['finance'], dtype='<U7')

In [55]:
# Spot-check the Naive Bayes pipeline on a hand-written sentence about enjoying Yogyakarta street food and dishes.
text_clf.predict(['saya menikmati kuliner jajanan dan berbagai jenis masakan khas yogyakarta'])

array(['food'], dtype='<U7')

## Linear SVM

In [34]:
# Cross-validate the SVM pipeline on the training split (default CV settings)
# and report the mean accuracy with a two-standard-deviation spread.
scores = cross_val_score(estimator=text_clf_svm, X=df_train['konten'], y=df_train['kategori'])
mean_accuracy = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_sigma))

Accuracy: 0.94 (+/- 0.03)


### Own Sentence Test

In [35]:
# Spot-check the SVM pipeline on a hand-written sentence about sightseeing in Denpasar.
text_clf_svm.predict(['jalan jalan ke kota denpasar sangat menyenangkan karena kita dapat mengunjungi berbagai tempat wisata'])

array(['travel'], dtype='<U7')

In [36]:
# Spot-check the SVM pipeline on a hand-written sentence about the rupiah weakening as foreign investors withdraw.
text_clf_svm.predict(['penurunan mata uang rupiah terjadi karena banyaknya investor asing yang menarik uangnya keluar indonesia'])

array(['finance'], dtype='<U7')

In [37]:
# Spot-check the SVM pipeline on a hand-written sentence about enjoying Yogyakarta street food and dishes.
text_clf_svm.predict(['saya menikmati kuliner jajanan dan berbagai jenis masakan khas yogyakarta'])

array(['food'], dtype='<U7')