## Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Dataset

In [3]:
data = pd.read_csv('data_ulasan.csv')
data.head()

Unnamed: 0,Ulasan,Label
0,Pesanan gx sesuai dgn yg d gambar,0
1,Pengiriman dan respon super cepat. Tpi barang ...,0
2,barang yg dikirim tidak sesuai pesanan,0
3,"Php, bilang isi ada diseskripsi pas dipesen be...",0
4,barang jelek copot&#34; terus tmpat taro isinya,0


## Preprocessing

Case Folding

In [4]:
import re
def casefolding(Ulasan):
  Ulasan = Ulasan.lower()
  Ulasan = Ulasan.strip()
  Ulasan = re.sub(r'[?|$|.|!|#|&_:)(-+,0-9;]','', Ulasan)
  return Ulasan

In [5]:
# membandingkan before dan after case folding

raw_sample = data['Ulasan'].iloc[0]
case_folding = casefolding(raw_sample)

print('Raw data\t : ',raw_sample)
print('Case Folding\t : ',case_folding)

Raw data	 :  Pesanan gx sesuai dgn yg d gambar
Case Folding	 :  pesanan gx sesuai dgn yg d gambar


Normalisasi

In [6]:
norm = {'tpi' : 'tapi','sdh' : 'sudah', "tmpat" : "tempat", "yg" : "yang", " gue ": " saya ", "bgmn ":'bagaimana', ' tdk':' tidak ', ' blum ':' belum ', 'mantaaaaaaaappp':' bagus ', ' josss ':' bagus ', ' thanks ': ' terima kasih ', 'fast':' cepat ', 'trims':' terima kasih ', 'brg':' barang ', 'gx':' tidak ', ' dgn ':' dengan ', ' recommended':' rekomen ', 'recomend':' rekomen ', 'good':' bagus '}

def normalisasi(str_text):
  for i in norm:
    str_text = str_text.replace(i, norm[i])
  return str_text

In [7]:
# before after normalisasi

raw_sample = data['Ulasan'].iloc[200]
word_normal = normalisasi(case_folding)

print('Raw Data\t : ', raw_sample)
print('Word Normalize\t : ', word_normal)

Raw Data	 :  Barang sesuai dengan pesanan
Word Normalize	 :  pesanan  tidak  sesuai dengan yang d gambar


Stopword

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

filtering = stopwords.words('indonesian')

In [9]:
def remove_stop_word(Ulasan):
  clean_words = []
  Ulasan = Ulasan.split()
  for word in Ulasan:
    if word not in filtering:
      clean_words.append(word)
  return " ".join(clean_words)

In [10]:
# before after stopword

raw_sample = data['Ulasan'].iloc[200]
case_folding = casefolding(raw_sample)
stopword_removal = (case_folding)

print('Raw Data\t : ', raw_sample)
print('Case Folding\t : ',case_folding)
print('Stopword Removal\t : ', stopword_removal)


Raw Data	 :  Barang sesuai dengan pesanan
Case Folding	 :  barang sesuai dengan pesanan
Stopword Removal	 :  barang sesuai dengan pesanan


Stemming

In [11]:
!pip -q install Sastrawi

In [12]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(Ulasan):
  Ulasan = stemmer.stem(Ulasan)
  return Ulasan

In [13]:
raw_sample = data['Ulasan'].iloc[200]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopword_removal)

print('Raw Data\t : ', raw_sample)
print('Case Folding\t : ',case_folding)
print('Stopword Removal\t : ', stopword_removal)
print('Stemming\t : ', text_stemming)

Raw Data	 :  Barang sesuai dengan pesanan
Case Folding	 :  barang sesuai dengan pesanan
Stopword Removal	 :  barang sesuai pesanan
Stemming	 :  barang sesuai pesan


## Text Preprocessing Pipeline

In [14]:
def text_preprocessing_process(Ulasan):
    Ulasan = casefolding(Ulasan)
    Ulasan = normalisasi(Ulasan)
    Ulasan = remove_stop_word(Ulasan)
    Ulasan = stemming(Ulasan)
    return Ulasan

In [15]:
%%time
data['clean_teks'] = data['Ulasan'].apply(text_preprocessing_process)

CPU times: total: 1min 10s
Wall time: 3min 54s


In [16]:
data

Unnamed: 0,Ulasan,Label,clean_teks
0,Pesanan gx sesuai dgn yg d gambar,0,pesan sesuai d gambar
1,Pengiriman dan respon super cepat. Tpi barang ...,0,kirim respon super cepat barang sesuai gambar ...
2,barang yg dikirim tidak sesuai pesanan,0,barang kirim sesuai pesan
3,"Php, bilang isi ada diseskripsi pas dipesen be...",0,php bilang isi skripsi pas sen beda mubajir rp...
4,barang jelek copot&#34; terus tmpat taro isinya,0,barang jelek copot taro isi
...,...,...,...
1879,Terima kasih. ............,1,terima kasih
1880,Bagus dan memuaskan. Packing bagus cepat,1,bagus muas packing bagus cepat
1881,Mantap pengiriman ok cepat recomended,1,mantap kirim ok cepat rekomen ed
1882,Pengiriman cepat. Lumayan,1,kirim cepat lumayan


In [17]:
# simpan data setelah di preprocessing
data.to_csv('clean_data.csv')

## Feature Engineering

In [18]:
# pisahkan kolom feature dan target
x = data['clean_teks']
y = data['Label']

In [19]:
x

0                                   pesan sesuai d gambar
1       kirim respon super cepat barang sesuai gambar ...
2                               barang kirim sesuai pesan
3       php bilang isi skripsi pas sen beda mubajir rp...
4                             barang jelek copot taro isi
                              ...                        
1879                                         terima kasih
1880                       bagus muas packing bagus cepat
1881                     mantap kirim ok cepat rekomen ed
1882                                  kirim cepat lumayan
1883                             bom coba moga oke barang
Name: clean_teks, Length: 1884, dtype: object

In [20]:
y

0       0
1       0
2       0
3       0
4       0
       ..
1879    1
1880    1
1881    1
1882    1
1883    1
Name: Label, Length: 1884, dtype: int64

Feature Extraction (TF-IDF dan N-Gram)

In [21]:
# save model
import pickle

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF.transform

pickle.dump(vec_TF_IDF.vocabulary_,open("feature_tf-idf.sav", "wb"))

In [23]:
# menampilkan vocabulary dari tf-idf
vec_TF_IDF.vocabulary_

{'pesan': 1883,
 'sesuai': 2252,
 'gambar': 813,
 'kirim': 1206,
 'respon': 2075,
 'super': 2428,
 'cepat': 454,
 'barang': 194,
 'kuat': 1276,
 'tembus': 2500,
 'jok': 1074,
 'motor': 1565,
 'php': 1899,
 'bilang': 292,
 'isi': 1014,
 'skripsi': 2315,
 'pas': 1824,
 'sen': 2231,
 'beda': 244,
 'mubajir': 1577,
 'rp': 2111,
 'ga': 800,
 'cocok': 497,
 'jelek': 1053,
 'copot': 516,
 'taro': 2474,
 'beli': 251,
 'staples': 2388,
 'gak': 810,
 'jual': 1081,
 'pake': 1801,
 'ckckck': 481,
 'ngirim': 1654,
 'barangdiperiksa': 196,
 'gan': 821,
 'kondisi': 1243,
 'layak': 1332,
 'gitu': 867,
 'loh': 1382,
 'karat': 1124,
 'produk': 1957,
 'order': 1764,
 'prohex': 1961,
 'terima': 2516,
 'masang': 1474,
 'cacat': 410,
 'sblum': 2188,
 'coba': 492,
 'kecewa': 1147,
 'dipakey': 655,
 'isiya': 1015,
 'kluar': 1220,
 'buah': 373,
 'papan': 1815,
 'nggak': 1652,
 'nempel': 1622,
 'bagus': 127,
 'pakai': 1798,
 'pos': 1936,
 'hasil': 936,
 'rapat': 1999,
 'bidang': 287,
 'cacathadeeuh': 412,
 'sta

In [24]:
# melihat jumlah feature
print(len(vec_TF_IDF.get_feature_names_out()))

2773


In [25]:
# melihat fitur apa saja yang ada di dalam corpus
print(vec_TF_IDF.get_feature_names_out())

['abai' 'abieasssssss' 'abiiss' ... 'zipper' 'zoonk' 'zzzzzz']


In [26]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1, columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,abai,abieasssssss,abiiss,abis,abisbukan,abu,acc,acer,action,acu,...,yoiyoiyoiyoiyoiyoiyoyiyoi,you,youuuuuuuuu,yujuan,ywdh,zen,zipp,zipper,zoonk,zzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Feature Selection

In [27]:
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [28]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k=2000)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# untuk reduced features
print('Original Feature Number', x_train.shape[1])
print('Reduce Feature Number', x_kbest_features.shape[1])

Original Feature Number 2773
Reduce Feature Number 2000


In [29]:
Data = pd.DataFrame(chi2_features.scores_,columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,0.301001
1,0.614979
2,0.586327
3,2.027129
4,0.500851
...,...
2768,0.335482
2769,0.964546
2770,0.605671
2771,0.463694


In [30]:
# menampilkan data feature beserta nilainya
feature = vec_TF_IDF.get_feature_names_out()
feature

Data['Fitur'] = feature
Data

Unnamed: 0,Nilai,Fitur
0,0.301001,abai
1,0.614979,abieasssssss
2,0.586327,abiiss
3,2.027129,abis
4,0.500851,abisbukan
...,...,...
2768,0.335482,zen
2769,0.964546,zipp
2770,0.605671,zipper
2771,0.463694,zoonk


In [31]:
# mengurutkan nilai feature terbaik
Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Fitur
454,4.756042e+01,cepat
2516,3.499750e+01,terima
127,2.766197e+01,bagus
1449,2.746733e+01,mantap
1147,2.356187e+01,kecewa
...,...,...
1846,4.378813e-06,peking
788,3.836849e-06,for
1331,1.855439e-06,laris
306,4.238350e-07,blm


In [32]:
mask = chi2_features.get_support()
mask

array([False,  True,  True, ...,  True,  True,  True])

In [33]:
# menampilkan fitur yang terpilih berdasarkan nilai mask atau nilai tertinggi yang sudah ditetapkan pada chi square

new_feature=[]
for bool, f in zip(mask, feature):
    if bool :
        new_feature.append(f)
    selected_feature=new_feature
selected_feature

['abieasssssss',
 'abiiss',
 'abis',
 'abisbukan',
 'abu',
 'acc',
 'acer',
 'ad',
 'ada',
 'adabarang',
 'adahargaadakualitas',
 'adaptor',
 'aduuh',
 'aesuai',
 'aftersale',
 'aga',
 'agan',
 'agk',
 'ahhh',
 'aj',
 'aja',
 'ajayaaa',
 'ajib',
 'ajuin',
 'akal',
 'akibat',
 'akurasi',
 'akurat',
 'ala',
 'alamat',
 'alas',
 'alatny',
 'alhamdhulillah',
 'alhamdulillah',
 'alhamdullilahpesan',
 'alias',
 'allah',
 'allhamdulilillah',
 'also',
 'alus',
 'ama',
 'aman',
 'ambil',
 'amin',
 'amp',
 'ampamp',
 'ampmantapamp',
 'ampuuuun',
 'anak',
 'analaognya',
 'analog',
 'ancur',
 'ancurrrr',
 'and',
 'ane',
 'aneh',
 'antar',
 'antena',
 'anti',
 'apes',
 'apesss',
 'apik',
 'apple',
 'arah',
 'asal',
 'asli',
 'asuksma',
 'asus',
 'atas',
 'atk',
 'ato',
 'atu',
 'audio',
 'auto',
 'awet',
 'baang',
 'babut',
 'baca',
 'backlit',
 'bad',
 'baek',
 'bagai',
 'bagi',
 'bagis',
 'bagua',
 'bagus',
 'bagusampsangat',
 'bagusbanget',
 'bagusbarang',
 'bagusberfungsi',
 'baguscuma',
 'bagu

In [34]:
# membuat vocaulary baru berdasarkan feature yang terseleksi

new_selected_feature = {}

for(k,v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k]=v

new_selected_feature

{'pesan': 1883,
 'sesuai': 2252,
 'kirim': 1206,
 'respon': 2075,
 'super': 2428,
 'cepat': 454,
 'barang': 194,
 'jok': 1074,
 'motor': 1565,
 'php': 1899,
 'bilang': 292,
 'isi': 1014,
 'skripsi': 2315,
 'pas': 1824,
 'sen': 2231,
 'beda': 244,
 'mubajir': 1577,
 'rp': 2111,
 'ga': 800,
 'cocok': 497,
 'jelek': 1053,
 'copot': 516,
 'taro': 2474,
 'beli': 251,
 'staples': 2388,
 'gak': 810,
 'jual': 1081,
 'pake': 1801,
 'ckckck': 481,
 'barangdiperiksa': 196,
 'gan': 821,
 'layak': 1332,
 'gitu': 867,
 'loh': 1382,
 'produk': 1957,
 'prohex': 1961,
 'terima': 2516,
 'masang': 1474,
 'cacat': 410,
 'sblum': 2188,
 'coba': 492,
 'kecewa': 1147,
 'dipakey': 655,
 'isiya': 1015,
 'kluar': 1220,
 'buah': 373,
 'papan': 1815,
 'nempel': 1622,
 'bagus': 127,
 'pakai': 1798,
 'pos': 1936,
 'rapat': 1999,
 'bidang': 287,
 'cacathadeeuh': 412,
 'standart': 2384,
 'aja': 30,
 'rusakkecewa': 2126,
 'berat': 260,
 'bs': 368,
 'straples': 2409,
 'nya': 1695,
 'gk': 868,
 'dipake': 653,
 'kali': 1

In [35]:
len(new_selected_feature)

2000

In [36]:
pickle.dump(new_selected_feature,open("new_selected_feature_tf-idf.sav","wb"))

## Modelling (Naive Bayes)

In [37]:
selected_x = x_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
# import Library
import random
from sklearn.model_selection import train_test_split

# import algoritma naive bayes
from sklearn.naive_bayes import MultinomialNB

In [39]:
x = selected_x
y = data.Label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [40]:
# menampilkan jumlah data train dan testing
print('Banyaknya x_train : ', len(x_train))
print('Banyaknya x_test : ', len(x_test))
print('Banyaknya y_train : ', len(y_train))
print('Banyaknya y_test : ', len(y_test))

Banyaknya x_train :  1507
Banyaknya x_test :  377
Banyaknya y_train :  1507
Banyaknya y_test :  377


In [41]:
# proses training menggunakan naive bayes
text_algorithm = MultinomialNB()

In [42]:
model = text_algorithm.fit(x_train, y_train)

In [43]:
data_input = ("kirim respon super cepat barang sesuai gambar kuat tembus jok motor")
data_input = text_preprocessing_process(data_input)

#load
tfidf = TfidfVectorizer

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loaded_vec.fit_transform([data_input]))

if(hasil == 0):
    s = "Negatif"
elif(hasil == 1):
    s = "Positif"
print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 Positif


## Evaluasi Model

In [44]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model.predict(x_test)
CM = confusion_matrix(y_test, predicted)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.95      0.75      0.84       197
           1       0.78      0.96      0.86       180

    accuracy                           0.85       377
   macro avg       0.87      0.86      0.85       377
weighted avg       0.87      0.85      0.85       377



In [45]:
# menyimpan model
pickle.dump(model,open("model_fraud.sav","wb"))