In [1]:
import pandas as pd
import re
import numpy as np
import joblib
import pickle
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset from 'coba.csv' (relative path, resolved against the working
# directory). Later cells read its 'Label' and 'STOP_REMOVAL' columns.
df = pd.read_csv('coba.csv')

In [3]:
# Convert a sentiment label to a numeric polarity.
def convert(polarity):
    """Map a sentiment label to its numeric polarity.

    Returns 1 for 'positif', 0 for 'netral', and -1 for every other value
    (the comparison is exact, so differently-cased labels also yield -1).
    """
    if polarity == 'positif':
        return 1
    return 0 if polarity == 'netral' else -1

In [4]:
# Derive the numeric 'Polarity' column by running convert() over every 'Label'.
polarity_codes = df['Label'].apply(convert)
df['Polarity'] = polarity_codes

In [5]:
# Text column and numeric polarity column, bound as X and y respectively.
X, y = df['STOP_REMOVAL'], df['Polarity']

In [6]:
# Display the frame, which now includes the derived 'Polarity' column.
df

Unnamed: 0,STOP_REMOVAL,Label,Polarity
0,cnnindonesia aneh harga bbm tgl 3 09 kagetnya...,negatif,-1
1,ujung microfon berbunyi rakyat cengeng ujung m...,negatif,-1
2,cnnindonesia sih gak pake sistem kupon aja yg...,negatif,-1
3,politkus irwan pd adian pdip bersiteru kenaika...,netral,0
4,vacum cleaner han river 179rb aja lupa gratis ...,netral,0
...,...,...,...
1662,jisundae idih mahalan pertamax 1 liter,negatif,-1
1663,prfmnews ku dah pakai puas banget dah tuh viv...,positif,1
1664,patraniagarjbt jawabannya c pertamax turbo se...,netral,0
1665,jokowi seliter pertamax 20kg tomat kubis 3 5k...,negatif,-1


In [7]:
import os

import pandas as pd
from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize
from gensim.models import FastText

In [8]:
# Lower-case every text in the STOP_REMOVAL column and split it into word
# tokens; tqdm shows progress over the rows.
sentences = []
for text in tqdm(df.STOP_REMOVAL):
    sentences.append(word_tokenize(text.lower()))

# Show the first five tokenised sentences.
sentences[:5]

  0%|          | 0/1667 [00:00<?, ?it/s]

[['cnnindonesia',
  'aneh',
  'harga',
  'bbm',
  'tgl',
  '3',
  '09',
  'kagetnya',
  'kagetnya',
  'kena',
  'heart',
  'attack'],
 ['ujung',
  'microfon',
  'berbunyi',
  'rakyat',
  'cengeng',
  'ujung',
  'mulut',
  'pejabat',
  'berbunyi',
  'postur',
  'apbn',
  'tersandra',
  'subsidi',
  'memberatkan',
  'keuangan',
  'pemerintah',
  'keputusan',
  'bbm',
  'dihindari',
  'hayo',
  'yg',
  'cengeng'],
 ['cnnindonesia',
  'sih',
  'gak',
  'pake',
  'sistem',
  'kupon',
  'aja',
  'yg',
  'dikasihkan',
  'bayar',
  'pajak',
  'kendaraan',
  'kupon',
  'dikasih',
  'brp',
  'liter',
  'jatah',
  'bbm',
  'disobek',
  'operator',
  'ngisi',
  'spbu',
  'liter',
  'bbm',
  'yg',
  'diisi',
  'pertamina'],
 ['politkus',
  'irwan',
  'pd',
  'adian',
  'pdip',
  'bersiteru',
  'kenaikan',
  'harga',
  'bbm',
  'https',
  't',
  'co',
  'qwbpf4oigk'],
 ['vacum',
  'cleaner',
  'han',
  'river',
  '179rb',
  'aja',
  'lupa',
  'gratis',
  'ongkir',
  '0',
  'rupiah',
  'cashback',
  

# Train FastText Model

In [23]:
# FastText hyper-parameters, forwarded unchanged as keyword arguments.
fasttext_kwargs = dict(
    vector_size=128,  # length of each word vector
    window=5,
    min_count=3,
    workers=4,
    epochs=1000,
    sg=0,
    hs=0,
)
# Passing the corpus to the constructor trains the model immediately.
model = FastText(sentences, **fasttext_kwargs)

In [24]:
# Persist the trained model to "coba.fasttext" (relative to the working directory).
model.save("coba.fasttext")

# Load

In [25]:
# Rebind `model` to the model read back from "coba.fasttext", the same file name
# used by the save cell.
model = FastText.load("coba.fasttext")

# continue training

In [26]:
# Three already-tokenised example sentences, passed to model.train() to
# continue training.
contoh_data = [
    ["bener", "yak", "pertalite", "berasa", "boros", "banget"],
    ["perwarna", "mempengaruhi", "kualitas", "pertalite"],
    [
        "pertamina", "jamin", "kualitas", "pertalite", "sesuai", "aturan",
        "pemerintah", "semoga", "berjalan", "sesuai", "harapan",
    ],
]

In [27]:
# Continue training on the extra sentences. Following gensim's documented
# pattern for incremental training, the vocabulary is first updated from the new
# corpus (update=True keeps the existing entries) so the model's word counts and
# corpus statistics reflect contoh_data before train() runs. Words that are new
# to the model are still subject to its min_count threshold.
model.build_vocab(contoh_data, update=True)

# train() returns (effective word count, raw word count); leaving it as the last
# expression keeps that tuple as the cell output.
model.train(contoh_data, total_examples=len(contoh_data), epochs=1)

(17, 21)

In [28]:
# Save again under the same file name: this overwrites the earlier
# "coba.fasttext" with the model as it stands after the extra training pass.
model.save("coba.fasttext")

In [29]:
# Keep a handle on the model's word vectors (model.wv) for the lookups that
# follow. Despite the name, these come from the FastText model.
w2v = model.wv

In [31]:
# List every vocabulary entry in index order (presumably most frequent first —
# confirm against gensim's default sorting). The full list is displayed.
w2v.index_to_key

['pertalite',
 'bbm',
 't',
 'https',
 'co',
 'pertamax',
 'harga',
 'yg',
 'kenaikan',
 'subsidi',
 'pertamina',
 'rakyat',
 'beli',
 'ya',
 'pemerintah',
 'isi',
 'aja',
 'ga',
 'solar',
 'turun',
 'bensin',
 'tarif',
 'kualitas',
 'gak',
 'mobil',
 'pake',
 'minyak',
 'nya',
 'kalo',
 'rp',
 'liter',
 'motor',
 'masyarakat',
 'orang',
 'boros',
 'akap',
 '1',
 '2',
 'jokowi',
 'jenis',
 'udah',
 'sesuai',
 '000',
 'spbu',
 'ngisi',
 'penyesuaian',
 'serba',
 'indonesia',
 'tolak',
 'mmc',
 'bersubsidi',
 'serbi',
 'bahan',
 'full',
 'demo',
 'mahal',
 'dasar',
 'kota',
 'ron',
 'dunia',
 'pakai',
 'aturan',
 'si',
 'rt',
 'tp',
 'kaya',
 'amp',
 'bakar',
 'antri',
 'subsiditepatsasaran',
 'jamin',
 'salah',
 'tau',
 '3',
 'nih',
 'angkutan',
 'bltbbmtepatsasaran',
 'bantuanbbmuntukrakyat',
 'negara',
 'menaikkan',
 'islamkaffah',
 'turbo',
 'mahasewa',
 'bohir',
 'klo',
 'lu',
 'kawal',
 '10',
 'sih',
 'mengerek',
 'premium',
 'mahasiswakawalsubsidi',
 'blt',
 'provinsi',
 'cnnindon

In [32]:
# Display the array of vectors held by the keyed-vectors object.
w2v.vectors

array([[-2.4447005 ,  0.38866603,  0.37672967, ...,  1.2867795 ,
         0.81431764, -1.4684907 ],
       [ 0.27335292,  3.6972387 , -1.2054373 , ...,  1.9560343 ,
        -0.23859046, -4.0134025 ],
       [-0.8764195 , -3.610547  ,  0.20735052, ...,  0.9602518 ,
        -6.5531483 ,  4.72069   ],
       ...,
       [-0.47349688, -0.25101486, -1.5101092 , ..., -0.97903115,
         1.7281873 , -0.561441  ],
       [ 0.91303647, -2.950142  ,  0.0106851 , ..., -0.56582695,
        -0.4128766 , -0.9426125 ],
       [-1.505813  ,  0.16642942, -2.9218552 , ..., -2.74374   ,
         0.46773234,  2.734831  ]], dtype=float32)

In [33]:
# Length of each vector; expected to equal the vector_size=128 passed when the
# model was constructed.
w2v.vector_size

128

In [44]:
# Look up the vector for 'bors'. NOTE(review): this looks like a deliberate
# misspelling of 'boros' (which appears in the vocabulary listing), presumably to
# show that FastText still returns a vector for an unseen token via its
# subword n-grams — TODO confirm 'bors' is really out of vocabulary.
w2v['bors']

array([-0.32934317,  0.13194421,  1.7841164 ,  2.8712351 ,  0.85983485,
        0.08173323,  1.0975446 , -1.1098926 , -0.3323525 , -1.4257627 ,
       -0.30924496, -0.7620231 ,  0.643226  ,  0.11476453,  0.02479601,
        0.6762807 , -0.26347083, -0.4221569 , -0.26263303, -0.78151846,
       -0.05224352, -0.16127706, -0.05865769,  0.4044252 ,  1.1383505 ,
       -0.39765102, -1.1174088 , -0.7832413 ,  0.92423713, -1.27686   ,
       -0.17428043, -0.535459  ,  0.15945543,  1.1996989 , -1.5231984 ,
       -0.02527368, -0.036335  , -1.4201224 , -1.6868198 ,  0.52794284,
        0.49762496,  0.6203728 ,  0.37035173, -0.82029915, -0.08250193,
        0.01071899, -0.35286403, -1.0417243 ,  0.88634187, -0.24714045,
        0.08475471, -0.2394288 ,  0.11060214, -0.30249366, -0.45246968,
       -0.33020902, -0.58950585, -0.29829127,  0.7140607 , -1.2824175 ,
       -0.37304568, -0.5985968 , -1.4454706 ,  0.14881273,  0.7175496 ,
        0.38913494,  0.9395324 , -0.32815695,  0.7425713 ,  0.30

# Similar words

In [1]:
# Ask for the 5 entries most similar to 'bors'.
# NOTE(review): this cell depends on `w2v` from the `w2v = model.wv` cell. The
# recorded NameError comes from running it with execution count 1, i.e. in a
# kernel session where that cell had not been executed yet; re-run the notebook
# top to bottom to refresh this output.
w2v.similar_by_word("bors", topn=5)

NameError: name 'w2v' is not defined

# Higher-order visualisation

In [58]:
from umap import UMAP
import numpy as np
import pandas as pd
import plotly.express as px

In [59]:
# Project the word vectors down to two dimensions with UMAP for plotting (UMAP's
# default n_components is 2; the following cell names the columns umap1/umap2).
# UMAP is stochastic, so a fixed random_state is supplied to make the layout
# reproducible from run to run.
# NOTE: this rebinds X, which earlier held the STOP_REMOVAL text column.
X = UMAP(random_state=42).fit_transform(w2v.vectors)

In [62]:
# Assemble the plotting table: the two UMAP coordinates plus the vocabulary
# entry belonging to each row.
# NOTE: this rebinds df, replacing the tweet DataFrame loaded from the CSV.
df = pd.DataFrame(X, columns=["umap1", "umap2"]).assign(text=w2v.index_to_key)

In [64]:
# Scatter plot of the 2-D projection with each point labelled by its word.
# The plotly update_* methods return the figure, so the styling is chained.
fig = (
    px.scatter(df, x="umap1", y="umap2", text="text")
    .update_traces(textposition='top center')  # place labels above the markers
    .update_layout(height=800, title_text='Reduced FastText Visualisation')
)
fig.show()