In [8]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import random
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from nltk.classify import NaiveBayesClassifier, accuracy
import string

In [9]:
# Stopword sets keyed by language, so the corpus file is read once per
# language instead of once per processed text.
_STOPWORD_CACHE = {}


def _get_stopwords(language):
    """Return the NLTK stopword list for `language` as a frozenset, loading it once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocess_data(text, stopword_language='indonesian'):
    """Turn one raw text into a list of cleaned, lemmatized tokens.

    Steps: lowercase, tokenize with ``word_tokenize``, drop stopwords, keep
    only purely alphabetic tokens, then lemmatize each token with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : object
        The raw text. Non-string values are converted with ``str``; ``None``
        and float NaN (a missing cell) are treated as empty text.
    stopword_language : str, optional
        Name of the NLTK stopword list to filter with (default 'indonesian').

    Returns
    -------
    list of str
        The cleaned tokens, in their original order (duplicates preserved).
    """
    # A missing value would otherwise be stringified to "none"/"nan" and
    # survive every filter below as if it were a real word.
    if text is None or (isinstance(text, float) and text != text):
        return []

    word_list = word_tokenize(str(text).lower())

    ind_stopwords = _get_stopwords(stopword_language)

    # isalpha() already rejects punctuation, digits and the empty string, so
    # no separate punctuation filter is needed.
    word_list = [
        word for word in word_list
        if word not in ind_stopwords and word.isalpha()
    ]

    # NOTE(review): WordNetLemmatizer is backed by WordNet; confirm it is the
    # intended normalizer for text filtered with a non-English stopword list.
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(word) for word in word_list]

In [10]:
def extract_features(document, max_features=500):
    """Build the vocabulary of the most frequent cleaned words in a corpus.

    Parameters
    ----------
    document : iterable
        The raw texts; each one is cleaned with ``preprocess_data``.
    max_features : int, optional
        Maximum number of words to keep (default 500).

    Returns
    -------
    list of str
        Up to ``max_features`` distinct words, ordered from most to least
        frequent.
    """
    fd = FreqDist(
        word
        for text in document
        for word in preprocess_data(text)
    )
    # most_common() already yields each word once, so no de-duplication is
    # needed; skipping the set() round-trip keeps the frequency ordering and
    # makes the result identical from run to run.
    return [word for word, _count in fd.most_common(max_features)]

In [11]:
def extract_dataset(path="dataset.csv"):
    """Load the tweet CSV and convert it into labelled feature sets.

    The CSV is read with pandas and must provide a 'Tweet' column and a
    'sentimen' column. Missing 'sentimen' values are replaced with 2.
    Each row becomes a ``(features, label)`` pair where ``features`` maps
    every vocabulary word (from ``extract_features``) to whether it occurs in
    the cleaned tweet, and ``label`` is 'negatif' when the sentiment value is
    >= 2 and 'positif' otherwise.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file (default "dataset.csv").

    Returns
    -------
    list of (dict, str)
        One ``(features, label)`` tuple per row, in file order.
    """
    dataset = pd.read_csv(path)

    # NOTE(review): filling with 2 means rows without a sentiment value end up
    # labelled 'negatif' below -- confirm that is intended.
    sentiment = dataset['sentimen'].fillna(2)

    word_dictionary = extract_features(dataset['Tweet'])

    document = []
    for raw_tweet, score in zip(dataset['Tweet'], sentiment):
        # A set gives constant-time lookups for the per-feature membership
        # test instead of scanning the token list once per vocabulary word.
        tokens = set(preprocess_data(raw_tweet))
        features = {feature: feature in tokens for feature in word_dictionary}

        label = 'negatif' if score >= 2 else 'positif'

        document.append((features, label))

    return document

In [12]:
def train_data(document, train_ratio=0.7, seed=None):
    """Train a Naive Bayes classifier on a shuffled split and report on it.

    The labelled samples are shuffled, the first ``train_ratio`` share is used
    for training and the rest for testing. The five most informative features
    and the accuracy on the testing share are printed.

    Parameters
    ----------
    document : list of (dict, str)
        ``(features, label)`` pairs. The list itself is not modified.
    train_ratio : float, optional
        Fraction of the samples used for training (default 0.7).
    seed : int or None, optional
        When given, the shuffle uses a dedicated ``random.Random(seed)`` so
        the split is reproducible; when None the global ``random`` state is
        used.

    Returns
    -------
    NaiveBayesClassifier
        The trained classifier.

    Raises
    ------
    ValueError
        If the split leaves the training or the testing share empty.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    samples = list(document)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(samples)

    training_amount = int(len(samples) * train_ratio)
    training_data = samples[:training_amount]
    testing_data = samples[training_amount:]

    if not training_data or not testing_data:
        raise ValueError(
            "Not enough samples to build non-empty training and testing sets "
            f"(got {len(samples)} sample(s) with train_ratio={train_ratio})."
        )

    classifier = NaiveBayesClassifier.train(training_data)
    classifier.show_most_informative_features(5)
    # The score is measured on the held-out testing share, so label it as such.
    print(f"Testing Accuracy : {round(accuracy(classifier, testing_data)*100, 2)}%")

    # Hand the model back so callers can classify new samples with it.
    return classifier

In [15]:
# Build the labelled samples: a list of (features, label) pairs, one per CSV row.
dataset = extract_dataset()

In [16]:
# Shuffle and split the samples, train the Naive Bayes model and print its report.
classifier = train_data(dataset)

Most Informative Features
                   banci = True           negati : positi =     57.8 : 1.0
                    bego = True           negati : positi =     49.6 : 1.0
                brengsek = True           negati : positi =     48.4 : 1.0
                   anjir = True           negati : positi =     36.1 : 1.0
                giveaway = True           positi : negati =     26.5 : 1.0
Training Accuracy : 75.73%
