## ANALISIS SENTIMEN OPINI PUBLIK TERHADAP PENERAPAN PEMBELAJARAN LURING PADA MASA PANDEMI COVID-19

#### Import Library dan Load Dataset

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
import pytz
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import re
import pickle

In [41]:
# Load the dataset (the cleaned export is used; the raw file is kept for reference).
# data_frame = pd.read_csv('tweets_data.csv')
data_frame = pd.read_csv('tweets_data_clean.csv')
# Keep the first 5000 rows. .copy() makes the result an independent frame, so
# assigning to its columns later does not write into a slice of the full data
# (which triggers pandas' SettingWithCopyWarning).
data_frame = data_frame[:5000].copy()
data_frame.head()

Unnamed: 0,id,username,created_at,replies_count,retweets_count,likes_count,text_clean,text_preprocessed,polarity
0,1.33e+18,njmyg_,2020-11-18 23:17:14 SE Asia Standard Time,0,0,0,iya sih tapi maksud sender kuliah online kali,"['iya', 'sih', 'maksud', 'sender', 'kuliah', '...",positive
1,1.33e+18,urbbyyyyyyyy,2020-11-18 23:07:06 SE Asia Standard Time,0,0,0,males kuliah online temennya sikit,"['males', 'kuliah', 'online', 'temennya', 'sik...",negative
2,1.33e+18,risyaanggun,2020-11-18 23:05:21 SE Asia Standard Time,0,0,0,tumbenan td kuliah online dosennya minta join ...,"['tumben', 'td', 'kuliah', 'online', 'dosen', ...",negative
3,1.33e+18,nyctophilexxx,2020-11-18 22:58:58 SE Asia Standard Time,1,0,0,nangis krn kecapean kuliah online,"['nang', 'krn', 'cape', 'kuliah', 'online']",negative
4,1.33e+18,anisanwl,2020-11-18 22:50:53 SE Asia Standard Time,0,0,1,apa hanya aku yang merasa semenjak kuliah onli...,"['semenjak', 'kuliah', 'online', 'kerja', 'ota...",negative


In [42]:
# Show the (rows, columns) shape of the loaded frame.
data_frame.shape

(5000, 9)

#### DATA PROCESSING ####

In [43]:
# Encode the sentiment labels as integers: neutral=0, positive=1, negative=2.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that form is chained
# assignment, which warns in recent pandas and leaves the frame unchanged
# when Copy-on-Write is enabled.
data_frame['polarity'] = data_frame['polarity'].replace(
    {'neutral': 0, 'positive': 1, 'negative': 2}
)
data_frame['polarity'].value_counts()

2    2925
1    2056
0      19
Name: polarity, dtype: int64

In [44]:
# Pull the model inputs and the integer targets out of the frame as plain lists.
data = data_frame['text_preprocessed'].tolist()
label = data_frame['polarity'].tolist()

#### Split Data Train dan Test (80/20) ####

In [45]:
# 80/20 split. The seed is fixed so the split, and every metric computed from
# it further down, can be reproduced between runs.
train_X, test_X, y_train, y_test = train_test_split(
    data, label, test_size=0.2, shuffle=True, random_state=42
)

print(f'Jumlah data training: {len(train_X)}')
print(f'Jumlah data testing: {len(test_X)}')

Jumlah data training: 4000
Jumlah data testing: 1000


#### TEXT PREPROCESSING ####

In [46]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def cleaningText(text):
    """Strip Twitter noise from *text* and return the cleaned string.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Removed, in order: @mentions, #hashtags, the retweet marker
    ``RT``, links starting with ``http``, digits, line breaks (replaced by a
    space), ASCII punctuation, and leading/trailing spaces.
    """
    text_clean = str(text)
    # Handles may contain underscores; without '_' in the class, a mention
    # such as '@user_name' would leave 'name' behind in the text.
    text_clean = re.sub(r'@[A-Za-z0-9_]+', '', text_clean)
    # \w also covers '_' and non-ASCII letters, both valid inside hashtags.
    text_clean = re.sub(r'#\w+', '', text_clean)
    # \b keeps the retweet marker from matching inside words ('SMART kids').
    text_clean = re.sub(r'\bRT\s', '', text_clean)
    text_clean = re.sub(r'http\S+', '', text_clean)
    text_clean = re.sub(r'[0-9]+', '', text_clean)
    text_clean = text_clean.replace('\n', ' ')
    text_clean = text_clean.translate(_PUNCTUATION_TABLE)
    return text_clean.strip(' ')

def casefoldingText(text):
    """Return *text* converted to lowercase.

    The previous implementation called ``map(str.lower, text)`` and discarded
    the result, so the text was returned unchanged.
    """
    return text.lower()

# One shared TweetTokenizer, configured with preserve_case=False,
# strip_handles=True and reduce_len=True, reused for every tweet.
tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)


def tokenizingText(text):
    """Tokenize *text* with the module-level ``tokenizer`` and return the tokens."""
    return tokenizer.tokenize(text)

def filteringText(text):
    """Return the tokens of *text* that are not in NLTK's Indonesian stopword list.

    *text* is an iterable of tokens; the order of the kept tokens is preserved.
    """
    stop_set = set(stopwords.words('indonesian'))
    return [token for token in text if token not in stop_set]

def stemmingText(text):
    """Stem every token in *text* with a stemmer obtained from ``StemmerFactory``."""
    # NOTE(review): the Sastrawi import that provides StemmerFactory is
    # commented out in this file's import cell, so calling this function
    # raises NameError until that import is restored.
    stemmer = StemmerFactory().create_stemmer()
    return list(map(stemmer.stem, text))


In [47]:
def process_tweet(tweet):
    """Run one tweet through cleaning, case folding, tokenizing and stopword filtering.

    Returns the list of remaining tokens. The stemming step is currently
    disabled.
    """
    folded = casefoldingText(cleaningText(tweet))
    tokens = tokenizingText(folded)
    # Stemming is switched off: stemmingText(...) would be applied to the
    # filtered tokens here.
    return filteringText(tokens)

In [48]:

# Wall-clock timestamps are printed in Jakarta time before and after the run.
time_jkt = pytz.timezone('Asia/Jakarta')
print("Mulai:", datetime.now(time_jkt).strftime("%H:%M:%S"))

# Preprocess the training tweets.
preprocessed_text = [process_tweet(tweet) for tweet in train_X]
X_train = preprocessed_text

# Optional: persist the preprocessed training data.
# Pkl_Filename = "X_train.pkl"
# with open(Pkl_Filename, 'wb') as file:
#     pickle.dump(X_train, file)

# Preprocess the test tweets.
preprocessed_text = [process_tweet(tweet) for tweet in test_X]
X_test = preprocessed_text

# Optional: persist the training labels.
# Pkl_Filename = "y_train.pkl"
# with open(Pkl_Filename, 'wb') as file:
#     pickle.dump(y_train, file)
print("Selesai:", datetime.now(time_jkt).strftime("%H:%M:%S"))

Mulai: 18:19:51
Selesai: 18:19:57


In [None]:
# Pkl_Filename = "X_train.pkl" 
# with open(Pkl_Filename, 'rb') as file:  
#     X_train = pickle.load(file)

In [None]:
# Pkl_Filename = "y_train.pkl" 
# with open(Pkl_Filename, 'rb') as file:  
#     y_train = pickle.load(file)

#### CREATE DICTIONARY ####

In [None]:
def createDictionary(data):
    """Count how often each token occurs across all samples in *data*.

    *data* is an iterable of token sequences. The returned dict maps
    token -> count and is ordered by descending count; tokens with equal
    counts keep their first-seen order.
    """
    counts = {}
    for tokens in data:
        for tok in tokens:
            if tok in counts:
                counts[tok] += 1
            else:
                counts[tok] = 1
    # Rank by frequency, most frequent first (the sort is stable for ties).
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return {tok: total for tok, total in ranked}

In [None]:
# Build the token-frequency dictionary from the preprocessed training tweets.
bow = createDictionary(X_train)

print("Token teratas pada Dictionary:\n")
# First ten (token, count) pairs of the dictionary.
list(bow.items())[:10]

In [None]:
# Number of distinct tokens in the dictionary.
print(len(bow))

#### NAIVE BAYES CLASSIFIER ####

In [4]:
# Naive Bayes Classifier
class NBClassifier:
    """Multinomial Naive Bayes classifier over tokenized text with three classes.

    Labels are expected to be the integers 0, 1 and 2 (used in this notebook
    for neutral, positive and negative). Samples whose label is none of these
    still count towards the total number of samples but contribute to no class.

    Parameters
    ----------
    X_train : sequence of token sequences
        The preprocessed training samples.
    y_train : sequence
        One label per training sample.
    size : 'full' or int-like
        'full' keeps every training token in the vocabulary; any other value
        is passed to ``int()`` and keeps that many of the most frequent tokens.

    Attributes set by ``train()``
    -----------------------------
    daftar_kata : list of vocabulary tokens, most frequent first
    jumlah_kata : dict mapping token -> [count in class 0, 1, 2]
    Prior       : numpy array with the fraction of samples per class
    X_train_len : numpy array with the total number of tokens per class
    """

    def __init__(self, X_train, y_train, size):
        self.X_train = X_train
        self.y_train = y_train
        self.size = size

    def createDictionary(self):
        """Return token -> frequency over ``self.X_train``, most frequent first."""
        dictionary = dict()
        # Read the instance's own data, not a module-level X_train.
        for sampel in self.X_train:
            for token in sampel:
                dictionary[token] = dictionary.get(token, 0) + 1
        daftar_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
        return dict(daftar_dict)

    def train(self):
        """Estimate priors and per-class token counts; return ``self``.

        Raises
        ------
        ValueError
            If the training data is empty or samples and labels differ in length.
        """
        from collections import Counter

        samples = list(self.X_train)
        labels = list(self.y_train)
        if len(samples) != len(labels):
            raise ValueError('X_train and y_train must have the same length')
        if not samples:
            raise ValueError('cannot train on an empty training set')

        kata_terurut = list(self.createDictionary())
        if self.size == 'full':
            self.daftar_kata = kata_terurut
        else:
            self.daftar_kata = kata_terurut[:int(self.size)]

        # One pass over the data: per-class sample counts and token counts.
        jumlah_dokumen = [0, 0, 0]
        jumlah_token = [Counter(), Counter(), Counter()]
        for sampel, label in zip(samples, labels):
            for kelas in (0, 1, 2):
                if label == kelas:
                    jumlah_dokumen[kelas] += 1
                    jumlah_token[kelas].update(sampel)
                    break

        self.Prior = np.array([n / len(samples) for n in jumlah_dokumen])
        # Token totals include tokens that fall outside a truncated vocabulary.
        self.X_train_len = np.array([sum(c.values()) for c in jumlah_token])
        # Bag of words: per-class occurrence counts for every vocabulary token.
        self.jumlah_kata = {
            token: [c[token] for c in jumlah_token] for token in self.daftar_kata
        }
        return self

    def predict(self, X_test):
        """Return the predicted class index (0, 1 or 2) for each sample in *X_test*.

        Scores are accumulated as sums of log-probabilities with add-one
        smoothing. Summing logs selects the same class as multiplying the
        probabilities, but does not underflow to zero on long samples.
        Tokens outside the vocabulary are scored with a count of zero.
        """
        jumlah_vocab = len(self.daftar_kata)
        penyebut = self.X_train_len + jumlah_vocab
        # A class without training samples has prior 0; log(0) = -inf simply
        # means that class is never selected, so the warning is suppressed.
        with np.errstate(divide='ignore'):
            log_prior = np.log(self.Prior)
            log_penyebut = np.log(penyebut)
        log_tak_dikenal = -log_penyebut  # log(1 / denominator)

        log_cache = {}  # token -> per-class log-likelihood, filled on demand
        pred = []
        for sampel in X_test:
            skor = log_prior.copy()
            for token in sampel:
                log_prob = log_cache.get(token)
                if log_prob is None:
                    jumlah = self.jumlah_kata.get(token)
                    if jumlah is None:
                        log_prob = log_tak_dikenal
                    else:
                        log_prob = np.log(np.asarray(jumlah) + 1) - log_penyebut
                    log_cache[token] = log_prob
                skor = skor + log_prob
            pred.append(np.argmax(skor))
        return pred

    def score(self, pred, labels):
        """Return ``(number of correct predictions, accuracy)`` for *pred* vs *labels*."""
        correct = (np.array(pred) == np.array(labels)).sum()
        accuracy = correct/len(pred)
        return correct, accuracy

#### TRAINING DAN TESTING MODEL NB ####

In [2]:
# Train the classifier on the preprocessed training tweets, keeping the 5000
# most frequent tokens as vocabulary. (The size argument was missing its
# closing quote, which made this cell a syntax error.)
nb = NBClassifier(X_train, y_train, '5000')
nb.train()

NameError: name 'X_train' is not defined

In [21]:
# Load a previously pickled classifier from disk; this rebinds `nb`.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# open model files that come from a trusted source.
Pkl_Filename = "Pickle_NB_Model.pkl" 
with open(Pkl_Filename, 'rb') as file:  
    nb = pickle.load(file)


In [22]:
# Predict a class index for every preprocessed test tweet.
y_pred = nb.predict(X_test)

## Hasil Evaluasi ##

In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Pin the class order explicitly. Without labels=, scikit-learn derives the
# classes from the data, so a split in which the rare neutral class (0) never
# occurs would shrink the confusion matrix and make classification_report
# fail because three target names no longer match the classes found.
kelas = [0, 1, 2]

confusion = confusion_matrix(y_test, y_pred, labels=kelas)
print(confusion)

print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

print('\nClassification Report\n')
print(classification_report(y_test, y_pred, labels=kelas,
                            target_names=['Neutral', 'Positive', 'Negative']))

[[  0   3   2]
 [  0 292 109]
 [  1  65 528]]

Accuracy: 0.82


Classification Report

              precision    recall  f1-score   support

     Neutral       0.00      0.00      0.00         5
    Positive       0.81      0.73      0.77       401
    Negative       0.83      0.89      0.86       594

    accuracy                           0.82      1000
   macro avg       0.55      0.54      0.54      1000
weighted avg       0.82      0.82      0.82      1000



In [None]:
# Evaluate the NB model with the classifier's own score() method:
# it returns the number of correct predictions and the accuracy.
cor1, acc1 = nb.score(y_pred, y_test)
print("Prediksi Benar:", cor1)
print("Akurasi: %i / %i = %.4f " %(cor1, len(y_pred), acc1))

## Visualisasi Tabel ##

In [None]:
# Show long tweets without truncation.
pd.set_option('display.max_colwidth', 3000)
# One row per test tweet: raw input, processed tokens, predicted class.
hasil = pd.DataFrame(list(zip(test_X, X_test, y_pred)),
               columns =['Tweet', 'Tweet_Processed', 'Sentimen'])
# Map class indices back to names. The result is assigned back to the column:
# replace(..., inplace=True) on a column selection is chained assignment and
# does not update the frame when pandas Copy-on-Write is enabled.
hasil['Sentimen'] = hasil['Sentimen'].replace(
    {0: 'neutral', 1: 'positive', 2: 'negative'}
)

hasil.head()

## Visualisasi Pie Chart ##

In [None]:
fig, ax = plt.subplots(figsize = (6, 6))
# Count each predicted sentiment once and reuse it for sizes and labels.
jumlah_per_sentimen = hasil['Sentimen'].value_counts()
sizes = jumlah_per_sentimen.tolist()
labels = list(jumlah_per_sentimen.index)
# explode = (0.1, 0, 0)
ax.pie(x = sizes, labels = labels, autopct = '%1.1f%%', textprops={'fontsize': 14})
# The tweet count in the title comes from the data instead of a hard-coded
# 10000, which did not match the number of rows being plotted.
ax.set_title(f'Polaritas Sentimen Data dari Tweet \n\n Jumlah Tweet={len(hasil)}', fontsize = 16, pad = 20)
plt.show()

## Visualisasi WordCloud ##

In [None]:
# Word cloud over every processed test tweet.

# Concatenate all tokens into one string, each token preceded by a space.
list_words = ''.join(
    ' ' + word for tweet in hasil['Tweet_Processed'] for word in tweet
)

wordcloud = WordCloud(
    width=600,
    height=400,
    background_color='black',
    min_font_size=10,
).generate(list_words)
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Word Cloud Data Tweet', fontsize=18)
ax.grid(False)
ax.imshow(wordcloud)
fig.tight_layout(pad=0)
ax.axis('off')
plt.show()

## Visualisasi WordCloud Positif ##

In [None]:
# Rows predicted as positive.
hasil_pos = hasil.loc[hasil['Sentimen']=='positive']

# Concatenate their tokens into one string, each token preceded by a space.
list_words_postive = ''.join(
    ' ' + word for row_word in hasil_pos['Tweet_Processed'] for word in row_word
)

wordcloud = WordCloud(
    width=800,
    height=600,
    background_color='black',
    colormap='Greens',
    min_font_size=10,
).generate(list_words_postive)
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Word Cloud Data Tweet Positif', fontsize=18)
ax.grid(False)
ax.imshow(wordcloud)
fig.tight_layout(pad=0)
ax.axis('off')
plt.show()

## Visualisasi WordCloud Negatif ##

In [None]:
# Rows predicted as negative.
hasil_neg = hasil.loc[hasil['Sentimen']=='negative']

# Concatenate their tokens into one string, each token preceded by a space.
list_words_negative = ''.join(
    ' ' + word for row_word in hasil_neg['Tweet_Processed'] for word in row_word
)

wordcloud = WordCloud(
    width=800,
    height=600,
    background_color='black',
    colormap='Reds',
    min_font_size=10,
).generate(list_words_negative)
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Word Cloud Data Tweet Negatif', fontsize=18)
ax.grid(False)
ax.imshow(wordcloud)
fig.tight_layout(pad=0)
ax.axis('off')
plt.show()

## Visualisasi WordCloud Netral ##

In [None]:
# Rows predicted as neutral.
hasil_neu = hasil.loc[hasil['Sentimen']=='neutral']

# Concatenate their tokens into one string, each token preceded by a space.
list_words_neutral = ''.join(
    ' ' + word for row_word in hasil_neu['Tweet_Processed'] for word in row_word
)

# Neutral predictions are rare, and WordCloud.generate() raises ValueError
# when it is given no words, so skip the plot when there is nothing to draw.
if list_words_neutral.strip():
    wordcloud = WordCloud(width = 800, height = 600, background_color = 'black', colormap = 'Blues_r'
                                   , min_font_size = 10).generate(list_words_neutral)
    fig, ax = plt.subplots(figsize = (8, 6))
    ax.set_title('Word Cloud Data Tweet Netral', fontsize = 18)
    ax.grid(False)
    ax.imshow((wordcloud))
    fig.tight_layout(pad=0)
    ax.axis('off')
    plt.show()
else:
    print('Tidak ada tweet netral untuk divisualisasikan.')

In [None]:
# Collect tweets typed by the user until they answer 'n' to the follow-up prompt.
X_test_1 = []

while True:
    tweet_x = input("Input tweet: ")
    lagi = input("Input Lagi (y/n):")
    X_test_1.append(tweet_x)
    if lagi == 'n':
        break

In [None]:
# Run the manually entered tweets through the same preprocessing pipeline.
preprocessed_text = [process_tweet(tweet) for tweet in X_test_1]

X_test_2 = preprocessed_text

In [None]:
# Predicting. (This line was a bare identifier and raised NameError; it is
# meant to be a comment.)
y_pred = nb.predict(X_test_2)

In [None]:
# One row per manually entered tweet: raw input, processed tokens, predicted class.
hasil_1 = pd.DataFrame(list(zip(X_test_1, X_test_2, y_pred)),
               columns =['Tweet', 'Tweet_Processed', 'Sentimen'])
# Map class indices back to names. The result is assigned back to the column:
# replace(..., inplace=True) on a column selection is chained assignment and
# does not update the frame when pandas Copy-on-Write is enabled.
hasil_1['Sentimen'] = hasil_1['Sentimen'].replace(
    {0: 'neutral', 1: 'positive', 2: 'negative'}
)


In [None]:
# Show long tweets without truncation, then display the result table.
pd.set_option('display.max_colwidth', 3000)
hasil_1

# Test Data Dari Twitter #

In [27]:
import os

import tweepy as tw

# Credentials are read from environment variables so that no secret is stored
# in the notebook. A missing variable raises KeyError immediately.
# NOTE(review): the keys that were previously written in this cell should be
# treated as leaked and revoked in the Twitter developer portal.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

# Authenticate
auth = tw.OAuthHandler(consumer_key, consumer_secret)
# Set Tokens
auth.set_access_token(access_token, access_token_secret)
# Instantiate API
api = tw.API(auth, wait_on_rate_limit=True)

# Download tweets for a user-supplied keyword.
# NOTE(review): the search is not restricted by language, while the
# preprocessing uses Indonesian stopwords — consider filtering the query.
KataKunci = input("Masukan Kata Kunci:")
JmlTweets = int(input("Masukan Jumlah Tweets:"))
searched_tweets = [status for status in tw.Cursor(api.search_tweets, q=KataKunci).items(JmlTweets)]
# The cells below read the tweet texts from `test_tweet`; this assignment was
# commented out, which made them fail with NameError.
test_tweet = [tweet.text for tweet in searched_tweets]
print(searched_tweets)

Masukan Kata Kunci:malas
Masukan Jumlah Tweets:10
[Status(_api=<tweepy.api.API object at 0x00000152E6A92CD0>, _json={'created_at': 'Sat Feb 12 16:07:30 +0000 2022', 'id': 1492530814168571906, 'id_str': '1492530814168571906', 'text': 'La nueva versión de rebelde es como rbd pero con 1 par de maricas malas estoy--------', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}, 'metadata': {'iso_language_code': 'es', 'result_type': 'recent'}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1055526270312812545, 'id_str': '1055526270312812545', 'name': 'susana 👹', 'screen_name': 'susanssss_', 'location': '💩', 'description': 'arrastrada x la vida cual cucaracha.\n Fan de Rafaella Carra x supuesto xq es la reina del universo rumo

In [None]:
# Run the downloaded tweet texts through the same preprocessing pipeline.
preprocessed_text = [process_tweet(tweet_text) for tweet_text in test_tweet]

X_test_tweet = preprocessed_text

In [None]:
# Inspect the token lists produced for the downloaded tweets.
print(X_test_tweet)

In [None]:
#predicting

# Print the start time (Jakarta time), then predict a class for each downloaded tweet.
print("Mulai:", datetime.now(time_jkt).strftime("%H:%M:%S"))
y_pred = nb.predict(X_test_tweet)

# Checking (placeholder: no check is performed in this cell)

## Visualisasi Tabel ##

In [None]:
# One row per downloaded tweet: raw text, processed tokens, predicted class.
hasil_2 = pd.DataFrame(list(zip(test_tweet, X_test_tweet, y_pred)),
               columns =['Tweet', 'Tweet_Processed', 'Sentimen'])
# Map class indices back to names. The result is assigned back to the column:
# replace(..., inplace=True) on a column selection is chained assignment and
# does not update the frame when pandas Copy-on-Write is enabled.
hasil_2['Sentimen'] = hasil_2['Sentimen'].replace(
    {0: 'neutral', 1: 'positive', 2: 'negative'}
)

# Show long tweets without truncation.
pd.set_option('display.max_colwidth', 3000)
hasil_2

## Visualisasi Pie Chart ##

In [None]:
# Pie chart of the predicted sentiment shares for the downloaded tweets.
fig, ax = plt.subplots(figsize=(6, 6))
sentimen_counts = hasil_2['Sentimen'].value_counts()
sizes = list(sentimen_counts)
labels = list(sentimen_counts.index)
# explode = (0.1, 0, 0)
ax.pie(
    x=sizes,
    labels=labels,
    autopct='%1.1f%%',
    textprops={'fontsize': 14},
)
ax.set_title('Polaritas Sentimen Data dari Tweet', fontsize=16, pad=20)
plt.show()

In [None]:
# Visualisasi WordCloud ##

In [None]:
# Word cloud over every processed downloaded tweet.

# Concatenate all tokens into one string, each token preceded by a space.
list_words = ''.join(
    ' ' + word for tweet in hasil_2['Tweet_Processed'] for word in tweet
)

wordcloud = WordCloud(
    width=600,
    height=400,
    background_color='black',
    min_font_size=10,
).generate(list_words)
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Word Cloud Data Tweet', fontsize=18)
ax.grid(False)
ax.imshow(wordcloud)
fig.tight_layout(pad=0)
ax.axis('off')
plt.show()

## Visualisasi WordCloud Positif ##

In [None]:
# Downloaded tweets predicted as positive.
hasil_pos_2 = hasil_2.loc[hasil_2['Sentimen']=='positive']

# Concatenate their tokens into one string, each token preceded by a space.
list_words_postive_2 = ''.join(
    ' ' + word for row_word in hasil_pos_2['Tweet_Processed'] for word in row_word
)

# A small download may contain no positive tweets, and WordCloud.generate()
# raises ValueError when it is given no words, so skip the plot in that case.
if list_words_postive_2.strip():
    wordcloud = WordCloud(width = 800, height = 600, background_color = 'black', colormap = 'Greens'
                                   , min_font_size = 10).generate(list_words_postive_2)
    fig, ax = plt.subplots(figsize = (8, 6))
    ax.set_title('Word Cloud Data Tweet Positif', fontsize = 18)
    ax.grid(False)
    ax.imshow((wordcloud))
    fig.tight_layout(pad=0)
    ax.axis('off')
    plt.show()
else:
    print('Tidak ada tweet positif untuk divisualisasikan.')

## Visualisasi WordCloud Negatif ##

In [None]:
# Downloaded tweets predicted as negative.
hasil_neg_2 = hasil_2.loc[hasil_2['Sentimen']=='negative']

# Concatenate their tokens into one string, each token preceded by a space.
list_words_negative_2 = ''.join(
    ' ' + word for row_word in hasil_neg_2['Tweet_Processed'] for word in row_word
)

# A small download may contain no negative tweets, and WordCloud.generate()
# raises ValueError when it is given no words, so skip the plot in that case.
if list_words_negative_2.strip():
    wordcloud = WordCloud(width = 800, height = 600, background_color = 'black', colormap = 'Reds'
                                   , min_font_size = 10).generate(list_words_negative_2)
    fig, ax = plt.subplots(figsize = (8, 6))
    ax.set_title('Word Cloud Data Tweet Negatif', fontsize = 18)
    ax.grid(False)
    ax.imshow((wordcloud))
    fig.tight_layout(pad=0)
    ax.axis('off')
    plt.show()
else:
    print('Tidak ada tweet negatif untuk divisualisasikan.')