In [1]:
import pandas as pd
import re
import numpy as np
import joblib
import pickle
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('dataset_preprocessed_tweet.csv')
df

Unnamed: 0,Label,Tweet,Tokenization,Stop_Removal,Tweet_Stemmed
0,Non_HS,fadli zon minta mendagri segera menonaktifkan ...,"['fadli', 'zon', 'minta', 'mendagri', 'segera'...","['fadli', 'zon', 'mendagri', 'menonaktifkan', ...","['fadli', 'zon', 'mendagri', 'nonaktif', 'ahok..."
1,Non_HS,mereka terus melukai aksi dalam rangka memenja...,"['mereka', 'terus', 'melukai', 'aksi', 'dalam'...","['melukai', 'aksi', 'rangka', 'memenjarakan', ...","['luka', 'aksi', 'rangka', 'penjara', 'ahok', ..."
2,Non_HS,sylvi bagaimana gurbernur melakukan kekerasan ...,"['sylvi', 'bagaimana', 'gurbernur', 'melakukan...","['sylvi', 'gurbernur', 'kekerasan', 'perempuan...","['sylvi', 'gurbernur', 'keras', 'perempuan', '..."
3,Non_HS,ahmad dhani tak puas debat pilkada masalah jal...,"['ahmad', 'dhani', 'tak', 'puas', 'debat', 'pi...","['ahmad', 'dhani', 'puas', 'debat', 'pilkada',...","['ahmad', 'dhani', 'puas', 'debat', 'pilkada',..."
4,Non_HS,waspada ktp palsu kawal pilkada https oooer...,"['waspada', 'ktp', 'palsu', 'kawal', 'pilkada'...","['waspada', 'ktp', 'palsu', 'kawal', 'pilkada'...","['waspada', 'ktp', 'palsu', 'kawal', 'pilkada'..."
...,...,...,...,...,...
482,HS,muka babi ahok tuh mirip serbet lantai btp,"['muka', 'babi', 'ahok', 'tuh', 'mirip', 'serb...","['muka', 'babi', 'ahok', 'tuh', 'serbet', 'lan...","['muka', 'babi', 'ahok', 'tuh', 'serbet', 'lan..."
483,HS,betul bang hancurkan merka bang musnahkan chin...,"['betul', 'bang', 'hancurkan', 'merka', 'bang'...","['bang', 'hancurkan', 'merka', 'bang', 'musnah...","['bang', 'hancur', 'merka', 'bang', 'musnah', ..."
484,HS,sapa bilang ahok anti korupsi klo grombolannya...,"['sapa', 'bilang', 'ahok', 'anti', 'korupsi', ...","['sapa', 'bilang', 'ahok', 'anti', 'korupsi', ...","['sapa', 'bilang', 'ahok', 'anti', 'korupsi', ..."
485,HS,juga ngimpi sentilin biji babi ahok pcetar pce...,"['juga', 'ngimpi', 'sentilin', 'biji', 'babi',...","['ngimpi', 'sentilin', 'biji', 'babi', 'ahok',...","['ngimpi', 'sentilin', 'biji', 'babi', 'ahok',..."


In [3]:
def convert(polarity):
	if polarity == "HS":
		return 1
	else:
		return 0

In [4]:
df['Polarity'] = df['Label'].apply(convert)

In [5]:
x = df['Tweet_Stemmed']
y = df['Polarity']

In [6]:
#vectorizer = CountVectorizer()
#X = vectorizer.fit_transform(df['isi'])

bow_transformer = CountVectorizer()
print(df['Tweet_Stemmed'].shape)
X = bow_transformer.fit_transform(df['Tweet_Stemmed'])

print(X.toarray())
print('Shape of Sparse Matrix: ', X.shape)
print('Amount of Non-Zero Occurrences: ', X.nnz)

#TFIDF Transform
tf_transform = TfidfTransformer(use_idf=False).fit(X)
X = tf_transform.transform(X)
print(X.shape)

(487,)
[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Shape of Sparse Matrix:  (487, 1689)
Amount of Non-Zero Occurrences:  4039
(487, 1689)


In [7]:
density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))
print('Density: {}'.format((density)))

Density: 0.4910381584914102


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.3)
print(X_test)

  (0, 4)	0.35355339059327373
  (0, 6)	0.35355339059327373
  (0, 358)	0.35355339059327373
  (0, 1153)	0.35355339059327373
  (0, 1237)	0.35355339059327373
  (0, 1338)	0.35355339059327373
  (0, 1433)	0.35355339059327373
  (0, 1600)	0.35355339059327373
  (1, 31)	0.2773500981126146
  (1, 92)	0.2773500981126146
  (1, 332)	0.2773500981126146
  (1, 582)	0.2773500981126146
  (1, 622)	0.2773500981126146
  (1, 638)	0.2773500981126146
  (1, 644)	0.2773500981126146
  (1, 700)	0.2773500981126146
  (1, 793)	0.2773500981126146
  (1, 856)	0.2773500981126146
  (1, 972)	0.2773500981126146
  (1, 1257)	0.2773500981126146
  (1, 1615)	0.2773500981126146
  (2, 565)	0.31622776601683794
  (2, 624)	0.31622776601683794
  (2, 717)	0.31622776601683794
  (2, 742)	0.6324555320336759
  :	:
  (144, 306)	0.22360679774997896
  (144, 534)	0.22360679774997896
  (144, 700)	0.4472135954999579
  (144, 1209)	0.22360679774997896
  (144, 1267)	0.22360679774997896
  (144, 1442)	0.22360679774997896
  (144, 1531)	0.2236067977499789

In [9]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
preds = nb.predict(X_test)

In [10]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.81      0.71      0.76        77
           1       0.72      0.81      0.77        70

    accuracy                           0.76       147
   macro avg       0.77      0.76      0.76       147
weighted avg       0.77      0.76      0.76       147



In [11]:
#save classification
from io import StringIO
classification = classification_report(y_test, preds)
s = StringIO(classification)
with open('classification_nb_tweet.csv', 'w') as f:
	for line in s:
		f.write(line)

In [12]:
print(accuracy_score(y_test, preds))

0.7619047619047619


In [13]:
#save hasil accuracy
accuracy = accuracy_score(y_test, preds)
a = np.asarray([accuracy])
np.savetxt("accuracy_nb_tweet.csv", a, delimiter=",", fmt='%s')

In [14]:
confusion_matrix(y_test, preds)

array([[55, 22],
       [13, 57]], dtype=int64)