In [1]:
import pandas as pd
import re
import numpy as np
import joblib
import pickle
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('dataset_preprocessed_tweet.csv')
df

Unnamed: 0,Label,Tweet,Tokenization,Stop_Removal,Tweet_Stemmed
0,Non_HS,fadli zon minta mendagri segera menonaktifkan ...,"['fadli', 'zon', 'minta', 'mendagri', 'segera'...","['fadli', 'zon', 'mendagri', 'menonaktifkan', ...","['fadli', 'zon', 'mendagri', 'nonaktif', 'ahok..."
1,Non_HS,mereka terus melukai aksi dalam rangka memenja...,"['mereka', 'terus', 'melukai', 'aksi', 'dalam'...","['melukai', 'aksi', 'rangka', 'memenjarakan', ...","['luka', 'aksi', 'rangka', 'penjara', 'ahok', ..."
2,Non_HS,sylvi bagaimana gurbernur melakukan kekerasan ...,"['sylvi', 'bagaimana', 'gurbernur', 'melakukan...","['sylvi', 'gurbernur', 'kekerasan', 'perempuan...","['sylvi', 'gurbernur', 'keras', 'perempuan', '..."
3,Non_HS,ahmad dhani tak puas debat pilkada masalah jal...,"['ahmad', 'dhani', 'tak', 'puas', 'debat', 'pi...","['ahmad', 'dhani', 'puas', 'debat', 'pilkada',...","['ahmad', 'dhani', 'puas', 'debat', 'pilkada',..."
4,Non_HS,waspada ktp palsu kawal pilkada https oooer...,"['waspada', 'ktp', 'palsu', 'kawal', 'pilkada'...","['waspada', 'ktp', 'palsu', 'kawal', 'pilkada'...","['waspada', 'ktp', 'palsu', 'kawal', 'pilkada'..."
...,...,...,...,...,...
482,HS,muka babi ahok tuh mirip serbet lantai btp,"['muka', 'babi', 'ahok', 'tuh', 'mirip', 'serb...","['muka', 'babi', 'ahok', 'tuh', 'serbet', 'lan...","['muka', 'babi', 'ahok', 'tuh', 'serbet', 'lan..."
483,HS,betul bang hancurkan merka bang musnahkan chin...,"['betul', 'bang', 'hancurkan', 'merka', 'bang'...","['bang', 'hancurkan', 'merka', 'bang', 'musnah...","['bang', 'hancur', 'merka', 'bang', 'musnah', ..."
484,HS,sapa bilang ahok anti korupsi klo grombolannya...,"['sapa', 'bilang', 'ahok', 'anti', 'korupsi', ...","['sapa', 'bilang', 'ahok', 'anti', 'korupsi', ...","['sapa', 'bilang', 'ahok', 'anti', 'korupsi', ..."
485,HS,juga ngimpi sentilin biji babi ahok pcetar pce...,"['juga', 'ngimpi', 'sentilin', 'biji', 'babi',...","['ngimpi', 'sentilin', 'biji', 'babi', 'ahok',...","['ngimpi', 'sentilin', 'biji', 'babi', 'ahok',..."


In [3]:
def convert(polarity):
	if polarity == "HS":
		return 1
	else:
		return 0

In [4]:
df['Polarity'] = df['Label'].apply(convert)

In [5]:
x = df['Tweet_Stemmed']
y = df['Polarity']

In [6]:
#vectorizer = CountVectorizer()
#X = vectorizer.fit_transform(df['isi'])

bow_transformer = CountVectorizer()
print(df['Tweet_Stemmed'].shape)
X = bow_transformer.fit_transform(df['Tweet_Stemmed'])

print(X.toarray())
print('Shape of Sparse Matrix: ', X.shape)
print('Amount of Non-Zero Occurrences: ', X.nnz)

#TFIDF Transform
tf_transform = TfidfTransformer(use_idf=False).fit(X)
X = tf_transform.transform(X)
print(X.shape)

(487,)
[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Shape of Sparse Matrix:  (487, 1689)
Amount of Non-Zero Occurrences:  4039
(487, 1689)


In [7]:
density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))
print('Density: {}'.format((density)))

Density: 0.4910381584914102


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.3)
print(X_test)

  (0, 19)	0.30151134457776363
  (0, 446)	0.30151134457776363
  (0, 464)	0.30151134457776363
  (0, 529)	0.30151134457776363
  (0, 579)	0.30151134457776363
  (0, 650)	0.30151134457776363
  (0, 757)	0.30151134457776363
  (0, 958)	0.30151134457776363
  (0, 1077)	0.30151134457776363
  (0, 1256)	0.30151134457776363
  (0, 1448)	0.30151134457776363
  (1, 90)	0.2773500981126146
  (1, 109)	0.2773500981126146
  (1, 190)	0.5547001962252291
  (1, 668)	0.2773500981126146
  (1, 773)	0.2773500981126146
  (1, 839)	0.2773500981126146
  (1, 857)	0.2773500981126146
  (1, 1124)	0.2773500981126146
  (1, 1336)	0.2773500981126146
  (1, 1683)	0.2773500981126146
  (2, 31)	0.25
  (2, 306)	0.25
  (2, 497)	0.25
  (2, 700)	0.5
  :	:
  (143, 31)	0.3333333333333333
  (143, 81)	0.3333333333333333
  (143, 125)	0.3333333333333333
  (143, 245)	0.3333333333333333
  (143, 631)	0.3333333333333333
  (143, 772)	0.3333333333333333
  (143, 995)	0.3333333333333333
  (143, 1098)	0.3333333333333333
  (143, 1566)	0.3333333333333333

In [9]:
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [10]:
y_pred = classifier.predict(X_test)

In [11]:
# Membuat confusion matrix
confusion_matrix(y_test, y_pred)

array([[63, 11],
       [17, 56]], dtype=int64)

In [12]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82        74
           1       0.84      0.77      0.80        73

    accuracy                           0.81       147
   macro avg       0.81      0.81      0.81       147
weighted avg       0.81      0.81      0.81       147



In [13]:
#save classification
from io import StringIO
classification = classification_report(y_test, y_pred)
s = StringIO(classification)
with open('classification_svm_tweet.csv', 'w') as f:
	for line in s:
		f.write(line)

In [14]:
print(accuracy_score(y_test, y_pred))

0.8095238095238095


In [15]:
#save hasil accuracy
accuracy = accuracy_score(y_test, y_pred)
a = np.asarray([accuracy])
np.savetxt("accuracy_svm_tweet.csv", a, delimiter=",", fmt='%s')