In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import plotly.graph_objs as go
import plotly.offline as py

In [2]:
df = pd.read_csv('data/Labeling_data.csv', sep=',')
#print (df)
print(df.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Tanggal&Waktu', 'Username', 'Clean_text',
       'Tweet_tokenization', 'Stop_removal', 'Steaming', 'Positive',
       'Negative', 'Neutral', 'Compound', 'Sentiment'],
      dtype='object')


In [7]:
#Polaritas Label
def convert(polarity):
    if polarity == 'Positive':
        return 1
    elif polarity == 'Netral':
        return 0
    else:
        return -1

df['Polarity'] = df['Sentiment'].apply(convert)

In [8]:
df.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tanggal&Waktu,Username,Clean_text,Tweet_tokenization,Stop_removal,Steaming,Positive,Negative,Neutral,Compound,Sentiment,Polarity
0,0,0,2022-06-14 08:34:52+00:00,iammasterblade,wl spots giveaway like and fo...,"['', 'wl', 'spots', 'giveaway', 'like', 'and',...","['wl', 'spots', 'giveaway', 'like', 'follow', ...","['wl', 'spots', 'giveaway', 'like', 'follow', ...",0.444,0.0,0.556,0.6808,Positive,1
1,1,1,2022-06-14 08:34:51+00:00,naehrstff_nft,some true words here where in the world are ...,"['', 'some', 'true', 'words', 'here', 'where',...","['true', 'words', 'world', 'friend']","['true', 'words', 'world', 'friend']",0.375,0.0,0.625,0.7184,Positive,1
2,2,2,2022-06-14 08:34:51+00:00,mdzahid85418836,runx airdrop million rnx for k partici...,"['', 'runx', 'airdrop', 'million', 'rnx', 'for...","['runx', 'airdrop', 'million', 'rnx', 'k', 'pa...","['runx', 'airdrop', 'million', 'rnx', 'k', 'pa...",0.198,0.0,0.802,0.5719,Positive,1
3,3,3,2022-06-14 08:34:51+00:00,annelie00284967,discord officially opening gt gt p...,"['', 'discord', 'officially', 'opening', 'gt',...","['discord', 'officially', 'opening', 'gt', 'gt...","['discord', 'officially', 'opening', 'gt', 'gt...",0.422,0.217,0.361,0.6249,Positive,1
4,4,4,2022-06-14 08:34:51+00:00,tkirwi,discord officially opening gt gt p...,"['', 'discord', 'officially', 'opening', 'gt',...","['discord', 'officially', 'opening', 'gt', 'gt...","['discord', 'officially', 'opening', 'gt', 'gt...",0.422,0.217,0.361,0.6249,Positive,1


In [9]:
val=df['Polarity'].value_counts().reset_index()
val.columns=['Polarity','Count']

data=[go.Bar(
  x=val.Polarity,
y=val.Count
)]
layout=go.Layout(
    xaxis=dict(title='Polarity'),
    yaxis=dict(title='Count'))
fig=go.Figure(data=data,layout=layout)
py.iplot(fig)

In [10]:
X = df['Clean_text']
y = df['Polarity']

In [11]:
#Splitting Data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [12]:
#Count Vectorizer
#Kata-kata ini kemudian perlu dienkode sebagai bilangan bulat, atau nilai floating-point, untuk digunakan sebagai masukan dalam algoritme pembelajaran mesin. Proses ini disebut ekstraksi fitur (atau vektorisasi)

vectorization = CountVectorizer()
Xv_train = vectorization.fit_transform(X_train)
Xv_test = vectorization.transform(X_test)

In [13]:
Xv_train.shape

(750, 1832)

In [14]:
Xv_test.shape

(250, 1832)

In [15]:
#Classifier
mnb = MultinomialNB()
mnb.fit(Xv_train, y_train)

MultinomialNB()

In [16]:
preds = mnb.predict(Xv_test)

In [17]:
mnb.score(Xv_test, y_test)

0.856

In [18]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          -1       0.76      0.76      0.76        76
           1       0.90      0.90      0.90       174

    accuracy                           0.86       250
   macro avg       0.83      0.83      0.83       250
weighted avg       0.86      0.86      0.86       250



In [19]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    preds = mnb.predict(new_xv_test)
    return preds

In [20]:
from sklearn.metrics import accuracy_score

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, preds)))

Model accuracy score: 0.8560


In [21]:
y_pred_train = mnb.predict(Xv_train)

y_pred_train

array([ 1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1, -1,  1,  1, -1,
       -1,  1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,
       -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
       -1,  1,  1,  1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1,
       -1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1, -1, -1, -1,
       -1,  1,  1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1,
       -1,  1,  1,  1,  1, -1,  1, -1, -1, -1,  1, -1,  1,  1, -1,  1, -1,
        1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,
        1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1,
        1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,
       -1,  1, -1, -1, -1

In [22]:
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))

Training-set accuracy score: 0.9493


In [23]:
#Overfitting dan underfitting
print('Training set accuracy score: {:.4f}'.format(mnb.score(Xv_train, y_train)))

print('Test set accuracy score: {:.4f}'.format(mnb.score(Xv_test, y_test)))

Training set accuracy score: 0.9493
Test set accuracy score: 0.8560
