In [1]:
import numpy as np
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
news_df = pd.read_csv('data.csv')

In [3]:
news_df.head()

Unnamed: 0,id,label,date,title,summary,source
0,83,1,2023-03-04 06:18:13+00,"Depo Plumpang Terbakar, Anggota DPR Minta Pert...",Anggota Komisi VII DPR RI Rofik Hananto menyay...,tempo
1,84,0,2023-03-04 06:04:38+00,Jokowi Perintahkan Wapres Ma'ruf Amin Tinjau L...,Presiden Joko Widodo telah memerintahkan Wakil...,tempo
2,85,0,2023-03-04 06:18:04+00,HNW Mendukung Jamaah Umroh First Travel Dapatk...,Wakil Ketua MPR RI Dr. H. M. Hidayat Nur Wahid...,tempo
3,86,0,2023-03-04 06:44:10+00,Tim Dokkes Polri Telah Terima 14 Kantong Jenaz...,Tim Kedokteran dan Kesehatan (Dokkes) Polri te...,tempo
4,87,0,2023-03-04 06:38:57+00,Bamsoet Ajak Komunitas Otomotif Kembangkan Per...,Ketua MPR RI Bambang Soesatyo telah diangkat s...,tempo


In [4]:
news_df.isnull().sum()

id          0
label       0
date        0
title       0
summary    10
source      0
dtype: int64

In [5]:
news_df.shape

(22009, 6)

In [6]:
news_df = news_df.fillna(' ')

In [7]:
news_df.isnull().sum()

id         0
label      0
date       0
title      0
summary    0
source     0
dtype: int64

In [8]:
news_df['konten'] = news_df['title']+' '+news_df['summary']

In [9]:
news_df

Unnamed: 0,id,label,date,title,summary,source,konten
0,83,1,2023-03-04 06:18:13+00,"Depo Plumpang Terbakar, Anggota DPR Minta Pert...",Anggota Komisi VII DPR RI Rofik Hananto menyay...,tempo,"Depo Plumpang Terbakar, Anggota DPR Minta Pert..."
1,84,0,2023-03-04 06:04:38+00,Jokowi Perintahkan Wapres Ma'ruf Amin Tinjau L...,Presiden Joko Widodo telah memerintahkan Wakil...,tempo,Jokowi Perintahkan Wapres Ma'ruf Amin Tinjau L...
2,85,0,2023-03-04 06:18:04+00,HNW Mendukung Jamaah Umroh First Travel Dapatk...,Wakil Ketua MPR RI Dr. H. M. Hidayat Nur Wahid...,tempo,HNW Mendukung Jamaah Umroh First Travel Dapatk...
3,86,0,2023-03-04 06:44:10+00,Tim Dokkes Polri Telah Terima 14 Kantong Jenaz...,Tim Kedokteran dan Kesehatan (Dokkes) Polri te...,tempo,Tim Dokkes Polri Telah Terima 14 Kantong Jenaz...
4,87,0,2023-03-04 06:38:57+00,Bamsoet Ajak Komunitas Otomotif Kembangkan Per...,Ketua MPR RI Bambang Soesatyo telah diangkat s...,tempo,Bamsoet Ajak Komunitas Otomotif Kembangkan Per...
...,...,...,...,...,...,...,...
22004,44725,0,2023-04-02 05:46:00+00,Indonesia Batal Jadi Tuan Rumah Piala Dunia U-...,Jokowi mengaku pusing dengan urusan Piala Duni...,tempo,Indonesia Batal Jadi Tuan Rumah Piala Dunia U-...
22005,44726,0,2023-04-02 05:58:27+00,Rentetan Kebakaran Pertamina 2023: Depo Plumpa...,Sejumlah kebakaran terjadi pada 2023 yang terk...,tempo,Rentetan Kebakaran Pertamina 2023: Depo Plumpa...
22006,44733,0,2023-04-02 06:00:00+00,Update Ledakan di Kilang Dumai: Pertamina Foku...,PT Kilang Pertamina Internasional (PT KPI) Ref...,cnbcindonesia,Update Ledakan di Kilang Dumai: Pertamina Foku...
22007,44745,0,2023-04-02 06:09:55+00,Rilis Lagu Marhaban Yaa Ramadhan Ayu Azhari Li...,"Ayu Ting-Ting merilis single religi ""Marhaban ...",kumparan,Rilis Lagu Marhaban Yaa Ramadhan Ayu Azhari Li...


In [10]:
X = news_df.drop('label',axis=1)
y = news_df['label']

In [11]:
print(X)

          id                    date  \
0         83  2023-03-04 06:18:13+00   
1         84  2023-03-04 06:04:38+00   
2         85  2023-03-04 06:18:04+00   
3         86  2023-03-04 06:44:10+00   
4         87  2023-03-04 06:38:57+00   
...      ...                     ...   
22004  44725  2023-04-02 05:46:00+00   
22005  44726  2023-04-02 05:58:27+00   
22006  44733  2023-04-02 06:00:00+00   
22007  44745  2023-04-02 06:09:55+00   
22008  44746  2023-04-02 06:05:27+00   

                                                   title  \
0      Depo Plumpang Terbakar, Anggota DPR Minta Pert...   
1      Jokowi Perintahkan Wapres Ma'ruf Amin Tinjau L...   
2      HNW Mendukung Jamaah Umroh First Travel Dapatk...   
3      Tim Dokkes Polri Telah Terima 14 Kantong Jenaz...   
4      Bamsoet Ajak Komunitas Otomotif Kembangkan Per...   
...                                                  ...   
22004  Indonesia Batal Jadi Tuan Rumah Piala Dunia U-...   
22005  Rentetan Kebakaran Pertamina 202

In [12]:
ps = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [13]:
news_df['konten'] = news_df['konten'].apply(stemming)

In [14]:
news_df['konten']

0        depo plumpang terbakar anggota dpr minta perta...
1        jokowi perintahkan wapr ruf amin tinjau lokasi...
2        hnw mendukung jamaah umroh first travel dapatk...
3        tim dokk polri telah terima kantong jenazah ko...
4        bamsoet ajak komunita otomotif kembangkan pere...
                               ...                        
22004    indonesia batal jadi tuan rumah piala dunia u ...
22005    rentetan kebakaran pertamina depo plumpang kap...
22006    updat ledakan di kilang dumai pertamina foku p...
22007    rili lagu marhaban yaa ramadhan ayu azhari lib...
22008    polisi bakal ekshumasi jenazah istri yang dira...
Name: konten, Length: 22009, dtype: object

In [15]:
X = news_df['konten'].values
y = news_df['label'].values

In [16]:
vector = TfidfVectorizer()
vector.fit(X)
X = vector.transform(X)

In [17]:
print(X)

  (0, 37138)	0.15458624525015574
  (0, 36958)	0.09475166694700932
  (0, 35598)	0.07926465709832897
  (0, 35463)	0.04816033708235881
  (0, 35182)	0.12267196566140516
  (0, 35094)	0.13981669189729012
  (0, 34959)	0.11713933748490761
  (0, 34207)	0.07344092189336457
  (0, 30248)	0.41936577653209967
  (0, 30022)	0.07369382139356111
  (0, 28048)	0.3571708583040855
  (0, 27511)	0.20064029247816492
  (0, 25328)	0.12157194085832829
  (0, 25313)	0.23559140189658995
  (0, 24787)	0.04041653356296567
  (0, 22727)	0.107269719161015
  (0, 22719)	0.09742753477600082
  (0, 22448)	0.08520712444164671
  (0, 22431)	0.20968288826604983
  (0, 22052)	0.1379373102516397
  (0, 20930)	0.08890758738742327
  (0, 20080)	0.09061688618321227
  (0, 19356)	0.07017401624875215
  (0, 16913)	0.09157449835832399
  (0, 15173)	0.10460272064473308
  :	:
  (22008, 17175)	0.24530438648834113
  (22008, 15985)	0.08212208595988277
  (22008, 15703)	0.10064698896050997
  (22008, 15011)	0.054805506765301826
  (22008, 14891)	0.04419

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=2)

In [19]:
X_train.shape

(17607, 38356)

In [20]:
model = LogisticRegression()
model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [21]:
# on training set
train_y_pred = model.predict(X_train)
print(accuracy_score(train_y_pred,Y_train))

0.7736695632418924


In [22]:
# on testing set
testing_y_pred = model.predict(X_test)
print(accuracy_score(testing_y_pred,Y_test))

0.5390731485688324


In [23]:
input_data = X_test[10]
prediction = model.predict(input_data)

In [24]:
if prediction[0] == 0:
    print('The News Is Real')
else:
    print('The News is Fake')

The News Is Real


In [25]:
news_df['konten'][2]

'hnw mendukung jamaah umroh first travel dapatkan haknya wakil ketua mpr ri dr h hidayat nur wahid atau hnw menerima kunjungan perwakilan korban kasu biro perjalanan haji dan umroh first travel yang tergabung dalam paguyuban first travel indonesia pertemuan tersebut berlangsung di ruang kerja wakil ketua mpr gedung n'