In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection         import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model            import LogisticRegression
from sklearn.metrics                 import accuracy_score

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
RAND_SEED = 1502

/kaggle/input/fake-news-classification/WELFake_Dataset.csv


In [2]:
df = pd.read_csv('/kaggle/input/fake-news-classification/WELFake_Dataset.csv')

### Label: 0 (fake) and 1 (real)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### There can be check that rows with missing information, lack 'title' column text
#### Therefore, we can clean up data like this:

In [4]:
data = df.loc[df.notnull().all(axis=1)]

In [5]:
data['text'] = data['title'] + ' ' + data['text']
data.dropna(subset=['text', 'title'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['title'] + ' ' + data['text']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['text', 'title'], inplace=True)


In [6]:
X = data['text']
y = data['label']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state = RAND_SEED)

In [8]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features  = feature_extraction.fit_transform(X_test)

y_train = y_train.astype('int')
y_test  =  y_test.astype('int')

In [9]:
model = LogisticRegression()

In [10]:
model.fit(X_train_features, y_train)

train_prediction = model.predict(X_train_features)
accuracy_train = accuracy_score(y_train, train_prediction)

print('Accuracy score on train data: ', int(accuracy_train * 1000) / 10)

Accuracy score on train data:  96.3


In [11]:
model.fit(X_test_features, y_test)

test_prediction =  model.predict(X_test_features)
accuracy_test   = accuracy_score(y_test, test_prediction)

print('Accuracy score on test data: ', int(accuracy_test * 1000) / 10)

Accuracy score on test data:  95.4


In [12]:
message = ["WASHINGTON — The Supreme Court rejected on Monday an appeal from Texas officials seeking to restore"]
message_features = feature_extraction.transform(message)
prediction = model.predict(message_features)

if prediction[0] == 0:
    print("SPAM")
else:
    print("AUTHENTIC")

SPAM
