In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('news.csv')

In [3]:
dataset

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,1
1,U.S. conservative leader optimistic of common ...,0
2,"Trump proposes U.S. tax overhaul, stirs concer...",0
3,Court Forces Ohio To Allow Millions Of Illega...,1
4,Democrats say Trump agrees to work on immigrat...,0
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,1
9896,Trump consults Republican senators on Fed chie...,0
9897,Trump lawyers say judge lacks jurisdiction for...,0
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,1


In [4]:
dataset.shape

(9900, 2)

In [5]:
dataset.isnull().sum()

Text     0
label    0
dtype: int64

In [6]:
dataset['label'].value_counts()

label
1    5000
0    4900
Name: count, dtype: int64

In [7]:
dataset = dataset.sample(frac = 1)

In [8]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
ps = WordNetLemmatizer()

In [10]:
stopwords = stopwords.words('english')

In [11]:
def clean_row_dataset(row):
    row = row.lower()
    row = re.sub('[^a-zA-Z]', ' ', row)
    token = row.split()
    news = [ps.lemmatize(word) for word in token if not word in stopwords]
    cleanned_news = ' '.join(news)
    return cleanned_news

In [12]:
dataset['Text']

5992    House committee passes sweeping tax bill WASHI...
4578    Alabama Senate race winner urges Republican ri...
5912     WATCH: Republican House Oversight Chair Liter...
2588     Trump Can’t Find Anyone To Serve On His Natio...
9645    U.S. Senate panel targets Chinese banks with N...
                              ...                        
6377    Senate Republicans tie tax plan to repeal of k...
2863     Prince Charles Reminds Everyone What the 1930...
5729    Trump to nominate Senate aide for Federal Ener...
7702     Students With Trump Sign March Down Halls Cha...
4046    China defends ally Pakistan after Trump critic...
Name: Text, Length: 9900, dtype: object

In [13]:
dataset['Text'] = dataset['Text'].apply(lambda x : clean_row_dataset(x))

In [14]:
dataset['Text']

5992    house committee pass sweeping tax bill washing...
4578    alabama senate race winner urge republican riv...
5912    watch republican house oversight chair literal...
2588    trump find anyone serve national security team...
9645    u senate panel target chinese bank north korea...
                              ...                        
6377    senate republican tie tax plan repeal key obam...
2863    prince charles reminds everyone like taking sh...
5729    trump nominate senate aide federal energy regu...
7702    student trump sign march hall chanting white p...
4046    china defends ally pakistan trump criticism be...
Name: Text, Length: 9900, dtype: object

In [15]:
X = dataset.iloc[:9900, 0]
y = dataset.iloc[:9900, 1]

In [16]:
X

5992    house committee pass sweeping tax bill washing...
4578    alabama senate race winner urge republican riv...
5912    watch republican house oversight chair literal...
2588    trump find anyone serve national security team...
9645    u senate panel target chinese bank north korea...
                              ...                        
6377    senate republican tie tax plan repeal key obam...
2863    prince charles reminds everyone like taking sh...
5729    trump nominate senate aide federal energy regu...
7702    student trump sign march hall chanting white p...
4046    china defends ally pakistan trump criticism be...
Name: Text, Length: 9900, dtype: object

In [17]:
y

5992    0
4578    0
5912    1
2588    1
9645    0
       ..
6377    0
2863    1
5729    0
7702    1
4046    0
Name: label, Length: 9900, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
train_data, test_data, train_label, test_label = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [20]:
vectorizer = TfidfVectorizer(max_features=10000, lowercase=False, ngram_range=(1, 2))

In [21]:
vec_train_data = vectorizer.fit_transform(train_data)

In [22]:
vec_test_data = vectorizer.fit_transform(test_data)

In [23]:
vec_train_data = vec_train_data.toarray()

In [24]:
vec_test_data = vec_test_data.toarray()

In [25]:
vec_train_data.shape, vec_test_data.shape

((7920, 10000), (1980, 10000))

In [26]:
vec_train_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
training_data = pd.DataFrame(vec_train_data, columns = vectorizer.get_feature_names_out())
testing_data = pd.DataFrame(vec_train_data, columns = vectorizer.get_feature_names_out())

In [28]:
testing_data

Unnamed: 0,aaron,abadi,abandon,abandoned,abbas,abc,abc news,abc week,abe,abide,...,younger,youth,youtube,zach,zach gibson,zero,zinke,zinke said,zone,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model

In [29]:
from sklearn.svm import SVC

In [30]:
classifier = SVC(kernel = 'linear', random_state = 0)

In [31]:
classifier.fit(vec_train_data, train_label)
classifier.fit(vec_test_data, test_label)

In [32]:
y_pred_train = classifier.predict(vec_train_data)
y_pred_test = classifier.predict(vec_test_data)

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
accuracy_score(train_label, y_pred_train)

0.6941919191919191

In [35]:
accuracy_score(test_label, y_pred_test)

1.0

In [36]:
txt = input('Enter the news')
news = clean_row_dataset(txt)
transformed_news = vectorizer.transform([news]).toarray()
pred = classifier.predict(transformed_news)

Enter the news Democrats say Trump agrees to work on immigration bill, wall in dispute WASHINGTON (Reuters) - President Donald Trump and Democratic leaders in the U.S. Congress have agreed to work together on legislation to protect â€œDreamers,â€ the illegal immigrants who were children when they entered the United States, the lawmakers said on Wednesday, although a dispute erupted over exactly what had been agreed. Following a dinner with Trump at the White House, Senate Democratic leader Chuck Schumer and House of Representatives Democratic leader Nancy Pelosi said the â€œproductive meetingâ€ focused on â€œDACA,â€ a program established by former President Barack Obama. â€œWe agreed to enshrine the protections of DACA (Deferred Action for Childhood Arrivals) into law quickly, and to work out a package of border security, excluding the wall, thatâ€™s acceptable to both sides,â€ Schumer and Pelosi said in a statement. All year, Democrats have insisted that they will block any legisl

In [37]:
if pred == 0:
    print('News is Correct')
else:
    print('News is fake')

News is Correct
