In [2]:
import numpy as numpy
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import re

In [3]:
data = pd.read_csv('news.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
data.sample(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
1211,8550,Putin: Crimean Integration Into Russian Legal ...,Get short URL 0 23 0 0 The integration of Crim...,FAKE
4801,6226,PENNSYLVANIA GUN STORE tells customers “Muslim...,PENNSYLVANIA GUN STORE tells customers “Muslim...,FAKE
980,9232,I Am A Syrian Living in Syria: “It was Never a...,"Theme: 9/11 &‘War on Terrorism’ , Crimes again...",FAKE
238,10357,Hillary IMPLODES: Trump 'Took Everything...Pai...,Hillary Clinton can’t believe she’s losing.,FAKE
6027,2512,Surprise! Donald Trump is wrong about immigran...,Donald Trump's two-week-old campaign has been ...,REAL


In [5]:
data['label'].value_counts()

label
REAL    3171
FAKE    3164
Name: count, dtype: int64

In [6]:
data.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [7]:
data = data[['title','text', 'label']]
data.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [8]:
data['num_label'] = data['label'].map(
    {
    'REAL': 1,
    'FAKE': 0,
    }
)
data.head()

Unnamed: 0,title,text,label,num_label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1


In [9]:
X = data[['title', 'text']]
X.head()

Unnamed: 0,title,text
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello..."
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T..."
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...


In [10]:
Y = data['num_label']
Y.head()

0    0
1    0
2    1
3    0
4    1
Name: num_label, dtype: int64

In [11]:
def clean_text(text):
    output = text.lower()
    output = re.sub(r'[^A-Za-z0-9 ]+','',output)
    return output
X['title'] = X['title'].apply(clean_text)
X['text'] = X['text'].apply(clean_text)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['title'] = X['title'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['text'] = X['text'].apply(clean_text)


Unnamed: 0,title,text
0,you can smell hillarys fear,daniel greenfield a shillman journalism fellow...
1,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...
2,kerry to go to paris in gesture of sympathy,us secretary of state john f kerry said monday...
3,bernie supporters on twitter erupt in anger ag...,kaydee king kaydeeking november 9 2016 the le...
4,the battle of new york why this primary matters,its primary day in new york and frontrunners h...


In [12]:
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X['title'] + ' ' + X['text'])

In [13]:
X_vec.shape

(6335, 141175)

In [14]:
x_train,x_test,y_train,y_test = train_test_split(X_vec, Y, test_size=0.2, random_state=42, stratify=Y)

In [15]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

In [16]:
predictions = model.predict(x_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.80      0.81      0.80       633
           1       0.81      0.79      0.80       634

    accuracy                           0.80      1267
   macro avg       0.80      0.80      0.80      1267
weighted avg       0.80      0.80      0.80      1267



In [17]:
joblib.dump(model,'tree_model.pkl')
joblib.dump(vectorizer, 'tree_vectorizer.pkl')

['tree_vectorizer.pkl']

In [20]:
data[2:]['text'].values[0]
data[2:]['title'].values[0]

'Kerry to go to Paris in gesture of sympathy'