In [2]:
import numpy as np
import pandas as pd
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
#Initialize the datasets
fake_data = pd.read_csv("Fake.csv")
real_data = pd.read_csv("True.csv")

#Create labels for the datasets
fake_data['label'] = "FAKE"
real_data['label'] = "REAL"

#combine the two datasets
data = pd.concat([fake_data,real_data]).reset_index(drop=True)

#Shuffling to randomize the data to avoid biased results
data = data.sample(frac = 1)

#Preprocess data to clean it of unnecessary characters such as punctuations,urls,etc that may confuse the model during training
def preprocess(text):
    text = text.lower() #converts all characters to lowercase for uniformity
    text = re.sub('\[.*?\]','',text) #remove non-alphabetics
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://S+/www\.\S+','',text) #remove urls
    text = re.sub('[%s]' % re.escape(string.punctuation),'',text) #remove punctuations
    text = re.sub('<.*?>+','',text) #remove HTML tags
    text = re.sub('\w*\d\w*','',text) #remove words with digits
    text = re.sub('\n','',text) #remove new lines
    
    return text

#Apply the preprocessing
data['text'] = data['text'].apply(preprocess)

x,y = data['text'], data['label']

#split the dataset for training and testing
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2)

#Initialize the vectorizer
vectorizer = TfidfVectorizer(stop_words = "english", max_df = 0.7)

#vectorize the split sets
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)


In [4]:
#The LinearSVC classifier
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(x_train_vectorized, y_train)
clf.score(x_test_vectorized, y_test)

0.994543429844098

In [5]:
#The LogisticRegression classifier
from sklearn.linear_model import LogisticRegression 
LR = LogisticRegression()
LR.fit(x_train_vectorized, y_train)
LR.score(x_test_vectorized, y_test)

0.9839643652561247

In [6]:
#The DecisionTree classifier
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(x_train_vectorized, y_train)
DT.score(x_test_vectorized, y_test)

0.9964365256124722

In [7]:
#The GradientBoosting classifier
from sklearn.ensemble import GradientBoostingClassifier
GB = GradientBoostingClassifier(random_state = 0)
GB.fit(x_train_vectorized, y_train)
GB.score(x_test_vectorized, y_test)

0.995879732739421

In [8]:
#The RandomForest classifier
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(random_state = 0)
RF.fit(x_train_vectorized, y_train)
RF.score(x_test_vectorized, y_test)

0.9920935412026726

In [11]:
#Define a function that will produce the predictions given a text input
def Detect(news):
    news_txt = {'text':[news]} #create a dictionary to prepare input for format conversion to a dataframe
    news_data = pd.DataFrame(news_txt) 
    news_data['text'] = news_data['text'].apply(preprocess)
    new_x_test = news_data['text']
    new_x_test_vectorized = vectorizer.transform(new_x_test)
    
    #classifier predictions
    predict_clf = clf.predict(new_x_test_vectorized)
    predict_lr = LR.predict(new_x_test_vectorized)
    predict_dt = DT.predict(new_x_test_vectorized)
    predict_gb = GB.predict(new_x_test_vectorized)
    predict_rf = RF.predict(new_x_test_vectorized)  
    
    result = "\n\nCLF Prediction: {} \nLR Prediction: {} \nDT Prediction: {} \nGB Prediction: {} \nRF Prediction: {}".format(
           predict_clf[0],
           predict_lr[0],
           predict_dt[0],
           predict_gb[0],
           predict_rf[0]
    )
    
    return result

News = str(input())
print(Detect(News))

U.S. House tax chairman proposes tweaks to tax-cut bill,"WASHINGTON (Reuters) - A proposed tax on $2.6 trillion in foreign profits held offshore by U.S. multinational corporations would be raised under an amendment offered on Thursday by the chairman of the U.S. House of Representatives tax committee to his own tax-cut bill. Texas Republican Representative Kevin Brady called for raising the tax to 14 percent for cash and liquid assets and 7 percent for illiquid assets, up from his earlier proposal of 12 percent and 5 percent, respectively. The proposed increase would raise more federal revenue under the tax bill being offered by House Republicans, with Senate Republicans offering a separate bill. ",politicsNews,"November 9, 2017 "														


CLF Prediction: REAL 
LR Prediction: REAL 
DT Prediction: REAL 
GB Prediction: REAL 
RF Prediction: REAL


In [12]:
News = str(input())
print(Detect(News))

 Sheriff David Clarke Becomes An Internet Joke For Threatening To Poke People â€˜In The Eyeâ€™,"On Friday, it was revealed that former Milwaukee Sheriff David Clarke, who was being considered for Homeland Security Secretary in Donald Trump s administration, has an email scandal of his own.In January, there was a brief run-in on a plane between Clarke and fellow passenger Dan Black, who he later had detained by the police for no reason whatsoever, except that maybe his feelings were hurt. Clarke messaged the police to stop Black after he deplaned, and now, a search warrant has been executed by the FBI to see the exchanges.Clarke is calling it fake news even though copies of the search warrant are on the Internet. I am UNINTIMIDATED by lib media attempts to smear and discredit me with their FAKE NEWS reports designed to silence me,  the former sheriff tweeted.  I will continue to poke them in the eye with a sharp stick and bitch slap these scum bags til they get it. I have been attacked 