In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
import regex as re
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
import string
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
def loadCsvs(typeOf):
    if bool(typeOf):
        files = ['True.csv','true_news_stories_for_further_testing.csv']
        true = pd.read_csv(files[0], usecols=[0,1])
        for file in files[1:]:
            mergeFile = pd.read_csv(file, usecols=[0,1])
            mergeFile.columns = ['title', 'text']
            true = pd.concat([true, mergeFile])
        return true
    else: 
        files = ['Fake.csv','fake_news_stories_for_further_testing.csv']
        fake = pd.read_csv(files[0], usecols=[0,1])
        for file in files[1:]:
            mergeFile = pd.read_csv(file, usecols=[0,1])
            mergeFile.columns = ['title', 'text']
            fake = pd.concat([fake, mergeFile])
        return fake
    

In [3]:
true = loadCsvs(1)
fake = loadCsvs(0)
true['class'], fake['class']=[1,0]
stopwords_eng = stopwords.words('english')

In [4]:
df = pd.concat([true, fake])

In [5]:
def normalizeTxt(text):
    if type(text) != str:
        return text
    text = text.lower()
    #removes stopwords, html formatting and numbers
    text = re.sub(r"(((?<=\s|^)(%s)(?=\s|$)|<.*>|\d))" % '|'.join(stopwords_eng),'',text)
    return text

In [19]:
class normalizeDF(BaseEstimator, TransformerMixin):    
    def fit(self, X,y=None):
        return self

    def transform(self, X):
        if type(X)==list:
            return pd.DataFrame({'title':X[0],'text':X[1]}, index=[0])

        print(X, type(X))
        X_new = X.copy()
        X_new.columns = ['title','text']
        X_new = X_new.map(normalizeTxt)
        return X_new

In [20]:
X = df.iloc[:,0:2]
y = df[['class']]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7)

In [22]:
vectorizer = TfidfVectorizer()
ct = ColumnTransformer([
    ('title', vectorizer, 0),
    ('text', vectorizer, 1)
])

In [23]:
pipe = make_pipeline(normalizeDF(),ct, LogisticRegression())
pipe

In [26]:
pipe.fit(X_train, y_train)

                                                   title  \
16207  Iceland opposition leader says will not rule o...   
19961  Hillary Supporters Can Now Add “ANTI-TRUMP” TO...   
1287    Look At All The Trump Supporters At The Rally...   
10941  Judges find two North Carolina congressional d...   
15248  For some Palestinians in love, this slum is th...   
...                                                  ...   
18564  From batons to barbecues, Catalan vote exposes...   
4678    Poll: Clinton Leads Trump By 10 Points, Obama...   
23329  DALLAS MAIDAN: Staged Snipers Designed to Infl...   
10949  Michigan governor: solve Flint water crisis in...   
21556  SHOCKING ACT OF “TOLERANCE”: TRANSGENDER THUG ...   

                                                    text  
16207  REYKJAVIK (Reuters) - The leader of Iceland s ...  
19961  How many Hillary supporters does it take to fi...  
1287   The Trump administration was going to show Dem...  
10941  (Reuters) - Two of North Carolina’s 

  y = column_or_1d(y, warn=True)


In [12]:
accuracy_score(pipe.predict(X_test), y_test)

0.9931294326241135