In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
from nltk import word_tokenize, WordPunctTokenizer, pos_tag
from nltk.corpus import stopwords
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.compose import ColumnTransformer, make_column_transformer
import re
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

In [2]:
# train_x = pd.read_csv('https://gist.githubusercontent.com/ScottPanIE/4dd8b7b85b44c18baf556d95b9093bdc/raw/88c4bfc393fa1fa004c0ceaceeedc6f84448f8be/fake_or_real_news_training.csv')

In [3]:
# train = pd.read_csv('../data/fake_or_real_news_training.csv')
# # the submission data has no label
# submission = pd.read_csv('../data/fake_or_real_news_test.csv')

In [2]:
def rearrange(df):
    for row in range(len(df)):
        if not pd.isna(df.iloc[row, 5]):
            # Concatenate according columns
            df.iloc[row, 1] = df.iloc[row, 1] + df.iloc[row, 2] + df.iloc[row, 3]
            df.iloc[row, 2] = df.iloc[row, 4]
            df.iloc[row, 3] = df.iloc[row, 5]
        elif not pd.isna(df.iloc[row, 4]):
            df.iloc[row, 1] = df.iloc[row, 1] + df.iloc[row, 2]
            df.iloc[row, 2] = df.iloc[row, 3]
            df.iloc[row, 3] = df.iloc[row, 4]
    df = df.drop(['X1', 'X2'], axis = 1)
    return df

In [5]:
porter=PorterStemmer()
def stemSentence(sentence):
    token_words=word_tokenize(sentence.lower())
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [6]:
def penn_to_wn(tag):
    """
    Convert between the PennTreebank tags to simple Wordnet tags
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

lemmatizer = WordNetLemmatizer()
def lemma(text):    
    tagged_sentence = pos_tag(word_tokenize(text))
    lemm = [lemmatizer.lemmatize(word, pos=penn_to_wn(tag)) for word, tag in tagged_sentence if penn_to_wn(tag) != None ]
    return " ".join(lemm)

In [3]:
def prepare_data(data):
    """
    processing data, on text and title
    """
    # fix X1 X2 issue
    data = rearrange(data)
    # tokenize the title and text
#     data['title'] = data.title.apply(lambda x: " ".join(word_tokenize(x.lower())))
#     data['text'] = data.text.apply(lambda x: " ".join(word_tokenize(x.lower())))
#     data['text_lemm'] = data.text.apply(lambda x: " ".join(word_tokenize(x.lower())))
    data

#     data['text_lemm'] = data.text.apply(lambda x: lemma(x))
#     data['text_lemm'] = data.text_lemm.apply(lambda x: re.sub("\d+", "", str(x)))

#     # add title into text
#     data['title_hash'] = data.title.apply(lambda x: " ".join(["TITLE_"+w for w in word_tokenize(x)]))
# #     # concatenate two columns
#     data['whole'] = data['title_hash'] + data['text_lemm']
# #     data = data.drop(['title','text'], axis=1)
    
    
#     data['whole'] = data.whole.apply(lambda x: stemSentence(x))
#     # remove numbers from the text
#     data['whole'] = data.whole.apply(lambda x: re.sub("\d+", "", x))
    # convert the target variable to 0 and 1
    data.label = data.label.apply(lambda x: 1 if x == 'REAL' else 0)
    return data

In [4]:
# reload data
train_0 = pd.read_csv('../data/fake_or_real_news_training.csv')
train_1 = prepare_data(train_0)

In [5]:
train_1.head()

Unnamed: 0,ID,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


In [6]:
# split data
def split_data(data, feature='whole', random_state = 7):
    X_train, X_test, y_train, y_test = train_test_split(data[feature], data['label'],
                                                        test_size = 0.2, random_state = random_state)
    return X_train, X_test, y_train, y_test

In [7]:
# vectorize
def vectorize_select(selection="tfidf", max_df=0.8, min_df=1):
    """
    "tfidf":"TfidVectorizer"
    "count":"CountVectorizer"
    "hash":"HashingVectorizer"
    """
    if selection == "tfidf":
        return TfidfVectorizer(stop_words='english', max_df=max_df, min_df=min_df,binary=True,
                              lowercase=False)
    elif selection == "count":
        return CountVectorizer(stop_words='english', max_df=max_df, min_df=min_df)
    elif selection == "hash":
        return HashingVectorizer(stop_words='english')
    else:
        raise Exception("{} can't be found".format(selection))


In [8]:
def PAC(data, selection='tfidf', vectorize_max_df=0.8, vectorize_min_df=1, feature='whole'):
    # split data
    X_train, X_test, y_train, y_test = split_data(data, feature=feature)
    # vectorizer: selection: 'tfidf','count','hash'
    vectorizer = vectorize_select(selection, max_df=vectorize_max_df, min_df=vectorize_min_df)
    # transform data
    vectorize_train = vectorizer.fit_transform(X_train)
    vectorize_test = vectorizer.transform(X_test)
    # model 
    linear_clf = PassiveAggressiveClassifier(random_state=666, max_iter=100, n_iter_no_change=10,
                                             tol=1e-3,
                                             early_stopping=True, validation_fraction=0.1)
    linear_clf.fit(vectorize_train, y_train)
    pred = linear_clf.predict(vectorize_test)
    acc = accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % acc)
    print(confusion_matrix(y_test, pred, labels=[0, 1]))
    
    return linear_clf, vectorizer

In [13]:
m = PAC(train_1,feature='text')

accuracy:   0.954
[[383  22]
 [ 15 380]]


In [20]:
# passive aggressive classifier with grid search cv
def PAC_T(data, selection='tfidf', feature='text'):
    # split data
    X_train, X_test, y_train, y_test = split_data(data, feature=feature, random_state=666)
    estimators = [   
                    ('TF',TfidfVectorizer(stop_words='english', binary=True, lowercase=False,
                                          max_df=0.8,
                                         norm='l2',sublinear_tf=False)
                    ),
                    ('PA',PassiveAggressiveClassifier(random_state=666, max_iter=1000, 
                                                      n_iter_no_change=10, tol=1e-3, 
                                                      early_stopping=True, loss='squared_hinge',
                                                      fit_intercept= False,
                                                      validation_fraction=0.1))
                ]
    pipe = Pipeline(estimators)
    param_RF = {
                'TF__min_df':[4],
                'TF__ngram_range': [(1,4)]
                }
    
    # apply the estimators and parameters in pipeline
    gridPipe = GridSearchCV(pipe, param_RF, cv=5, return_train_score=True)
    model = gridPipe.fit(X_train, y_train)

    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % acc)
    print(confusion_matrix(y_test, pred, labels=[0, 1]))
    
    return model

In [30]:
m2 = PAC_T(prepare_dataFFF(train_0),feature='text')

accuracy:   0.951
[[379  20]
 [ 19 382]]


In [31]:
m2.best_estimator_

Pipeline(memory=None,
     steps=[('TF', TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=0.8, max_features=None, min_df=4,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
  ...      shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False))])

In [32]:
# Slight modifications to the preparation-function
# because the submission data has a slightly different structure
submission = pd.read_csv('../data/fake_or_real_news_test.csv')
# m2 is the model from the PAC function
submission = prepare_dataFFF(submission)
submission['prediction'] = m2.predict(submission['text'])
submission.prediction[submission.prediction==0] = 'FAKE'
submission.prediction[submission.prediction==1] = 'REAL'
submission = submission.set_index('ID')

# answer is the dataset from kaggle
# this is the dataset downloaded from kaggle so I didn't put it inside the repo
answer = pd.read_csv('../../fake_or_real_news.csv')
answer.rename(columns={'Unnamed: 0':'ID'}, inplace=True)
answer = answer.set_index('ID')

# left join on submission with prediction
submission2 = submission.join(answer[['label']],how='left')

# metrics
acc = accuracy_score(submission2.label, submission2.prediction)
print("accuracy:   %0.3f" % acc)
print(confusion_matrix(submission2.label, submission2.prediction, labels=['FAKE', 'REAL']))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


accuracy:   0.950
[[1093   71]
 [  45 1112]]


In [29]:
def count_date(x):
    datelist = re.findall(r'\w+\s\d{1,2},\s\d{4}', str(x))
    if len(datelist) > 0:
        dateinfo = " HAS_DATE"
    else:
        dateinfo = " NO_DATE"
    return x + dateinfo

def prepare_dataFFF(data):
    """
    processing data, on text and title
    """
    # fix X1 X2 issue
    if 'X1' in data.columns:
        data = rearrange(data)
    # tokenize the title and text
#     data['title'] = data.title.apply(lambda x: " ".join(word_tokenize(x.lower())))
#     data['text'] = data.text.apply(lambda x: " ".join(word_tokenize(x.lower())))
#     data['text_lemm'] = data.text.apply(lambda x: " ".join(word_tokenize(x.lower())))
    data['text_edit'] = data.text.apply(lambda x: re.sub("\\n", "", str(x)))
    data['text_edit'] = data.text_edit.apply(lambda x: count_date(x))

#     data['text_lemm'] = data.text.apply(lambda x: lemma(x))
#     data['text_lemm'] = data.text_lemm.apply(lambda x: re.sub("\d+", "", str(x)))

#     # add title into text
#     data['title_hash'] = data.title.apply(lambda x: " ".join(["TITLE_"+w for w in word_tokenize(x)]))
# #     # concatenate two columns
#     data['whole'] = data['title_hash'] + data['text_lemm']
# #     data = data.drop(['title','text'], axis=1)
    
    
#     data['whole'] = data.whole.apply(lambda x: stemSentence(x))
#     # remove numbers from the text
#     data['whole'] = data.whole.apply(lambda x: re.sub("\d+", "", x))
    # convert the target variable to 0 and 1
    if 'label' in data.columns:
        data.label = data.label.apply(lambda x: 1 if x == 'REAL' else 0)
    return data