In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
from nltk import word_tokenize, WordPunctTokenizer, pos_tag
from nltk.corpus import stopwords
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.compose import ColumnTransformer, make_column_transformer
import re
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

In [3]:
# train_x = pd.read_csv('https://gist.githubusercontent.com/ScottPanIE/4dd8b7b85b44c18baf556d95b9093bdc/raw/88c4bfc393fa1fa004c0ceaceeedc6f84448f8be/fake_or_real_news_training.csv')

In [4]:
# train = pd.read_csv('../data/fake_or_real_news_training.csv')
# # the submission data has no label
# submission = pd.read_csv('../data/fake_or_real_news_test.csv')

In [8]:
def rearrange(df):
    """
    Repair rows whose fields were shifted right into the overflow columns.

    Columns are addressed by position: 1, 2 and 3 are the regular fields and
    4 and 5 are the overflow columns (named 'X1' and 'X2', dropped at the end).

    * Column 5 filled: columns 1-3 are concatenated back into column 1 and
      columns 4 and 5 are moved into columns 2 and 3.
    * Only column 4 filled: columns 1-2 are concatenated into column 1 and
      columns 3 and 4 are moved into columns 2 and 3.

    The repair is performed on a copy.  Writing into the caller's frame would
    leave its X1/X2 values in place, so a second call on the same frame would
    concatenate the already-repaired fields again.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least six columns, including 'X1' and 'X2'.

    Returns
    -------
    pandas.DataFrame
        New frame with shifted rows repaired and 'X1'/'X2' removed.
    """
    # Work on a private copy so the input stays untouched and the call is idempotent.
    fixed = df.copy()
    for row in range(len(fixed)):
        if not pd.isna(fixed.iloc[row, 5]):
            # Two overflow fields: the first field was split into three pieces.
            fixed.iloc[row, 1] = fixed.iloc[row, 1] + fixed.iloc[row, 2] + fixed.iloc[row, 3]
            fixed.iloc[row, 2] = fixed.iloc[row, 4]
            fixed.iloc[row, 3] = fixed.iloc[row, 5]
        elif not pd.isna(fixed.iloc[row, 4]):
            # One overflow field: the first field was split into two pieces.
            fixed.iloc[row, 1] = fixed.iloc[row, 1] + fixed.iloc[row, 2]
            fixed.iloc[row, 2] = fixed.iloc[row, 3]
            fixed.iloc[row, 3] = fixed.iloc[row, 4]
    return fixed.drop(['X1', 'X2'], axis=1)

In [9]:
porter = PorterStemmer()
def stemSentence(sentence):
    """
    Lower-case and tokenize `sentence`, then Porter-stem every token.

    Returns the stems as one string in which every stem is followed by a
    single space, so a non-empty result ends with a trailing space.
    """
    stems = (porter.stem(token) for token in word_tokenize(sentence.lower()))
    return "".join(stem + " " for stem in stems)

In [10]:
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag to the matching WordNet POS constant.

    The decision is made on the tag's leading letter: J -> wn.ADJ,
    N -> wn.NOUN, R -> wn.ADV, V -> wn.VERB.  Any other tag yields None.
    """
    # (leading letter, attribute name on `wn`); the attribute is looked up
    # only when its prefix matches.
    prefix_table = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attr_name in prefix_table:
        if tag.startswith(prefix):
            return getattr(wn, attr_name)
    return None

lemmatizer = WordNetLemmatizer()
def lemma(text):
    """
    Lemmatize `text` and return the lemmas joined by single spaces.

    The text is tokenized and POS-tagged; each token whose tag maps to a
    WordNet part of speech (see `penn_to_wn`) is lemmatized with that part
    of speech.  Tokens whose tag maps to None are left out of the result.
    """
    lemmas = []
    for word, tag in pos_tag(word_tokenize(text)):
        wn_pos = penn_to_wn(tag)
        if wn_pos is None:
            # No WordNet part of speech for this tag: the token is dropped.
            continue
        lemmas.append(lemmatizer.lemmatize(word, pos=wn_pos))
    return " ".join(lemmas)

In [11]:
def prepare_data(data):
    """
    Build the modelling frame from the raw news data.

    Steps
    -----
    1. Pass a copy of `data` through `rearrange` (repairs shifted rows and
       drops the X1/X2 columns); the caller's frame is never modified.
    2. `text_lemm`: `lemma(text)` with every run of digits removed.
    3. `title_hash`: the title's tokens, each prefixed with "TITLE_", joined
       by spaces.
    4. `whole`: `title_hash` and `text_lemm` joined by a single space.
    5. `label`: 1 where the label equals 'REAL', 0 for every other value.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame with 'title', 'text', 'label', 'X1' and 'X2' columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the added `text_lemm`, `title_hash` and `whole`
        columns and a 0/1 `label`.
    """
    # Copy first: rearrange writes into the frame it is given.
    data = rearrange(data.copy())

    # Lemmatize the body, then strip digits (raw string: "\d" is not a valid
    # escape in a normal string literal).
    lemmatized = data['text'].apply(lemma)
    data['text_lemm'] = lemmatized.apply(lambda x: re.sub(r"\d+", "", str(x)))

    # Prefix title tokens so they stay distinguishable from body tokens.
    data['title_hash'] = data['title'].apply(
        lambda x: " ".join("TITLE_" + w for w in word_tokenize(x)))

    # The separator keeps the last title token from fusing with the first
    # body token.
    data['whole'] = data['title_hash'] + " " + data['text_lemm']

    # Binary target: REAL -> 1, anything else -> 0.
    data['label'] = (data['label'] == 'REAL').astype('int64')
    return data

In [None]:
# Reload the raw training CSV from disk, then derive the prepared frame
# (repaired columns, lemmatized text, title tokens, 0/1 label) from it.
train_0 = pd.read_csv('../data/fake_or_real_news_training.csv')
train_1 = prepare_data(train_0)

In [None]:
# Preview the first rows of the prepared frame.
train_1.head()

In [None]:
# split data
def split_data(data, feature='whole'):
    """
    Split one feature column and the 'label' column into train and test parts.

    20% of the rows go to the test part; the split uses a fixed random_state
    of 7.  Returns the tuple (X_train, X_test, y_train, y_test).
    """
    inputs = data[feature]
    targets = data['label']
    train_inputs, test_inputs, train_targets, test_targets = train_test_split(
        inputs, targets, test_size=0.2, random_state=7)
    return train_inputs, test_inputs, train_targets, test_targets

In [None]:
# vectorize
def vectorize_select(selection="tfidf", max_df=0.8, min_df=1):
    """
    Construct (without fitting) the text vectorizer named by `selection`.

    Parameters
    ----------
    selection : str
        "tfidf" -> TfidfVectorizer (binary=True, lowercase=False)
        "count" -> CountVectorizer
        "hash"  -> HashingVectorizer
    max_df, min_df :
        Passed through to the "tfidf" and "count" vectorizers.  They are not
        passed to the "hash" vectorizer.

    All three vectorizers are created with stop_words='english'.

    Raises
    ------
    ValueError
        If `selection` is not one of the names above.
    """
    if selection == "tfidf":
        return TfidfVectorizer(stop_words='english', max_df=max_df, min_df=min_df,
                               binary=True, lowercase=False)
    if selection == "count":
        return CountVectorizer(stop_words='english', max_df=max_df, min_df=min_df)
    if selection == "hash":
        return HashingVectorizer(stop_words='english')
    # ValueError is a subclass of Exception, so `except Exception` still catches it.
    raise ValueError("{} can't be found".format(selection))


In [215]:
def PAC(data, selection='tfidf', vectorize_max_df=0.8, vectorize_min_df=1, feature='whole'):
    """
    Train and evaluate a PassiveAggressiveClassifier on one text column.

    The `feature` column of `data` is split with `split_data`, vectorized with
    the vectorizer chosen by `selection` ('tfidf', 'count' or 'hash'; fitted on
    the training part only), and used to fit the classifier.  Accuracy and the
    confusion matrix (label order [0, 1]) on the test part are printed.

    Returns the tuple (fitted classifier, fitted vectorizer).
    """
    train_docs, test_docs, train_labels, test_labels = split_data(data, feature=feature)

    # Fit the vocabulary on the training documents, then reuse it for the test ones.
    vectorizer = vectorize_select(selection, max_df=vectorize_max_df, min_df=vectorize_min_df)
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)

    clf_settings = dict(
        random_state=666,
        max_iter=100,
        n_iter_no_change=10,
        tol=1e-3,
        early_stopping=True,
        validation_fraction=0.1,
    )
    linear_clf = PassiveAggressiveClassifier(**clf_settings)
    linear_clf.fit(train_matrix, train_labels)

    predictions = linear_clf.predict(test_matrix)
    print("accuracy:   %0.3f" % accuracy_score(test_labels, predictions))
    print(confusion_matrix(test_labels, predictions, labels=[0, 1]))

    return linear_clf, vectorizer

In [216]:
# Train and evaluate on the 'text' column instead of the default 'whole';
# PAC returns (classifier, vectorizer), so `m` holds that pair.
m = PAC(train_1,feature='text')

accuracy:   0.954
[[383  22]
 [ 15 380]]
