In [32]:
import pandas as pd
import nltk
from nltk import word_tokenize, WordPunctTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.pipeline import Pipeline, make_pipeline

In [54]:
train = pd.read_csv('../data/fake_or_real_news_training.csv')
# the submission data has no label
submission = pd.read_csv('../data/fake_or_real_news_test.csv')

In [7]:
stopwords_en = stopwords.words('english')
# define function to eliminate the stopwords
def eliminate_stopwords(wordslist):
    """
    stopwords_en is predefined outside of the function
    """
    clean_list = [i for i in wordslist if i not in stopwords_en]
    return clean_list

In [38]:
# postag
def count_postags(words):
    tagged_words = nltk.pos_tag(words)
    num_noun = 0
    num_verb = 0
    num_adj = 0
    for word in tagged_words:
        if word[1] == 'NN':
            num_noun += 1
        elif word[1] == 'VERB':
            num_verb += 1
        elif word[1] == 'ADJ':
            num_adj += 1
    return num_noun, num_verb, num_adj


In [None]:
# since the training set has 33 rows of data in the wrong format
# we are going to simply remove the rows
print(len(train.label.unique()))
train = train[(train.label == 'REAL')|(train.label == 'FAKE')]

In [19]:
def prepare_data(data):
    """
    preprocessing + feature engineering
    """
    # drop the redundent features -------- not yet
    data = data.drop(['X1','X2'],axis=1)
    # tokenize the title and text
    data['title_token'] = data.title.apply(lambda x:word_tokenize(x))
    data['text_token'] = data.text.apply(lambda x:word_tokenize(x))
    # convert the target variable to 0 and 1
    data.label = data.label.apply(lambda x:1 if x == 'REAL' else 0)

    # eliminate the stopwords in title and text
    data['titletoken_without_stopwords'] = data.title_token.apply(lambda x:eliminate_stopwords(x))
    data['texttoken_without_stopwords'] = data.text_token.apply(lambda x:eliminate_stopwords(x))
    # drop the redundent features
    data = data.drop(['text','title'],axis=1)
    # need to eliminate the punctuation as well? 
    # maybe do it with regex from the original title and text

    ## feature engineering 
    
    # count the postags
    # title
    data['title_postags'] = data.titletoken_without_stopwords.apply(lambda x:count_postags(x))
    data['title_NN_count'] = data.title_postags.map(lambda x:x[0])
    data['title_VERB_count'] = data.title_postags.map(lambda x:x[1])
    data['title_ADJ_count'] = data.title_postags.map(lambda x:x[2])
    # text
    data['text_postags'] = data.texttoken_without_stopwords.apply(lambda x:count_postags(x))
    data['text_NN_count'] = data.text_postags.map(lambda x:x[0])
    data['text_VERB_count'] = data.text_postags.map(lambda x:x[1])
    data['text_ADJ_count'] = data.text_postags.map(lambda x:x[2])
    
    
    
    # create new features describe the length of the title and text, counting by words
    # also maybe try with counting by letters
    # and shall treat them as categorical variables group up with factor levels 
    data['title_length'] = data.title_token.apply(lambda x:len(x))
    data['text_length'] = data.text_token.apply(lambda x:len(x))
    # also highlight the keywords maybe?
    
    
    
    return data

In [20]:
train_df = prepare_data(train)

In [28]:
def splitdata(data, cols):
    """
    cols: a list of columns names to use
    """
    features = data[cols]
    target = data.label
    # first split into (train+test) and for holdout
    TrainingSetX, X_holdout, TrainingSetY, y_holdout = train_test_split(features, target, 
                                                    test_size=0.10, stratify=target, 
                                                    random_state=666)
    # split (train+test) into train and test
    X_train, X_test, y_train, y_test = train_test_split(TrainingSetX, TrainingSetY, 
                                                test_size=0.20, stratify=TrainingSetY, 
                                                random_state=666)
    # train and test use for modeling, holdout use for validating the model
    print("spliting the data......")
    print("shape of train set: ", X_train.shape)
    print("shape of test set: ", X_test.shape)
    print("shape of holdout set: ", X_holdout.shape)
    
    return X_train, y_train, X_test, y_test, X_holdout, y_holdout

In [31]:
cols2model = ['title_token', 'text_token', 'titletoken_without_stopwords', 
              'texttoken_without_stopwords', 'title_length']
x1,y1,x2,y2,x3,y3 = splitdata(train_df, cols = cols2model)

spliting the data......
shape of train set:  (2879, 5)
shape of test set:  (720, 5)
shape of holdout set:  (400, 5)


In [34]:
# random forest classifier
def model_rf(data,cols2model):
    # split data
    X_train, y_train, X_test, y_test, X_holdout, y_holdout = splitdata(train_df, cols = cols2model)
    print('features for modeling :',X_train.columns.tolist())
    # identify the categorical features and the numerical features
    categorical_features = X_train.dtypes == 'category'
    numerical_features = ~categorical_features
    # pipeline
    print('building up pipeline......')
    estimators = [   
                    ('ctf',ColumnTransformer([
                                ('scale',StandardScaler(),
                                         numerical_features),
                                ('enc',OneHotEncoder(categories='auto',handle_unknown='ignore'),
                                         categorical_features),
                                            ])
                    ),
#                     ('poly', PolynomialFeatures(degree=1, include_bias=False, interaction_only=True)),
                    ('RF',RandomForestClassifier(random_state=666,n_estimators=170,max_depth=22,n_jobs=-1))
                ]
    pipe = Pipeline(estimators)
    
    print('training the model......')
    model = pipe.fit(X_train,y_train)
    
    print('training is done')
    score_train = model.score(X_train,y_train) 
    score_test = model.score(X_test,y_test) 
    score_Holdout = model.score(X_holdout,y_holdout) 
    
    print('training dataset mode1 is: %s'%(str(score_train1)))   
    print('test dataset model is: %s'%(str(score_test1)))
    print('Holdout dataset model1 is: %s'%(str(score_H1)))
    
    # in case needs to use the data to do some visualization or some other metrics
    data_pack = (X_train, y_train, X_test, y_test, X_holdout, y_holdout)
    return model, data_pack

In [37]:
# postag
def count_postags(words):
    tagged_words = nltk.pos_tag(words)
    num_noun = 0
    num_verb = 0
    num_adj = 0
    for word in tagged_words:
        if word[1] == 'NN':
            num_noun += 1
        elif word[1] == 'VERB':
            num_verb += 1
        elif word[1] == 'ADJ':
            num_adj += 1
    return num_noun, num_verb, num_adj


[('this', 'DT'), ('an', 'DT'), ('cat', 'NN'), ('dog', 'NN')]
Noun:  2
Verb:  0
Adj:  0


In [68]:
# maybe we can try the neural network?
train[train.X1 == 'FAKE']

Unnamed: 0,ID,title,text,label,X1,X2
308,10194,Who rode it best? Jesse Jackson mounts up to f...,Leonardo DiCaprio to the rescue?,Who rode it best? Jesse Jackson mounts up to f...,FAKE,
911,7375,Shallow 5.4 magnitude earthquake rattles centr...,shakes buildings in Rome,00 UTC © USGS Map of the earthquake's epicent...,FAKE,
1010,9097,ICE Agent Commits Suicide in NYC,Leaves Note Revealing Gov’t Plans to Round-up...,Email Print After writing a lengthy suicide no...,FAKE,
1043,9203,Political Correctness for Yuengling Brewery,What About Our Opioid Epidemic?,We Are Change \n\nIn today’s political climate...,FAKE,
1630,7559,STATE OF GEORGIA FIRES PASTOR BECAUSE OF HIS F...,GOVERNMENT DIDN’T “APPROVE” BIBLICAL SERMONS,Home › SOCIETY | US NEWS › STATE OF GEORGIA FI...,FAKE,
1900,8470,The Amish In America Commit Their Vote To Dona...,Mathematically Guaranteeing Him A Presidentia...,18 SHARE The Amish in America have committed t...,FAKE,
1968,6404,#BREAKING: SECOND Assassination Attempt On Tru...,Suspect Detained (LIVE BLOG),We Are Change \nDonald Trump on Saturday was q...,FAKE,
2176,10499,30th Infantry Division: “Work Horse of the Wes...,The Big Picture TV-211,"Published on Oct 27, 2016 by Jeff Quitney The ...",FAKE,
2493,10492,TOP BRITISH GENERAL WARNS OF NUCLEAR WAR WITH ...,“THE END OF LIFE AS WE KNOW IT”,Paul Joseph Watson Senior British army officer...,FAKE,
2549,10138,Inside The Mind Of An FBI Informant,Terri Linnell Admits Role As Gov’t Snitch,Inside The Mind Of An FBI Informant; Terri Lin...,FAKE,


In [79]:
# train[train.X2 != np.nan]
# if train.X2 == FAKE or REAL, then X1 is text
train.X2.unique()

array([nan, 'REAL', 'FAKE'], dtype=object)

In [74]:
import numpy as np

In [80]:
train[(train.X2 == 'REAL')|(train.X2 == 'FAKE')]

Unnamed: 0,ID,title,text,label,X1,X2
2184,9,Planned Parenthood’s lobbying effort,pay raises for federal workers,and the future Fed rates,PLANNED PARENTHOOD’S LOBBYING GETS AGGRESSIVE....,REAL
3537,6268,Chart Of The Day: Since 2009—–Recovery For The 5%,Stagnation for the 95%,Chart Of The Day: Since 2009 Recovery For The 5%,Stagnation for the 95%,FAKE


In [82]:
train.labe

35

AttributeError: 'DataFrame' object has no attribute 'label'