In [32]:
import pandas as pd
from nltk import word_tokenize, WordPunctTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.pipeline import Pipeline, make_pipeline

In [2]:
# Load the training data; the relative path is resolved against the kernel's
# working directory.
train = pd.read_csv('../data/fake_or_real_news_training.csv')
# the submission data has no label
submission = pd.read_csv('../data/fake_or_real_news_test.csv')

In [7]:
stopwords_en = stopwords.words('english')
# define function to eliminate the stopwords
def eliminate_stopwords(wordslist):
    """
    Return the tokens of `wordslist` that are not stopwords, in their
    original order.

    stopwords_en is predefined outside of the function. It is read on every
    call, so the current module-level value is always the one applied.

    Parameters
    ----------
    wordslist : iterable of str
        Tokens to filter. Tokens must be hashable (strings are).

    Returns
    -------
    list
        A new list holding every token that does not appear in stopwords_en.
        Matching is exact string equality; no case folding is applied, so a
        token only matches a stopword written with identical casing.
    """
    # Build the lookup once per call: membership in a frozenset is O(1),
    # whereas testing each token against the list scans the whole list.
    stopword_lookup = frozenset(stopwords_en)
    return [token for token in wordslist if token not in stopword_lookup]

In [19]:
def prepare_data(data):
    """
    preprocessing + feature engineering

    Works on a copy: the frame passed in is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns `title` and `text`, whose values are passed
        to word_tokenize. A `label` column is optional (the submission data
        has none). Columns `X1` and `X2` are dropped when present.

    Returns
    -------
    pandas.DataFrame
        The input without `X1`, `X2`, `title` and `text`, plus:
        - title_token / text_token: word_tokenize output per row
        - titletoken_without_stopwords / texttoken_without_stopwords:
          the token lists after eliminate_stopwords
        - title_length / text_length: number of tokens (stopwords included)
        - label (only if it was present): 1 for 'REAL', 0 for anything else
    """
    # drop the redundant features; errors='ignore' keeps this usable on
    # frames that do not carry these columns
    data = data.drop(['X1', 'X2'], axis=1, errors='ignore')

    # tokenize the title and text
    data['title_token'] = data['title'].apply(word_tokenize)
    data['text_token'] = data['text'].apply(word_tokenize)

    # convert the target variable to 0 and 1; skipped for unlabeled data so
    # the same preprocessing can be applied to the submission set
    if 'label' in data.columns:
        data['label'] = data['label'].apply(lambda x: 1 if x == 'REAL' else 0)

    # eliminate the stopwords in title and text
    data['titletoken_without_stopwords'] = data['title_token'].apply(eliminate_stopwords)
    data['texttoken_without_stopwords'] = data['text_token'].apply(eliminate_stopwords)

    # drop the raw strings now that the token columns exist
    data = data.drop(['text', 'title'], axis=1)
    # need to eliminate the punctuation as well?
    # maybe do it with regex from the original title and text

    ## feature engineering
    # create new features describing the length of the title and text,
    # counting by words (tokens)
    # also maybe try counting by letters
    # and maybe treat them as categorical variables grouped into factor levels
    data['title_length'] = data['title_token'].apply(len)
    data['text_length'] = data['text_token'].apply(len)
    # also highlight the keywords maybe?

    return data

In [20]:
# Run preprocessing + feature engineering on the training data; prepare_data
# returns a new frame, so `train` itself keeps its raw columns.
train_df = prepare_data(train)

In [28]:
def splitdata(data, cols):
    """
    Carve `data` into train / test / holdout partitions.

    data: DataFrame that has a `label` column (used as target and for
          stratification)
    cols: a list of columns names to use

    The split happens in two stratified stages, both seeded with 666:
    10% of the rows are set aside as holdout, then 20% of the remaining
    rows become the test set and the rest the train set.

    Returns X_train, y_train, X_test, y_test, X_holdout, y_holdout
    (in that order) and prints the shape of each feature partition.
    """
    seed = 666
    X_all = data[cols]
    y_all = data.label

    # stage 1: peel off the holdout set, used to validate the final model
    X_model, X_holdout, y_model, y_holdout = train_test_split(
        X_all, y_all, test_size=0.10, stratify=y_all, random_state=seed
    )

    # stage 2: divide what is left into the train and test sets for modeling
    X_train, X_test, y_train, y_test = train_test_split(
        X_model, y_model, test_size=0.20, stratify=y_model, random_state=seed
    )

    print("spliting the data......")
    shape_report = (
        ("shape of train set: ", X_train),
        ("shape of test set: ", X_test),
        ("shape of holdout set: ", X_holdout),
    )
    for caption, partition in shape_report:
        print(caption, partition.shape)

    return X_train, y_train, X_test, y_test, X_holdout, y_holdout

In [31]:
# Columns of train_df handed to splitdata as the feature set.
cols2model = ['title_token', 'text_token', 'titletoken_without_stopwords', 
              'texttoken_without_stopwords', 'title_length']
# splitdata returns (X_train, y_train, X_test, y_test, X_holdout, y_holdout),
# so x1/y1 = train, x2/y2 = test, x3/y3 = holdout.
x1,y1,x2,y2,x3,y3 = splitdata(train_df, cols = cols2model)

spliting the data......
shape of train set:  (2879, 5)
shape of test set:  (720, 5)
shape of holdout set:  (400, 5)


In [34]:
# random forest classifier
def model_rf(data,cols2model):
    """
    Split `data`, fit a preprocessing + random forest pipeline on the train
    partition and print its score on the train, test and holdout partitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame forwarded to splitdata; it must contain the columns named in
        `cols2model` plus the `label` column splitdata reads.
    cols2model : list of str
        Names of the feature columns to model on.

    Returns
    -------
    model : the fitted Pipeline ('ctf' ColumnTransformer -> 'RF' classifier)
    data_pack : tuple
        (X_train, y_train, X_test, y_test, X_holdout, y_holdout), in case the
        caller needs the data for visualization or other metrics.
    """
    # split the frame that was passed in (not a notebook-level global)
    X_train, y_train, X_test, y_test, X_holdout, y_holdout = splitdata(data, cols = cols2model)
    print('features for modeling :',X_train.columns.tolist())

    # identify the categorical features and the numerical features:
    # boolean masks over the columns, everything not of dtype 'category'
    # is treated as numerical
    # NOTE(review): object columns (e.g. token lists) therefore end up in the
    # StandardScaler branch - confirm cols2model only names numeric or
    # category-typed columns before fitting.
    categorical_features = X_train.dtypes == 'category'
    numerical_features = ~categorical_features

    # pipeline
    print('building up pipeline......')
    preprocess = ColumnTransformer([
        ('scale', StandardScaler(), numerical_features),
        ('enc', OneHotEncoder(categories='auto', handle_unknown='ignore'),
         categorical_features),
    ])
    estimators = [
        ('ctf', preprocess),
        # ('poly', PolynomialFeatures(degree=1, include_bias=False, interaction_only=True)),
        ('RF', RandomForestClassifier(random_state=666, n_estimators=170,
                                      max_depth=22, n_jobs=-1)),
    ]
    pipe = Pipeline(estimators)

    print('training the model......')
    model = pipe.fit(X_train,y_train)
    print('training is done')

    score_train = model.score(X_train,y_train)
    score_test = model.score(X_test,y_test)
    score_Holdout = model.score(X_holdout,y_holdout)

    # report the scores computed above (the previous names were undefined)
    print('training dataset mode1 is: %s'%(str(score_train)))
    print('test dataset model is: %s'%(str(score_test)))
    print('Holdout dataset model1 is: %s'%(str(score_Holdout)))

    # in case needs to use the data to do some visualization or some other metrics
    data_pack = (X_train, y_train, X_test, y_test, X_holdout, y_holdout)
    return model, data_pack

In [None]:
# maybe we can try the neural network?