As usual we start loading the packages that we will use in our notebook

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 


In [2]:
#PRINT VERSION!!
tf.__version__

'2.0.0'

In [3]:
train_df = pd.read_csv("train.csv")#here we have the dataset we extracted
train_df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
Y = train_df['label']

In [5]:
group1 = pd.DataFrame(train_df.groupby(['author']).size()).reset_index()
group1.head()

Unnamed: 0,author,0
0,# 1 NWO Hatr,17
1,-NO AUTHOR-,54
2,10 Habits That Will Make Your Life Easier &amp...,1
3,10 More Beautiful Images That Remind You We St...,1
4,10 Movies That Could Change Your Understanding...,1


In [6]:
group2 = pd.DataFrame(train_df.groupby(['author'])['label'].sum()).reset_index()
group2.head()

Unnamed: 0,author,label
0,# 1 NWO Hatr,17
1,-NO AUTHOR-,54
2,10 Habits That Will Make Your Life Easier &amp...,1
3,10 More Beautiful Images That Remind You We St...,1
4,10 Movies That Could Change Your Understanding...,1


In [7]:
group1.columns = ['author','count']
group1.sort_values(by=['count'], ascending=False).head()

Unnamed: 0,author,count
2944,Pam Key,243
3929,admin,193
1762,Jerome Hudson,166
724,Charlie Spiering,141
1857,John Hayward,140


In [8]:
group1[group1['author']=='Starkman']

Unnamed: 0,author,count
3518,Starkman,84


In [9]:
group2.sort_values(by=['label'], ascending=False).head()

Unnamed: 0,author,label
3929,admin,193
2939,Pakalert,86
1111,Eddy Lavine,85
3518,Starkman,84
1376,Gillian,82


In [10]:
merge_groups = pd.merge(group1,group2, on='author')
merge_groups.head()

Unnamed: 0,author,count,label
0,# 1 NWO Hatr,17,17
1,-NO AUTHOR-,54,54
2,10 Habits That Will Make Your Life Easier &amp...,1,1
3,10 More Beautiful Images That Remind You We St...,1,1
4,10 Movies That Could Change Your Understanding...,1,1


In [11]:
merge_groups['prob_fake'] = merge_groups['label']/merge_groups['count']
merge_groups['prob_fake'].describe()

count    4201.000000
mean        0.470771
std         0.499016
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: prob_fake, dtype: float64

In [12]:
#Starkman
#Pam Key

merge_groups[merge_groups['author']=='Pam Key']['prob_fake']

2944    0.004115
Name: prob_fake, dtype: float64

In [13]:
len(train_df['author'].unique()) #we print the length, not a big one but sufficient

4202

In [14]:
train_df['title_lower'] = train_df["title"].str.lower()
train_df['title_no_punctuation'] = train_df['title_lower'].str.replace('[^\w\s]','')
train_df['title_no_punctuation'] = train_df["title_no_punctuation"].fillna("fillna")

In [15]:
train_df['text_lower'] = train_df["text"].str.lower()
train_df['text_no_punctuation'] = train_df['text_lower'].str.replace('[^\w\s]','')
train_df['text_no_punctuation'] = train_df["text_no_punctuation"].fillna("fillna")

In [16]:
train_df['author_lower'] = train_df["author"].str.lower()
train_df['author_no_spaces'] = train_df['author_lower'].str.replace(' ','_')


In [17]:
train_df['author_no_spaces'].head() #in this way we can treat each author as a word.

0         darrell_lucus
1       daniel_j._flynn
2    consortiumnews.com
3       jessica_purkiss
4        howard_portnoy
Name: author_no_spaces, dtype: object

In [18]:
max_features=5000 #we set maximum number of words to 5000
maxlen=400 #we set maximum sequence length to 400

In [19]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step

In [20]:
tok.fit_on_texts(list(train_df['text_no_punctuation'])+list(train_df['title_no_punctuation'])+list(train_df['author_no_spaces'].astype(str))) #fit to cleaned text


In [21]:
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1 
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

216068


In [22]:
text_df = tok.texts_to_sequences(list(train_df['text_no_punctuation'])) #this is how we create sequences
text_df = tf.keras.preprocessing.sequence.pad_sequences(text_df, maxlen=maxlen) #let's execute pad step

In [23]:
title_df = tok.texts_to_sequences(list(train_df['title_no_punctuation'])) #this is how we create sequences
title_df = tf.keras.preprocessing.sequence.pad_sequences(title_df, maxlen=maxlen)

In [24]:
author_df = tok.texts_to_sequences(list(train_df['author_no_spaces'].astype(str))) #this is how we create sequences
author_df = tf.keras.preprocessing.sequence.pad_sequences(author_df, maxlen=maxlen)

In [25]:
train_df = author_df #title_df + text_df  

THIS WILL BE THE INPUT FOR OUR EMBEDDING!! IN THE NEXT VIDEO WE WILL TRAIN THE MODEL ...

In [26]:
from sklearn.model_selection import train_test_split #divide into train and test set

In [27]:
X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)

In [28]:
embedding_dim = 50 #this is the final dimension of the embedding space.


Let's write down the model

In [29]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, #embedding input
                           output_dim=embedding_dim,#embedding output
                           input_length=maxlen), #maximum length of an input sequence
  tf.keras.layers.Flatten(), #flatten layer

  tf.keras.layers.Dense(1, activation=tf.nn.sigmoid) #no more softmax

])

In [30]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',#no more categorical_crossentropy
              metrics=['accuracy'])



In [31]:
model.summary() #here we show the architecture 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 50)           10803450  
_________________________________________________________________
flatten (Flatten)            (None, 20000)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 20001     
Total params: 10,823,451
Trainable params: 10,823,451
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.fit(np.array(X_train), np.array(y_train), epochs=1) #let's fit the model

Train on 18720 samples


<tensorflow.python.keras.callbacks.History at 0x1c63eb82ef0>

In [33]:
#results text, title and author [0.1477214220767984, 0.9447115384615384]
#results text, title [0.13193302869510193, 0.9461538461538461]
#results author [0.3442320129046073, 0.8211538461538461]
model.evaluate(np.array(X_test), np.array(y_test)) 



[0.37038334493453684, 0.8129808]