# Connecting Workspace and Creating Experiment

In [1]:
import azureml.core
from azureml.core import Workspace
# Resolve the Azure ML workspace via Workspace.from_config()
ws = Workspace.from_config()

# Report the SDK version, then the workspace identity as one tab-separated line
print("Azure ML SDK Version: ", azureml.core.VERSION)
workspace_details = (ws.name, ws.location, ws.resource_group)
print(*workspace_details, sep='\t')

Azure ML SDK Version:  1.7.0
msa-practice	australiaeast	msa-practice


In [2]:
from azureml.core import Experiment

# Name of the Azure ML experiment that the training runs below are started from
exp_name = "clickbait"

# Experiment handle bound to the workspace loaded above
experiment = Experiment(name=exp_name, workspace=ws)

# Load Data and Preprocessing

Load the dataset from "clickbait_data.csv", which contains 32000 article titles, each assigned a binary label indicating whether it is clickbait (0: not clickbait, 1: clickbait). Each category has 16000 titles. Verify that there are no missing values in the dataset. Then preprocess each title by converting it to lowercase, stripping all punctuation, and replacing each run of digits with the identifier `num`.

In [3]:
import pandas as pd
import numpy as np

# Read the raw headline/label table
df = pd.read_csv("clickbait_data.csv")
print(df.columns)
df.head(10)  # NOTE: not rendered - a notebook only displays a cell's *last* expression

# Missing-value audit: per-column count of null entries (the recorded output shows zero for both)
null_counts = df.isnull().sum()
print("Empty values:")
print(null_counts)

Index(['headline', 'clickbait'], dtype='object')
Empty values:
headline     0
clickbait    0
dtype: int64


In [4]:
def process(line):
    """Normalise a single headline before tokenisation.

    Steps, in order:
      1. lowercase the text,
      2. delete every character listed in ``string.punctuation``
         (ASCII punctuation only - other symbols are left untouched),
      3. replace each run of digits with the literal token ``num``
         (so ``"10th"`` becomes ``"numth"``).

    Args:
        line: the headline as a ``str``.

    Returns:
        The normalised headline as a new ``str``.
    """
    import re, string

    # To lowercase
    lowered = line.lower()

    # Remove punctuation (str.maketrans is a static method; no instance needed)
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))

    # Replace digits. Raw string: '\d' in a plain string literal is an invalid
    # escape sequence and triggers a warning on modern Python versions.
    return re.sub(r'\d+', 'num', without_punctuation)

# Preprocess every headline in a single pass over the column.
# Series.map avoids the previous iterrows()/iloc pattern, which (a) used the
# index *label* as a row *position* (only correct for a default RangeIndex),
# (b) wrote to the frame while iterating over it, and (c) required the frame
# to have exactly two columns for the tuple unpacking to succeed.
df["headline"] = df["headline"].map(process)

df.head(10)

Unnamed: 0,headline,clickbait
0,should i get bings,1
1,which tv female friend group do you belong in,1
2,the new star wars the force awakens trailer is...,1
3,this vine of new york on celebrity big brother...,1
4,a couple did a stunning photo shoot with their...,1
5,how to flirt with queer girls without making a...,1
6,num cute things to distract from your awkward ...,1
7,if disney princesses were from florida,1
8,whats a quote or lyric that best describes you...,1
9,natalie dormer and sam claflin play a game to ...,1


Split dataset into 80% training data and 20% validation data

In [11]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split - and every metric derived from it - is
# reproducible across kernel restarts.
SPLIT_SEED = 42

df_x = df.copy()
df_y = df_x.pop('clickbait')
X_train, X_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.2, random_state=SPLIT_SEED
)

print(X_train[0:10])
print(X_test[0:10])

# Convert to 1-D numpy arrays. Selecting the column explicitly always yields
# shape (n,), whereas np.squeeze on an (n, 1) array collapses to a 0-d array
# when n == 1.
X_train_original = X_train['headline'].to_numpy()
X_test_original = X_test['headline'].to_numpy()
y_train = y_train.values
y_test = y_test.values

                                                headline
29038                      san diego votes for new mayor
22599  american war deserter given stay of deportatio...
5009   emily blunt and john krasinski are officially ...
21680  nascar driver jimmie johnson becomes first tim...
2597   this american was shocked when his facebook po...
28379    harper the red ensign of num will fly over vimy
3024         what planet in the star wars galaxy is this
31233        england win second test of the ashes series
18957           read santa clara marks tenth anniversary
31793          boston celtics win numth nba championship
                                                headline
17306        low turnout may mar congo republic election
13994            which snl character matches your zodiac
24615            jefferson to face forward on new nickel
30062     number of homeowners  facing foreclosure rises
8420   heres what happens when you mix beer and mac a...
15650  num inspiring roald dahl

# Training Model

Train a neural network consisting of an embedding layer followed by three fully-connected layers, using the Adam optimizer.

In [6]:
def create_model(top_words, title_length):
    """Build and compile the binary clickbait classifier.

    Architecture: Embedding (32-dimensional vectors) -> Flatten ->
    Dense(16, relu) -> Dense(16, relu) -> Dense(1, sigmoid).
    Compiled with binary cross-entropy loss, the Adam optimizer and the
    ``accuracy`` metric. The layer summary is printed as a side effect.

    Args:
        top_words: passed as the Embedding layer's input dimension
            (the vocabulary size).
        title_length: passed as the Embedding layer's ``input_length``
            (number of token ids per title).

    Returns:
        The compiled ``Sequential`` model.
    """
    from keras.models import Sequential
    # Import Embedding from the public keras.layers namespace; the
    # keras.layers.embeddings submodule is not available in later Keras releases.
    from keras.layers import Dense, Embedding, Flatten

    embedding_vector_length = 32
    model = Sequential()
    model.add(Embedding(top_words, embedding_vector_length, input_length=title_length))
    model.add(Flatten())
    model.add(Dense(16, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only adds a stray "None" line to the output.
    model.summary()

    return model

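For each vocabulary size, extract the most common words, convert the titles into padded integer sequences, train a model, and log the training and validation metrics to an Azure ML run.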

In [13]:
# Extract most common words
import os  # needed for the model output directory; previously used without being imported

from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing import sequence

# Vocabulary sizes to compare; one Azure ML run is logged per size
top_words_array = [100, 200, 500, 1000, 2000, 5000]

# Every title is padded/truncated to this many token ids
title_length = 25

# Create the output directory once, up front (no error if it already exists)
os.makedirs("models", exist_ok=True)

# NOTE: after the loop, `model`, `tokenizer`, `history`, `X_train` and `X_test`
# hold the values from the LAST vocabulary size; the evaluation and prediction
# cells further down rely on that.
for top_words in top_words_array:
    run = experiment.start_logging()
    try:
        run.log("top_word_count", top_words)

        model = create_model(top_words, title_length)

        # No extra character filtering here: punctuation was already removed by
        # process(). oov_token must be a string token (a bool only worked by accident).
        tokenizer = Tokenizer(num_words=top_words, filters='', oov_token='<OOV>')
        tokenizer.fit_on_texts(X_train_original)

        X_train = tokenizer.texts_to_sequences(X_train_original)
        X_test = tokenizer.texts_to_sequences(X_test_original)

        print("{}\n{}\n".format(X_train[0:5], X_test[0:5]))

        # Pad/truncate every sequence to a fixed length
        X_train = sequence.pad_sequences(X_train, maxlen=title_length)
        X_test = sequence.pad_sequences(X_test, maxlen=title_length)

        history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

        run.log_list("training_acc", history.history['accuracy'])
        run.log_list("validation_acc", history.history['val_accuracy'])
        run.log_list("training_loss", history.history['loss'])
        run.log_list("validation_loss", history.history['val_loss'])

        model_name = "models/model_topwords_" + str(top_words)
        model.save(model_name)
    except Exception as exc:
        # Mark the run as failed instead of leaving it in the "Running" state,
        # then re-raise so the notebook still surfaces the error.
        run.fail(error_details=exc)
        raise
    else:
        run.complete()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 25, 32)            3200      
_________________________________________________________________
flatten_7 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 16)                12816     
_________________________________________________________________
dense_20 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 17        
Total params: 16,305
Trainable params: 16,305
Non-trainable params: 0
_________________________________________________________________
None
[[1, 1, 1, 9, 22, 1], [1, 1, 1, 1, 1, 6, 1, 4, 1], [1, 1, 10, 1, 1, 14, 1, 1, 1, 1], [1, 1, 1, 1, 1, 54, 70, 1]

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 25, 32)            6400      
_________________________________________________________________
flatten_8 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 16)                12816     
_________________________________________________________________
dense_23 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 17        
Total params: 19,505
Trainable params: 19,505
Non-trainable params: 0
______________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 25, 32)            16000     
_________________________________________________________________
flatten_9 (Flatten)          (None, 800)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 16)                12816     
_________________________________________________________________
dense_26 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_27 (Dense)             (None, 1)                 17        
Total params: 29,105
Trainable params: 29,105
Non-trainable params: 0
______________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 25, 32)            32000     
_________________________________________________________________
flatten_10 (Flatten)         (None, 800)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 16)                12816     
_________________________________________________________________
dense_29 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 17        
Total params: 45,105
Trainable params: 45,105
Non-trainable params: 0
_____________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 25, 32)            64000     
_________________________________________________________________
flatten_11 (Flatten)         (None, 800)               0         
_________________________________________________________________
dense_31 (Dense)             (None, 16)                12816     
_________________________________________________________________
dense_32 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_33 (Dense)             (None, 1)                 17        
Total params: 77,105
Trainable params: 77,105
Non-trainable params: 0
_____________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 25, 32)            160000    
_________________________________________________________________
flatten_12 (Flatten)         (None, 800)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 16)                12816     
_________________________________________________________________
dense_35 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_36 (Dense)             (None, 1)                 17        
Total params: 173,105
Trainable params: 173,105
Non-trainable params: 0
___________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25600 samples, validate on 6400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Bare last expression of the cell: the notebook displays the Experiment object created earlier
experiment

# Evaluating Model

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

sns.set()

# NOTE: `history` is whatever the training loop assigned last, i.e. the run for
# the final entry of top_words_array.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(acc)+1)

plt.plot(epochs, acc, '-', label='Training accuracy')
plt.plot(epochs, val_acc, ':', label='Validation accuracy')
plt.title("Accuracy curves")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
# The label= arguments above are only shown once a legend is drawn
plt.legend()
plt.show()

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']
# Recomputed here so this cell does not depend on the accuracy cell having run
epochs = range(1, len(loss) + 1)

# Same line styles as the accuracy plot: solid = training, dotted = validation
plt.plot(epochs, loss, '-', label="Training loss")
plt.plot(epochs, val_loss, ':', label="Validation loss")
plt.title("Loss curves")
plt.xlabel("Epoch")
plt.ylabel("Loss")
# The label= arguments above are only shown once a legend is drawn
plt.legend()
plt.show()

In [None]:
# Evaluate the model currently bound to `model` on the held-out split.
# The model is compiled with metrics=['accuracy'], so index 1 is the accuracy
# (index 0 is the loss).
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_percent)

# Making Predictions

In [None]:
def preprocess_line(text, tk):
    """Turn one raw title into a padded token-id sequence for the model.

    The title is normalised with ``process``, converted to token ids with the
    supplied tokenizer, and padded/truncated to the notebook-level
    ``title_length``. The token ids are printed as a side effect.

    Args:
        text: the raw title string.
        tk: the fitted tokenizer to use (an object providing
            ``texts_to_sequences``).

    Returns:
        The result of ``sequence.pad_sequences`` for the single title.
    """
    cleaned = process(text)
    # Use the tokenizer that was passed in rather than the notebook-level
    # `tokenizer`, so the caller controls which vocabulary is applied.
    token_ids = tk.texts_to_sequences([cleaned])
    print("Tokenised text: ", token_ids)
    return sequence.pad_sequences(token_ids, maxlen=title_length)

def predict_input(text):
    """Classify one title with the notebook-level ``model`` and print the verdict.

    Args:
        text: the raw title string.

    Prints the predicted clickbait likelihood as a percentage and a Yes/No
    answer using a 0.5 threshold. Returns nothing.
    """
    prediction = model.predict(preprocess_line(text, tokenizer))
    # predict() returns a batch of outputs; one title in and a single sigmoid
    # unit out, so take that one value as a plain float instead of formatting
    # the whole array into the message.
    probability = float(prediction[0][0])
    output = "Yes" if (probability > 0.5) else "No"
    print("Likelihood of title clickbait: {:.2f}%".format(probability * 100))
    print("Is it clickbait? {}".format(output))

In [None]:
# Read a title interactively; surrounding whitespace is irrelevant to the model
text = input("Enter a title: ").strip()

# An empty title would be turned into an all-padding sequence and produce a
# meaningless prediction, so skip classification in that case.
if text:
    predict_input(text)
else:
    print("No title entered - nothing to classify.")