In [None]:
import os
import sys
from pathlib import Path

# Locate the project root so that any folder inside it can be reached from this
# file, wherever the file lives inside the project (no need to chain "../").

# Split the current folder (sys.path[0]) into its path components.
# pathlib understands both "\\" and "/" separators, so this works on Windows,
# Linux and macOS; resolve() also turns an empty or relative entry into an
# absolute path.
main_path = Path(sys.path[0]).resolve().parts

# Keep every component up to and including the project folder (the occurrence
# closest to the filesystem root). If the folder name does not appear in the
# path at all, fall back to the full current path.
project_folder = "Fake_Users_Movies_Classifier"
if project_folder in main_path:
    root_depth = main_path.index(project_folder) + 1
else:
    root_depth = len(main_path)

# Kept as a plain string because later cells extend it with "+"
path_to_parent = str(Path(*main_path[:root_depth]))

# Add path to feature generation folder so its modules can be imported
sys.path.append(str(Path(path_to_parent) / "feature_generation"))

In [None]:
# Import feature generator for week 4
from feature_gen_wk4 import feature_gen
import pandas as pd

# Initialize a list with file names
# Clarification:
#       Only the first word of each file name is listed, as this is the only
#       difference between the labelled data files
file_names = ["first", "second", "third", "fourth"]
# Create and initialize a feature generation class
feature_generator = feature_gen()

# Generate the features of every labelled file and collect the results.
# The frames are gathered in a list and concatenated once, instead of
# re-concatenating a growing dataframe on every iteration.
batch_frames = []
for name in file_names:
    # Create string path to labelled data
    path_to_file = path_to_parent + f"/data/labelled_data/{name}_batch_with_labels_likes.npz"
    # Generate features from file
    batch_frames.append(feature_generator.retrieveAndGenerate(path_to_file))

# Stack all batches into the entire dataset with a fresh 0..n-1 index
df_final = pd.concat(batch_frames, ignore_index=True)

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

# Retrieve labels and assign to y
y = df_final['label']
# Remove labels and users from dataset and assign to X
X = df_final.drop(['user', 'label'], axis=1)

# Number of feature columns, i.e. the width of the model's input layer
input_shape = len(X.columns)

# One seed shared by the data split and the training run
random_state = 42
# Seed NumPy and TensorFlow as well, so weight initialisation, dropout and
# batch shuffling are repeatable between runs (some GPU ops may still be
# non-deterministic)
np.random.seed(random_state)
tf.random.set_seed(random_state)

# Split the data into 3 sets: train, validation and test, with ratios of
# roughly 0.765 : 0.1175 : 0.1175 respectively.
# First hold out 23.5% of the rows, then split that hold-out set in half;
# both splits are stratified on the label.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.235, random_state=random_state, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=random_state, stratify=y_holdout)

# Fit the scaler on the training set only, then apply the same scaling to the
# validation and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Calculate the ratio of 1 labels (fake) to 0 labels (non-fake) and multiply by a set amount
# This value will be used as the weight applied to the 0 label during weighting to prevent label bias
zero_weight = (np.sum(y_train == 1) / np.sum(y_train == 0)) * 1.5

# Create model using tensorflow sequential class
model = tf.keras.Sequential([
    # Hidden layer: 300 ReLU neurons, GlorotUniform initialisation,
    # L1 penalty on the weights and L2 penalty on the layer's activations
    tf.keras.layers.Dense(
        300,
        activation='relu',
        input_shape=(input_shape,),
        kernel_regularizer=tf.keras.regularizers.L1(0.001),
        activity_regularizer=tf.keras.regularizers.L2(0.0015),
        kernel_initializer=tf.keras.initializers.GlorotUniform(),
    ),
    # Apply Dropout with 0.5 dropout rate to prevent overfitting
    tf.keras.layers.Dropout(0.5),
    # Output layer: one sigmoid unit per class, GlorotUniform initialisation
    tf.keras.layers.Dense(
        2,
        activation='sigmoid',
        kernel_initializer=tf.keras.initializers.GlorotUniform(),
    ),
])

# Convert target labels to one-hot encoding (the test labels stay as they are)
y_train_categorized = to_categorical(y_train, num_classes=2)
y_val_categorized = to_categorical(y_val, num_classes=2)

# Define early stopping callback
# NOTE(review): patience (50) exceeds the epoch budget set below (20), so the
# patience limit cannot be reached in this run - confirm this is intended
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=50, restore_best_weights=True)

# File name pattern of the checkpoints; {epoch} is filled in by the callback
checkpoint_filepath = path_to_parent + "/model_checkpoints/wk4/checkpoint_{epoch:1d}.h5"

# Save the model weights every 2 epochs
# NOTE(review): `period` is deprecated in newer Keras releases in favour of
# `save_freq` - confirm against the installed TensorFlow version
model_save = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    period=2,  # Save every 2 epochs
)

# Compile the model with Adam optimizer with a learning rate of 0.0004
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=4e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC()],
)

# Set the amount of epochs to train for
epochs = 20

# Train the model, weighting label 0 by zero_weight and label 1 by 1
history = model.fit(
    X_train_scaled,
    y_train_categorized,
    epochs=epochs,
    batch_size=32,
    class_weight={0: zero_weight, 1: 1},
    validation_data=(X_val_scaled, y_val_categorized),
    callbacks=[early_stopping, model_save],
)

In [None]:
# Load class with getScores function for calculating prediction scores
from feature_selection import selectors

# Initialize list for storing model prediction scores
scores = []

# Number of epochs between two saved checkpoints; must match the `period`
# given to the ModelCheckpoint callback during training
save_period = 2

# Loop through all model checkpoints
# Start = first saved epoch (save_period)
# End = last saved epoch that fits within the number of epochs
for checkpoint_epoch in range(save_period, epochs + 1, save_period):
    path = path_to_parent + f'/model_checkpoints/wk4/checkpoint_{checkpoint_epoch}.h5'

    # Training may have ended before all epochs ran, in which case the later
    # checkpoints were never written. Stop at the first missing file instead
    # of hiding every possible error behind a bare except.
    if not os.path.isfile(path):
        print(f"Checkpoint not found, stopping evaluation: {path}")
        break

    # Load model weights from path and predict test set labels
    # (column 1 of the prediction holds the output for label 1)
    model.load_weights(path)
    y_pred = model.predict(X_test_scaled)[:, 1]

    # Calculate prediction scores and add the dictionary to the list
    score = selectors.getScores(y_test, y_pred)
    score['checkpoint'] = checkpoint_epoch
    scores.append(score)

In [None]:
# Tabulate the per-checkpoint scores, ranked from highest to lowest AUC,
# and show the top rows
scores = pd.DataFrame(scores)
scores = scores.sort_values(by='AUC', ascending=False)
scores.head()