<a href="https://colab.research.google.com/github/RamezNabil/TensorFlow-helper-functions-and-blocks/blob/main/TensorFlow_Helper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting up the Kaggle API

In [None]:
!pip install -q kaggle

In [None]:
# Upload your Kaggle API json file (kaggle.json) from your local machine
from google.colab import files

# files.upload() returns a dict mapping each uploaded filename to the file's raw bytes.
# The trailing semicolon stops that dict from being echoed as the cell output,
# which would otherwise store the API key in the saved notebook.
files.upload();

In [None]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets list

## Downloading, Unzipping, Saving, and Loading

In [None]:
# Download the data files of the competition selected with -c (the competition's URL slug)
!kaggle competitions download -c street-view-getting-started-with-julia

In [None]:
# Unzipping the dataset into content/Dataset
!unzip "/content/trainResized.zip" -d Dataset
!unzip "/content/testResized.zip" -d Dataset

In [None]:
# Removing a file or a whole directory tree: -r deletes recursively, -f ignores a missing path and never prompts
%rm -rf Dataset/IDC_regular_ps50_idx5

In [None]:
# Making csv file for submission
import pandas as pd

# One row per test sample: its ID next to the predicted class
df = pd.DataFrame({'ID': ids, 'Class': y_pred})
# index=False keeps the dataframe's row index out of the file
df.to_csv('submission.csv', index=False)

## Pre-processing

In [None]:
# Shuffle training dataframe: sampling 100% of the rows returns all of them in random order;
# the fixed random_state makes that order reproducible
train_df_shuffled = train_df.sample(
    frac=1,
    random_state=42,
)

## Sklearn

In [None]:
# Splitting data: hold out 20% of the samples as the test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Transforming y to numbered labels
from sklearn import preprocessing

# fit() returns the encoder itself, so it can be created and fitted in one expression;
# it is fitted on the sorted list of unique class labels
le = preprocessing.LabelEncoder().fit(sorted(data_labels['Class'].unique()))
# Replace every label in y by its integer code
y = le.transform(y)

In [None]:
# Baseline NLP model: TF-IDF features fed into a Multinomial Naive Bayes classifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Chain tokenization/vectorization and the classifier into a single estimator
model_0 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),  # text -> TF-IDF weighted term vectors
    ("clf", MultinomialNB()),      # classifier trained on those vectors
])

# Train the whole pipeline on the training split
model_0.fit(train_sentences, train_labels)

# Score the pipeline on the validation split
baseline_score = model_0.score(val_sentences, val_labels)

# Predicted labels for the validation split
baseline_preds = model_0.predict(val_sentences)

## Vectorization

In [None]:
import tensorflow as tf

# TextVectorization moved out of the `experimental.preprocessing` namespace in newer
# TensorFlow releases; prefer the stable path and fall back to the old one on older versions.
try:
  from tensorflow.keras.layers import TextVectorization
except ImportError:
  from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (None = no cap on vocabulary size)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [None]:
# Fit the text vectorizer to the training text
# (must run before the vectorizer is used, so that it has a vocabulary)
text_vectorizer.adapt(train_sentences)

In [None]:
# Get the unique words in the vocabulary of the text vectorizer
words_in_vocab = text_vectorizer.get_vocabulary()

## Embedding

* input_dim - The size of the vocabulary (e.g. `len(text_vectorizer.get_vocabulary())`).
* output_dim - The size of the output embedding vector, for example, a value of 100 outputs a feature vector of size 100 for each word.
* embeddings_initializer - How to initialize the embeddings matrix. The default is "uniform", which randomly initializes the embedding matrix from a uniform distribution. This can be changed to use pre-learned embeddings.
* input_length - Length of sequences being passed to embedding layer.

***Note: Embedding takes the output of the vectorizer as an input***

In [None]:
from tensorflow.keras import layers

# Embedding layer: maps each token index to a dense vector
embedding = layers.Embedding(
    input_dim=max_vocab_length,        # size of the vocabulary
    output_dim=128,                    # size of each embedding vector
    embeddings_initializer="uniform",  # default, initialize randomly
    input_length=max_length,           # how long is each input sequence
)

In [None]:
# Embed the random sentence (turn it into numerical representation):
# the sentence is wrapped in a list, passed through the vectorizer, and the result is fed to the embedding layer
sample_embed = embedding(text_vectorizer([random_sentence]))

## Fine-tuning

In [None]:
# Unfreezing some layers for fine-tuning: everything except the last 10 layers
# of the base model is frozen again after the whole model is marked trainable
base_model.trainable = True
for layer in base_model.layers[:-10]:
  layer.trainable = False

# NOTE(review): Keras only takes changes to `trainable` into account when the model is
# compiled, so call model.compile(...) again before this fit (the compile arguments are
# not visible in this cell; a lower learning rate is typical for fine-tuning).

# Fit the model, continuing the epoch count of the first training run.
# history.epoch holds the 0-based indices of the epochs already run, so the next
# epoch index is its length; its last element would repeat the final epoch index.
# `epochs` is the index of the final epoch, not the number of additional epochs.
history_finetune = model.fit(x=X_train,
                             y=y_train,
                             validation_data=(X_test, y_test),
                             epochs=10,
                             initial_epoch=len(history.epoch))

In [None]:
# Plotting val accuracy before and after fine-tuning with a separating line
val_acc = history.history["val_accuracy"]
total_val_acc = val_acc + history_finetune.history["val_accuracy"]

# x position of the last epoch before fine-tuning, derived from the first history
# instead of being hard-coded, so the marker is right for any number of initial epochs
fine_tune_start = len(val_acc) - 1

plt.plot(total_val_acc, label='Validation Accuracy')
# Vertical line spanning the current y-limits of the plot
plt.plot([fine_tune_start, fine_tune_start], plt.ylim(), label='Start Fine Tuning')
plt.legend(loc='lower right')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.show()

## Callbacks

In [None]:
def create_tensorboard_callback(dir_name, experiment_name):
  """
  Creates a TensorBoard callback instance to store log files.
  Stores log files with the filepath:
    "dir_name/experiment_name/current_datetime/"
  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)
  Returns:
    A tf.keras.callbacks.TensorBoard instance that logs to that filepath.
  """
  # Imported locally: the notebook never imports datetime, so the function
  # would otherwise fail with a NameError when called.
  import datetime

  # Timestamp down to the second, so each run gets its own sub-directory
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  log_dir = dir_name + "/" + experiment_name + "/" + timestamp
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir
  )
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback

## Visualization

In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Computes accuracy, precision, recall and f1 score for a classification model.

  Precision, recall and f1 are computed with the "weighted" average.
  Accuracy is multiplied by 100; the other three values are returned as computed.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  """
  # Accuracy first, scaled by 100
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The fourth return value (support) is not needed
  scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  precision, recall, f1 = scores[0], scores[1], scores[2]
  return {
      "accuracy": accuracy_pct,
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }

In [None]:
# How long of a sentence covers 95% of the lengths?
import numpy as np

# 95th percentile of the sentence lengths, truncated to an integer
output_seq_len = int(np.percentile(sent_lens, 95))