<a href="https://colab.research.google.com/github/Otobi1/Back-to-Basics-A-Refresher-/blob/master/Back_to_Basics_CNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
## Set up 

import numpy as np 
import pandas as pd
import random 
import torch 
import torch.nn as nn

In [26]:
SEED = 1234

In [27]:
def set_seeds(seed = 1234):
  """Set seed for reproducibility."""
  np.random.seed(seed)
  random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed) # multi GPU

In [28]:
# Set seed for reproducibility 

set_seeds(seed=SEED)

In [29]:
# Set device 

cuda = True
device = torch.device("cuda" if (
    torch.cuda.is_available() and cuda) else "cpu")
torch.set_default_tensor_type("torch.FloatTensor")
if device.type == "cuda":
  torch.set_default_tensor_type("torch.cuda.FloatTensor")
print (device)

cuda


In [30]:
# Load data 
# - corpus of news article from http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html 

url = "https://raw.githubusercontent.com/GokuMohandas/madewithml/main/datasets/news.csv"
df = pd.read_csv(url, header = 0) # load
df = df.sample(frac = 1).reset_index(drop = True) # shuffle
df.head()

Unnamed: 0,title,category
0,Sharon Accepts Plan to Reduce Gaza Army Operat...,World
1,Internet Key Battleground in Wildlife Crime Fight,Sci/Tech
2,July Durable Good Orders Rise 1.7 Percent,Business
3,Growing Signs of a Slowing on Wall Street,Business
4,The New Faces of Reality TV,World


In [31]:
# Preprocessing 
# - to clean up the data, convert to lower text, remove filler words and filter using regex.

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [32]:
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
print (STOPWORDS[:5])
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we']


In [33]:
def preprocess(text, stopwords = STOPWORDS):
  """Conditional preprocessing on our text unique to the task."""
  # Lower 
  text = text.lower()

  # Remove stopwords
  pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
  text = pattern.sub(" ", text)

  # Remove words in parenthesis
  text = re.sub(r"\([^)]*\)", " ", text)

  # Spacing and filters 
  text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
  text = re.sub('[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric characters
  text = re.sub(' +', ' ', text) # remove multiple spaces 
  text = text.strip()

  return text

In [34]:
# Sample 

text = "Great week for the NYSE"
preprocess(text = text)

'great week nyse'

In [35]:
# Apply to dataframe

preprocessed_df = df.copy()
preprocessed_df.title = preprocessed_df.title.apply(preprocess)
print (f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")

Sharon Accepts Plan to Reduce Gaza Army Operation, Haaretz Says

sharon accepts plan reduce gaza army operation haaretz says


In [36]:
# if you have preprocessing steps like standardisation, that are calculated, you need to separate the training an dtest set first before applying those operations. 
# this is because we cannot apply any knowledge gained from the test set accidentally (data leak during preprocessing/training). 
# - for global preprocessing steps like the functin above, where we arent learning anything from the data itself, we can perform them before splitting the data.

In [37]:
# Split the data 

import collections
from sklearn.model_selection import train_test_split

In [38]:
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

In [39]:
def train_val_test_split(X, y, train_size):
  """Split dataset into data split."""
  X_train, X_, y_train, y_ = train_test_split(X, y, train_size = TRAIN_SIZE, stratify = y)
  X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size = 0.5, stratify = y_)
  return X_train, X_val, X_test, y_train, y_val, y_test

In [40]:
# Data
X = preprocessed_df["title"].values 
y = preprocessed_df["category"].values

In [41]:
# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X = X, y = y, train_size = TRAIN_SIZE)

print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_train: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} -> {y_train[0]}")


X_train: (84000,), y_train: (84000,)
X_val: (18000,), y_val: (18000,)
X_train: (18000,), y_test: (18000,)
Sample point: china battles north korea nuclear talks -> World


In [42]:
# Label Encoding 
# to encode the text labels into unique indices

import itertools

In [43]:
class LabelEncoder(object):
  """Label encoder for tag labels."""
  def __init__(self, class_to_index = {}):
    self.class_to_index = class_to_index
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())

  def __len__(self):
    return len(self.class_to_index)

  def __str__(self):
    return f"<LabelEncoder(num_classes = {len(self)})>"
  
  def fit(self, y):
    classes = np.unique(y_train)
    for i, class_ in enumerate(classes):
      self.class_to_index[class_] = i
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())
    return self

  def encode(self, y):
    encoded = np.zeros((len(y)), dtype = int)
    for i, item in enumerate(y):
      encoded[i] = self.class_to_index[item]
    return encoded

  def decode(self, y):
    classes = []
    for i, item in enumerate(y):
      classes.append(self.index_to_class[item])
    return classes

  def save(self, fp):
    with open(fp, "w") as fp:
      contents = {"class_to_index": self.class_to_index}
      json.dump(contents, fp, indent = 4, sort_keys = False)

  @classmethod
  def load(cls, fp):
    with open(fp, "r") as fp:
      kwargs = json.load(fp = fp)
    return clas(**kwargs)

In [44]:
# Encode 

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index

{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}

In [45]:
# Converting labels to tokens 

print (f"y_train[0]: {y_train[0]}")

y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)

print (f"y_train[0]: {y_train[0]}")

y_train[0]: World
y_train[0]: 3


In [46]:
# Class weights 

counts = np.bincount(y_train)
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"counts: {counts}\nweights: {class_weights}")

counts: [21000 21000 21000 21000]
weights: {0: 4.761904761904762e-05, 1: 4.761904761904762e-05, 2: 4.761904761904762e-05, 3: 4.761904761904762e-05}
