# Explore the BBC News archive

In [None]:
# Fetch the dataset CSV with wget.
# NOTE(review): the URL ends in bbc-text.csv, but the cells below read
# "./BBC News Train.csv" — confirm which file is actually expected here.
!wget https://www.kaggle.com/competitions/learn-ai-bbc/data/bbc-text.csv

In [6]:
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Peek at the first two raw lines of the CSV: the header row and one record.
with open("./BBC News Train.csv", 'r') as csvfile:
    header_line = csvfile.readline()
    first_record = csvfile.readline()

print(f"First line (header) looks like this:\n\n{header_line}")
print(f"Each data point looks like this:\n\n{first_record}")
    



First line (header) looks like this:

ArticleId,Text,Category

Each data point looks like this:




### Removing Stopwords

In [7]:
# Stop words to drop. Kept as a module-level frozenset so the collection is
# built once (not on every call) and each membership test is a hash lookup
# instead of a linear scan through a list.
_STOPWORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "as", "at", "be", "because", "been", "before",
    "being", "below", "between", "both", "but", "by", "could", "did", "do",
    "does", "doing", "down", "during", "each", "few", "for", "from",
    "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's",
    "her", "here", "here's", "hers", "herself", "him", "himself", "his",
    "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
    "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my",
    "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
    "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
    "she'll", "she's", "should", "so", "some", "such", "than", "that",
    "that's", "the", "their", "theirs", "them", "themselves", "then",
    "there", "there's", "these", "they", "they'd", "they'll", "they're",
    "they've", "this", "those", "through", "to", "too", "under", "until",
    "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were",
    "what", "what's", "when", "when's", "where", "where's", "which", "while",
    "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd",
    "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
])


def remove_stopwords(sentence):
  """Lower-case a sentence and drop every stop word from it.

  The sentence is split on whitespace, so punctuation attached to a word
  (e.g. "the,") keeps that word from matching a stop word.

  Args:
    sentence: The text to clean.

  Returns:
    The lower-cased text with stop words removed and the remaining words
    joined by single spaces. An empty input yields an empty string.
  """
  kept_words = [word for word in sentence.lower().split() if word not in _STOPWORDS]
  return ' '.join(kept_words)

In [8]:
# Sanity check: "i", "am", "about", "to", "the", "and" and "any" are stop words and should be dropped.
remove_stopwords("I am about to go to the store and get any snack")

'go store get snack'

### Reading the raw data

In [9]:
def parse_data_from_file(filename):
  """Read the articles CSV and return the cleaned texts and their categories.

  The first row is treated as a header and skipped. For every other row,
  column 1 is taken as the article text and column 2 as its category; both
  values are passed through ``remove_stopwords``.

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A ``(sentences, labels)`` tuple of two lists of strings, in file order.
  """
  sentences = []
  labels = []
  # newline='' leaves line-ending handling to the csv module, as its
  # documentation requires, so quoted fields containing newlines parse correctly.
  with open(filename, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)  # skip the header row (no error if the file is empty)
    for row in reader:
      if not row:
        # csv.reader yields [] for a blank line; nothing to parse there.
        continue
      # NOTE(review): labels also go through remove_stopwords. For single-word
      # categories this only lower-cases them, but a category that is itself
      # a stop word would become '' — confirm this is intended.
      labels.append(remove_stopwords(row[2]))
      sentences.append(remove_stopwords(row[1]))
  return sentences, labels

In [10]:
# Load the cleaned article texts and their category labels from the training CSV.
sentences, labels = parse_data_from_file("./BBC News Train.csv")

<_csv.reader object at 0x7ff17b5abcf0>


In [11]:
# Show the first article as returned by parse_data_from_file (stop words removed).
print(sentences[0])



In [12]:
# Number of whitespace-separated words left in the second article.
print(len(sentences[1].split()))

209


In [13]:
# Total number of labels, plus the first five of them.
len(labels),labels[:5]

(1490, ['business', 'business', 'business', 'tech', 'business'])

In [14]:
def fit_tokenizer(sentences, oov_token=''):
  """Create a Keras ``Tokenizer`` and fit it on the given texts.

  Args:
    sentences: The texts to fit the tokenizer on.
    oov_token: Value forwarded to ``Tokenizer(oov_token=...)``. Defaults to
      the empty string, which is the value this function previously
      hard-coded, so existing callers are unaffected.
      NOTE(review): an empty-string token looks like a placeholder such as
      "<OOV>" that was lost when the notebook was exported — confirm.

  Returns:
    The fitted ``Tokenizer`` instance.
  """
  tokenizer = Tokenizer(oov_token=oov_token)
  tokenizer.fit_on_texts(sentences)
  return tokenizer

In [15]:
# Fit the tokenizer on the cleaned articles and report the vocabulary size.
tokenizer = fit_tokenizer(sentences)
word_index = tokenizer.word_index
print(f"Vocabulary contains {len(word_index)} words\n")
# NOTE(review): this looks up the empty string as the OOV token and prints a
# message with no token name; the text presumably read "<OOV>" before the
# notebook was exported — confirm against fit_tokenizer's oov_token value.
print(" token included in vocabulary" if "" in word_index else " token NOT included in vocabulary")


Vocabulary contains 24963 words

 token included in vocabulary


In [19]:
def get_padded_sequences(tokenizer, sentences):
  """Encode texts with the tokenizer and pad the resulting sequences.

  Args:
    tokenizer: Object whose ``texts_to_sequences`` method encodes the texts.
    sentences: The texts to encode.

  Returns:
    The result of ``pad_sequences`` called with ``padding='post'`` on the
    encoded sequences.
  """
  return pad_sequences(tokenizer.texts_to_sequences(sentences), padding='post')

In [20]:
# Encode and pad every article, then show the first sequence and the overall array shape.
padded_sequences = get_padded_sequences(tokenizer, sentences)
print(f"First padded sequence looks like this: \n\n{padded_sequences[0]}\n")
print(f"Numpy array of all sequences has shape: {padded_sequences.shape}\n")
print(f"This means there are {padded_sequences.shape[0]} sequences in total and each one has a size of {padded_sequences.shape[1]}")


First padded sequence looks like this: 

[1322 1180  592 ...    0    0    0]

Numpy array of all sequences has shape: (1490, 1881)

This means there are 1490 sequences in total and each one has a size of 1881


In [22]:
def tokenize_labels(labels):
  """Encode category labels as integer sequences with a dedicated tokenizer.

  A fresh ``Tokenizer`` (no OOV token) is fitted on the labels and then used
  to encode them.

  Args:
    labels: The label strings to encode.

  Returns:
    A ``(label_sequences, label_word_index)`` tuple: the output of
    ``texts_to_sequences`` for the labels, and the tokenizer's
    ``word_index`` mapping from label word to integer.
  """
  encoder = Tokenizer()
  encoder.fit_on_texts(labels)
  return encoder.texts_to_sequences(labels), encoder.word_index
  

In [23]:
# Encode the labels and show the label vocabulary plus the first ten encoded labels.
label_sequences, label_word_index = tokenize_labels(labels)
print(f"Vocabulary of labels looks like this {label_word_index}\n")
print(f"First ten sequences {label_sequences[:10]}\n")

Vocabulary of labels looks like this {'sport': 1, 'business': 2, 'politics': 3, 'entertainment': 4, 'tech': 5}

First ten sequences [[2], [2], [2], [5], [2], [3], [1], [4], [2], [4]]

