# **Extracting Bag of Words (BoW) Features from Course Textual Content**


In [2]:
import gensim
import pandas as pd
import nltk as nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora

%matplotlib inline

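Download the NLTK resources used below (tokenizer models, stop words, and the POS tagger). Uncomment and run the next cell once if they are not already installed.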


In [3]:
# One-time setup: uncomment and run these once per environment to fetch the
# NLTK data packages needed for tokenization, stop-word removal and POS tagging.
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

In [4]:
# Random state kept for reproducibility.
# NOTE(review): no visible cell reads `rs`; pass it as the seed of any stochastic step added later.
rs = 123

### Bag of Words (BoW) features


### BoW dimensionality reduction


In [15]:
# English stop-word set from NLTK (needs the 'stopwords' corpus to be installed).
# NOTE(review): tokenize_course() builds its own copy, so this global is only useful for inspection.
stop_words = set(stopwords.words('english'))

In [16]:
#stop_words

In [21]:
# Load the course content CSV; the path is relative to the notebook's working directory.
# NOTE(review): despite its name, `course_url` holds a local file name here, not a URL.
course_url = "course_processed.csv"
course_content_df = pd.read_csv(course_url)

In [22]:
# Inspect the first course record to confirm the available columns
course_content_df.iloc[0]

COURSE_ID                                               ML0201EN
TITLE          robots are coming  build iot apps with watson ...
DESCRIPTION    have fun with iot and learn along the way  if ...
Name: 0, dtype: object

The course content dataset has three columns: `COURSE_ID`, `TITLE`, and `DESCRIPTION`. `TITLE` and `DESCRIPTION` are both text columns from which we want to extract BoW features.


Let's join those two text columns together.


In [23]:
# Merge TITLE and DESCRIPTION into a single text column.
# fillna/astype guard against missing or non-string cells, which would make ' '.join raise.
text_columns = ['TITLE', 'DESCRIPTION']
course_content_df['course_texts'] = (
    course_content_df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
)
# Expose the 0-based row position as a leading 'index' column. Dropping any existing
# 'index' column first keeps this cell safe to re-run: a bare reset_index() raises
# "cannot insert index, already exists" on the second execution.
course_content_df = course_content_df.drop(columns='index', errors='ignore').reset_index(drop=True)
course_content_df.insert(0, 'index', course_content_df.index)

In [24]:
# Inspect the first record again to verify the new `index` and `course_texts` columns
course_content_df.iloc[0]

index                                                           0
COURSE_ID                                                ML0201EN
TITLE           robots are coming  build iot apps with watson ...
DESCRIPTION     have fun with iot and learn along the way  if ...
course_texts    robots are coming  build iot apps with watson ...
Name: 0, dtype: object

We define a `tokenize_course()` function to tokenize the course content:


In [25]:
def tokenize_course(course, keep_only_nouns=True):
    """Split a course text into tokens, dropping stop words and numeric tokens.

    Args:
        course: Course text as a single string.
        keep_only_nouns: When True, POS-tag the remaining tokens and discard
            every token whose tag is in the exclusion list below. The list is
            a denylist, so tokens carrying any tag outside it are kept, not
            only nouns.

    Returns:
        list: The surviving tokens, in their original order.
    """
    english_stop_words = set(stopwords.words('english'))
    # POS tags to discard when keep_only_nouns is enabled
    excluded_pos_tags = ['WDT', 'WP', 'WRB', 'FW', 'IN', 'JJR', 'JJS', 'MD', 'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS',
                         'RP']

    def is_content_word(candidate):
        # Stop-word matching is case-insensitive; purely numeric tokens are dropped too
        return candidate.lower() not in english_stop_words and not candidate.isnumeric()

    kept_tokens = list(filter(is_content_word, word_tokenize(course)))
    if not keep_only_nouns:
        return kept_tokens

    # Tag what is left and keep only tokens whose tag is not excluded
    return [token for token, tag in nltk.pos_tag(kept_tokens) if tag not in excluded_pos_tags]

Let's try it on the first course.


In [26]:
# Take the merged text of the first course as a tokenization example
a_course = course_content_df['course_texts'].iloc[0]
a_course

'robots are coming  build iot apps with watson  swift  and node red have fun with iot and learn along the way  if you re a swift developer and want to learn more about iot and watson ai services in the cloud  raspberry pi   and node red  you ve found the right place  you ll build iot apps to read temperature data  take pictures with a raspcam  use ai to recognize the objects in those pictures  and program an irobot create 2 robot  '

In [27]:
#tokenize_course(a_course)

Use the `tokenize_course()` function to tokenize all courses in `course_content_df['course_texts']`.


In [28]:
# Show the merged text of the first course
course_content_df['course_texts'].iloc[0]

'robots are coming  build iot apps with watson  swift  and node red have fun with iot and learn along the way  if you re a swift developer and want to learn more about iot and watson ai services in the cloud  raspberry pi   and node red  you ve found the right place  you ll build iot apps to read temperature data  take pictures with a raspcam  use ai to recognize the objects in those pictures  and program an irobot create 2 robot  '

In [29]:
# Tokenize every course text (POS filtering enabled), preserving row order
tokenized_courses = [
    tokenize_course(course_text, True)
    for course_text in course_content_df['course_texts']
]
    

Then we need to create a token dictionary, `tokens_dict`, that maps every distinct token to an integer id.


In [30]:
# Build the token dictionary from all tokenized courses
tokens_dict = corpora.Dictionary(tokenized_courses)

Then we can use the `doc2bow()` method to generate BoW features for each tokenized course.


In [31]:
# Convert each tokenized course into its bag-of-words representation
bow_docs = [tokens_dict.doc2bow(course_tokens) for course_tokens in tokenized_courses]

Lastly, we need to collect the BoW features of every course into a new BoW dataframe. The new dataframe needs to include the following columns:
- 'doc_index': the course index, starting from 0
- 'doc_id': the actual course id, such as `ML0201EN`
- 'token': a token that occurs in the course
- 'bow': the BoW value of that token, i.e. the number of times it occurs in the course


In [32]:
 # Enumerate through each course and its bag-of-words representation
doc_indices = []
doc_ids = []
tokens = []
bow_values = []

# Look COURSE_ID up by position: bow_docs is ordered like the rows of course_content_df,
# whereas a label-based `[doc_index]` lookup only works when the frame's index is exactly 0..n-1.
course_ids = course_content_df['COURSE_ID'].tolist()

for doc_index, doc_bow in enumerate(bow_docs):
    # Resolve the course id once per course instead of once per token
    course_id = course_ids[doc_index]

    for token_index, token_bow in doc_bow:
        doc_indices.append(doc_index)
        doc_ids.append(course_id)

        # Retrieve the token from the tokens dictionary based on its index
        token = tokens_dict.get(token_index)
        tokens.append(token)
        bow_values.append(token_bow)

# One row per (course, token) pair
bow_dicts = {"doc_index": doc_indices,
             "doc_id": doc_ids,
             "token": tokens,
             "bow": bow_values}

# Keep the full frame in a variable so later cells can reuse it; display only a preview
bow_df = pd.DataFrame(bow_dicts)
bow_df.head()

Unnamed: 0,doc_index,doc_id,token,bow
0,0,ML0201EN,ai,2
1,0,ML0201EN,apps,2
2,0,ML0201EN,build,2
3,0,ML0201EN,cloud,1
4,0,ML0201EN,coming,1
