# 1. Retrieving data from the Aalto Open Courses API

In [73]:
import os
from dotenv import load_dotenv
import json

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# User key for the course API; stays None when COURSE_API_KEY is not set.
api_key = os.environ.get('COURSE_API_KEY')


In [74]:
import requests

# Load the English-taught course realisations into `raw_data`, preferring the
# local cache so the API is only contacted when no cache file exists.
data_path = 'data/data.json'

if os.path.isfile(data_path):
    # Cache hit: the file holds the English-only subset written by the branch below.
    with open(data_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
else:
    # Cache miss: download every course unit realisation from the API.
    if not api_key:
        raise RuntimeError("COURSE_API_KEY is not set; cannot query the course API.")

    url = 'https://course.api.aalto.fi:443/api/sisu/v1/courseunitrealisations'
    # Send the key as a URL-encoded query parameter, bound the wait with a
    # timeout, and fail loudly on an HTTP error instead of caching an error body.
    r = requests.get(url, params={'user_key': api_key}, timeout=60)
    r.raise_for_status()

    # get data
    raw_data = r.json()
    print("There are", len(raw_data), "courses in total.")

    # keep only the courses whose languages of instruction include English
    en_data = [course for course in raw_data if 'en' in course['languageOfInstructionCodes']]
    not_en_data = [course for course in raw_data if 'en' not in course['languageOfInstructionCodes']]

    print("There are", len(en_data), "courses in English.")
    print("There are", len(not_en_data), "courses not in English.")

    # save the English subset so later runs can skip the download;
    # create the cache directory first so a fresh checkout does not fail here
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(en_data, f)

    raw_data = en_data

In [75]:
import pandas as pd
import re

# Flatten the nested API records in `raw_data` into one typed row per course realisation.
courses = pd.DataFrame(raw_data)

# These nested columns are either flattened into dedicated columns below or not used.
courses = courses.drop(columns=['summary', 'languageOfInstructionCodes', 'studySubGroups'])

# English text of the summary fields
courses['content'] = [course['summary']['content']['en'] for course in raw_data]
courses['learningOutcomes'] = [course['summary']['learningOutcomes']['en'] for course in raw_data]
courses['teachingPeriod'] = [course['summary']['teachingPeriod']['en'] for course in raw_data]
courses['linkToCourse'] = [course['summary']['additionalInformation']['en'] for course in raw_data]

# Replace nested values by their English name / upper credit bound / first organisation id.
courses['name'] = [course['name']['en'] for course in raw_data]
courses['credits'] = [course['credits']['max'] for course in raw_data]
courses = courses.rename(columns={'credits': 'maxcredits'})
courses['organizationName'] = [course['organizationName']['en'] for course in raw_data]
courses['organizations'] = [course['organizations'][0]['organisationId'] for course in raw_data]

# Extract the first http(s) link from each row's additional-information text.
# This is done per row: searching the string form of the whole column would
# assign one and the same link to every course. Rows without a link get ''.
courses['linkToCourse'] = (
    courses['linkToCourse']
    .astype(str)
    .str.extract(r"(?P<url>https?://[^\s]+)", expand=False)
    .fillna('')
)

# Reorganize the columns. `.copy()` makes the selection an independent frame so
# the assignments below do not act on a view of the wider frame.
# NOTE(review): 'mincredits' is selected here but never derived in this cell —
# presumably it is already a top-level field of the API records; confirm.
column_order = ['id', 'code', 'courseUnitId', 'type', 'name', 'content', 'learningOutcomes',
                'teachers', 'teachingPeriod', 'startDate', 'endDate', 'mincredits', 'maxcredits',
                'enrolmentStartDate', 'enrolmentEndDate', 'organizationId', 'organizationName',
                'organizations', 'linkToCourse']
courses = courses[column_order].copy()

# Fix the type of the columns: text -> str, credits -> int, dates -> datetime.
text_columns = ['id', 'code', 'type', 'name', 'content', 'learningOutcomes', 'teachers',
                'teachingPeriod', 'organizationId', 'organizationName', 'organizations',
                'linkToCourse']
date_columns = ['startDate', 'endDate', 'enrolmentStartDate', 'enrolmentEndDate']

courses = courses.astype({**{column: str for column in text_columns},
                          'mincredits': int, 'maxcredits': int})
for column in date_columns:
    courses[column] = pd.to_datetime(courses[column])

courses.head()


Unnamed: 0,id,code,courseUnitId,type,name,content,learningOutcomes,teachers,teachingPeriod,startDate,endDate,mincredits,maxcredits,enrolmentStartDate,enrolmentEndDate,organizationId,organizationName,organizations,linkToCourse
0,aalto-CUR-162063-3082770,LC-1117,aalto-OPINKOHD-1117673055-20210801,teaching-participation-lectures,"Integrated Oral and Written Skills (o,w) (H06 ...",This course introduces written and oral commun...,"Upon completion of this course, the students w...",['Hanna Liisa Hakala'],"2020-2021 Autumn I-II,Spring III-IV,Spring IV-...",2023-01-11,2023-02-15,3,3,2022-12-12,2023-01-02,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
1,aalto-CUR-162064-3082771,LC-1117,aalto-OPINKOHD-1117673055-20210801,teaching-participation-lectures,"Integrated Oral and Written Skills (o,w) (H07 ...",This course introduces written and oral commun...,"Upon completion of this course, the students w...",['Jenni Maria Korvala'],"2020-2021 Autumn I-II,Spring III-IV,Spring IV-...",2023-01-11,2023-03-29,3,3,2022-12-12,2023-01-10,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
2,aalto-CUR-162065-3082772,LC-1117,aalto-OPINKOHD-1117673055-20210801,teaching-participation-lectures,"Integrated Oral and Written Skills (o,w), Lect...",This course introduces written and oral commun...,"Upon completion of this course, the students w...",['Maxi-Ann Marie A Campbell'],"2020-2021 Autumn I-II,Spring III-IV,Spring IV-...",2023-01-17,2023-04-04,3,3,2022-12-12,2023-01-12,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
3,aalto-CUR-162066-3082773,LC-1117,aalto-OPINKOHD-1117673055-20210801,teaching-participation-lectures,"Integrated Oral and Written Skills (o,w) (BSc ...",This course introduces written and oral commun...,"Upon completion of this course, the students w...",['Riina Marketta Seppälä'],"2020-2021 Autumn I-II,Spring III-IV,Spring IV-...",2023-03-03,2023-05-19,3,3,2023-01-30,2023-02-20,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
4,aalto-CUR-162068-3082775,LC-1117,aalto-OPINKOHD-1117673055-20210801,teaching-participation-lectures,"Integrated Oral and Written Skills (o,w) (Int...",This course introduces written and oral commun...,"Upon completion of this course, the students w...","['Susan Katariina Gamache', 'Malachy James Hal...","2020-2021 Autumn I-II,Spring III-IV,Spring IV-...",2023-04-25,2023-05-23,3,3,2023-03-27,2023-04-17,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...


In [82]:
# Compare de-duplicating on "courseUnitId" with de-duplicating on "code"
# (counts noted earlier for this data: 993 unique courseUnitIds, 985 unique codes).
first_per_unit = ~courses.duplicated(subset=["courseUnitId"], keep="first")
first_per_code = ~courses.duplicated(subset=["code"], keep="first")

unique_courseUnitId = courses[first_per_unit]
unique_code = courses[first_per_code]

# ids kept by the courseUnitId filter but dropped by the code filter
id_difference = list(set(unique_courseUnitId["id"]).difference(unique_code["id"]))

# the rows on which the two filters disagree
courses.loc[courses["id"].isin(id_difference)]

Unnamed: 0,id,code,courseUnitId,type,name,content,learningOutcomes,teachers,teachingPeriod,startDate,endDate,mincredits,maxcredits,enrolmentStartDate,enrolmentEndDate,organizationId,organizationName,organizations,linkToCourse
518,aalto-CUR-164698-3085405,MS-A0001,aalto-OPINKOHD-1112895518-20210801,exam-exam,"Matrix Algebra, Exam","Vector computations, matrices and systems of l...",After the course the student - can write syste...,['Harri Heimo Petteri Hakula'],"2020-2021 Autumn II, 2021-2022 Autumn II",2023-02-24,2023-02-24,5,5,2022-12-26,2023-02-17,T302,Department of Mathematics and Systems Analysis,aalto-a3b50b1e-ed7d-4136-8897-c7aa1a4f899b,https://mycourses.aalto.fi/co...
606,aalto-CUR-165469-3086176,MS-C1541,aalto-OPINKOHD-1142267517-20210801,teaching-participation-lectures,"Metric Spaces, Lecture","real numbers, metric, norm, inner product, ope...",After passing the course the student knows - m...,['Kalle Perttu Juhana Kytölä'],"2020-2021 Spring III, 2021-2022 Spring III",2023-01-09,2023-02-23,5,5,2022-12-12,2023-01-02,T302,Department of Mathematics and Systems Analysis,aalto-a3b50b1e-ed7d-4136-8897-c7aa1a4f899b,https://mycourses.aalto.fi/co...
622,aalto-CUR-165738-3086445,PHYS-E0525,otm-ebad0f6d-8023-47e2-bfe1-724ca4b10653,exam-exam,"Microscopy of Nanomaterials D, Exam",The course gives basic knowledge of the micros...,"After the course, students will understand the...","['Janne Tapio Ruokolainen', 'Hua Jiang']",<p> Teaching Language : English</p><p> Teachin...,2023-04-18,2023-04-18,5,5,2023-02-17,2023-04-11,T304,Department of Applied Physics,aalto-7b8d166f-e80e-4481-838e-72098c5ea180,https://mycourses.aalto.fi/co...
629,aalto-CUR-165886-3086593,PHYS-C0256,otm-4cb03484-428b-4232-8db0-16dfff5ed692,exam-exam,"Thermodynamics and Statistical Physics, Exam",The basic concepts and assumptions of statisti...,After the course the student\r\n<ol><li>Can ex...,['Jukka Pekka Pekola'],<p> Teaching Language : English</p><p> Teachin...,2023-02-22,2023-02-22,5,5,2022-12-24,2023-02-15,T304,Department of Applied Physics,aalto-7b8d166f-e80e-4481-838e-72098c5ea180,https://mycourses.aalto.fi/co...
633,aalto-CUR-165900-3086607,PHYS-E055103,otm-38943da8-7b1d-408a-855f-201f0fea96e8,exam-exam,"Low Temperature Physics D, Superconductivity, ...",The Bardeen-Cooper-Schrieffer theory of superc...,The students will get a basic understanding of...,['Vladimir Eltsov'],"<p>In 2022-2024, this course is available as a...",2023-04-18,2023-04-18,5,6,2023-02-17,2023-04-11,T304,Department of Applied Physics,aalto-7b8d166f-e80e-4481-838e-72098c5ea180,https://mycourses.aalto.fi/co...
736,aalto-CUR-166588-3087295,NBE-E4100,otm-9881eb6c-844e-4431-a1cf-8327dca88257,exam-exam,"Molecular Biophysics D, Exam",<ul><li>Fundamentals of biomolecules: Biopolym...,After passing the course the student is able t...,['Anton Kuzyk'],<p> Teaching Language : English</p><p> Teachin...,2023-05-31,2023-05-31,5,5,2023-04-01,2023-05-24,T314,Department of Neuroscience and Biomedical Engi...,aalto-1311f55a-509b-485d-b974-5ddaf28ffaa8,https://mycourses.aalto.fi/co...
1272,aalto-CUR-170151-2411704,PHYS-C0252,otm-7cef259b-e7cd-4495-a080-01c0d58cf8cc,exam-exam,"Quantum Mechanics, Exam","Hilbert space and Dirac notation; Operators, e...",After completing the course the student\r\n<ol...,"['Mikko Pentti Matias Möttönen', 'Tapio Ala-Ni...",<p> Teaching Language : English</p><p> Teachin...,2023-06-08,2023-06-08,5,5,2023-04-09,2023-06-01,T304,Department of Applied Physics,aalto-7b8d166f-e80e-4481-838e-72098c5ea180,https://mycourses.aalto.fi/co...
1275,aalto-CUR-170160-2412218,PHYS-C0254,otm-54fd870b-cc18-4a73-9bee-eeb5acf2b0d0,exam-exam,"Quantum Circuits, Exam",The physical foundations and implementation of...,"After completing this course, the student is a...","['Gheorghe-Sorin Paraoanu', 'Jan Goetz']",<p> Teaching Language : English</p><p> Teachin...,2023-04-20,2023-04-20,5,5,2023-02-19,2023-04-13,T304,Department of Applied Physics,aalto-7b8d166f-e80e-4481-838e-72098c5ea180,https://mycourses.aalto.fi/co...


In [77]:
# Content text of every exam / lecture realisation of course MS-C1541.
is_ms_c1541 = courses["code"] == "MS-C1541"
MS_C1541_exams = courses.loc[is_ms_c1541 & (courses["type"] == "exam-exam"), "content"]
MS_C1541_teachings = courses.loc[
    is_ms_c1541 & (courses["type"] == "teaching-participation-lectures"), "content"
]

# 603 and 606 are row labels of `courses`.
# NOTE(review): hard-coded labels — presumably only valid for the cached data
# snapshot and before the index is reset; confirm when the data is refreshed.
exam_content = MS_C1541_exams[603]
lecture_content = MS_C1541_teachings[606]

# exam course contents
print(exam_content)
# teaching course contents
print(lecture_content)

# both realisations of the same course code carry identical content text
assert exam_content == lecture_content

real numbers, metric, norm, inner product, open and closed sets, continuous mappings, sequences and limits, compactness, completeness, connectedness.
real numbers, metric, norm, inner product, open and closed sets, continuous mappings, sequences and limits, compactness, completeness, connectedness.


In [78]:
# Mean number of characters in the two free-text fields.
mean_content_length = courses["content"].map(len).mean()
mean_outcomes_length = courses["learningOutcomes"].map(len).mean()

print("Average length of course contents:", mean_content_length)
print("Average length of learning outcomes:", mean_outcomes_length)

Average length of course contents: 461.59348441926346
Average length of learning outcomes: 472.3201133144476


After these findings, we decided to de-duplicate the data on the `code` field. We will also combine the learning outcomes field with the content field to create a new field that will be used for the recommender system.

In [87]:
# Keep the first realisation of every course code and renumber the rows from 0.
courses = (
    courses
    .drop_duplicates(subset=["code"], keep="first")
    .reset_index(drop=True)
)

courses

Unnamed: 0,id,code,courseUnitId,type,name,content,learningOutcomes,teachers,teachingPeriod,startDate,endDate,mincredits,maxcredits,enrolmentStartDate,enrolmentEndDate,organizationId,organizationName,organizations,linkToCourse
0,aalto-CUR-162063-3082770,LC-1117,aalto-OPINKOHD-1117673055-20210801,teaching-participation-lectures,"Integrated Oral and Written Skills (o,w) (H06 ...",This course introduces written and oral commun...,"Upon completion of this course, the students w...",['Hanna Liisa Hakala'],"2020-2021 Autumn I-II,Spring III-IV,Spring IV-...",2023-01-11,2023-02-15,3,3,2022-12-12,2023-01-02,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
1,aalto-CUR-162078-3082785,LC-1310,aalto-OPINKOHD-1117673220-20210801,teaching-participation-lectures,"Academic Communication for MSc Students (o,w),...",The course is intended for students in master&...,"Upon completion of this course, students will ...",['Jan-Mikael Rybicki'],"2020-2021 Autumn I-II,Spring III-IV , 2021-202...",2023-01-13,2023-04-21,3,3,2022-12-12,2023-01-12,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
2,aalto-CUR-162115-3082822,LCA-1015,aalto-OPINKOHD-1126097189-20210801,teaching-participation-small-group,"Autonomous Language Learning Path (o,w), Small...",The students will devise a personalized learni...,The main goal of this course is that students ...,['Hanna Liisa Hakala'],"2020-2021 Autumn I-II,Spring III-IV , 2021-202...",2023-01-09,2023-04-21,3,3,2022-12-12,2023-01-02,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
3,aalto-CUR-162118-3082825,LCA-1022,aalto-OPINKOHD-1126098837-20210801,teaching-participation-lectures,"Academic Writing (w), Lecture",Successful participation requires that you hav...,"Upon completion of this course, students will ...",['Matthew Peter Billington'],"2020-2021 Autumn I,Autumn II,Spring III,Spring...",2023-01-11,2023-02-13,3,3,2022-12-12,2023-01-02,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
4,aalto-CUR-162124-3082831,LC-1113,aalto-OPINKOHD-1126098945-20210801,teaching-participation-small-group,"Autonomous Language Learning Path (o,w), Sma...",The students will devise a personalised learni...,The main goal of this course is that students ...,['Hanna Liisa Hakala'],"2020-2021 Autumn I-II,Spring III-V , 2021-2022...",2023-01-09,2023-04-21,3,3,2022-12-12,2023-01-02,U926,"Aalto University, Language Centre",aalto-52ed67c2-4791-4ee6-9475-547b73c8d10a,https://mycourses.aalto.fi/co...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,otm-e8183aa3-e6d7-4d44-952b-fe4c229426d8,CS-E4675,aalto-CU-1150933422-20220801,teaching-participation-project,"Full Stack Web Development D, Project",The contents of the course are openly availabl...,The course is offered by the University of Hel...,['Arto Hellas'],<p> Teaching Language : English</p><p> Teachin...,2023-01-09,2023-06-09,5,7,2022-12-12,2023-05-29,T313,Department of Computer Science,aalto-3b3aa303-843a-4a29-97ca-29c45d53f923,https://mycourses.aalto.fi/co...
981,otm-edfae58b-3d21-4fcb-a82b-3c4a789c0751,CS-E400604,otm-acc4b322-b771-43cc-b734-3121c68c0b93,teaching-participation-project,Research Experience Project in Computer Scienc...,,,['Olli Pekka Orponen'],,2023-01-01,2023-07-31,5,5,2023-01-01,2023-07-17,T313,Department of Computer Science,aalto-3b3aa303-843a-4a29-97ca-29c45d53f923,https://mycourses.aalto.fi/co...
982,otm-ee70c3a2-38e6-472b-9d38-45386b12fb98,TU-CV00011,otm-86a3f11a-6e29-4691-ab8a-19c9fce9fad6,teaching-participation-lectures,"Thinking Tools, Lectures",,,['Lauri Veikko Järvilehto'],,2023-02-28,2023-04-28,3,3,2023-01-30,2023-03-06,T307,Department of Industrial Engineering and Manag...,aalto-9a94b0a1-836d-4384-9bde-6709e8a053db,https://mycourses.aalto.fi/co...
983,otm-f08696a1-8207-45b3-9958-524bf5146c22,ENG-LV,aalto-OPINKOHD-1125772556-20210801,teaching-participation-lectures,"Course with Varying Content, V D, Lectures",,,"['Harri Juhani Koivusalo', 'Maija Kaarina Taka']",,2023-04-26,2023-05-31,1,10,2023-02-01,2023-04-16,T2,School of Engineering,aalto-f58f28c8-3503-48e3-a223-41840f239806,https://mycourses.aalto.fi/co...


In [101]:
# Content text of the first course, used as the worked example in the cells below.
example_course = courses["content"].iloc[0]
example_course

'This course introduces written and oral communication principles and strategies that are applicable to professional and academic purposes and is at the same time integrated with a content course. Written tasks and oral tasks will support the content course and be largely determined by its requirements. Throughout this course, students work individually and/or in small groups to develop their presentation and writing skills. Moreover, students give and receive constructive feedback on their work and revise it accordingly. Working Life Skills: Varies according to each integration project.'

In [102]:
from transformers import BertTokenizer

# Tokenizer of the 'bert-base-uncased' checkpoint; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [108]:
# Split the example text into tokens (no special tokens are added here).
tokenized_text = tokenizer.tokenize(example_course)

# Token ids of the same text with '[CLS]' and '[SEP]' added by the tokenizer.
# No truncation length and no tensor conversion are requested.
encoded = tokenizer.encode(example_course, add_special_tokens=True)

In [109]:
# Wrap the example's tokens in BERT's special markers. The tokens are derived
# afresh from `example_course` instead of extending `tokenized_text` in place,
# so re-running this cell never stacks additional '[CLS]' / '[SEP]' markers.
tokenized_text = ["[CLS]"] + tokenizer.tokenize(example_course) + ["[SEP]"]
tokenized_text

['[CLS]',
 'this',
 'course',
 'introduces',
 'written',
 'and',
 'oral',
 'communication',
 'principles',
 'and',
 'strategies',
 'that',
 'are',
 'applicable',
 'to',
 'professional',
 'and',
 'academic',
 'purposes',
 'and',
 'is',
 'at',
 'the',
 'same',
 'time',
 'integrated',
 'with',
 'a',
 'content',
 'course',
 '.',
 'written',
 'tasks',
 'and',
 'oral',
 'tasks',
 'will',
 'support',
 'the',
 'content',
 'course',
 'and',
 'be',
 'largely',
 'determined',
 'by',
 'its',
 'requirements',
 '.',
 'throughout',
 'this',
 'course',
 ',',
 'students',
 'work',
 'individually',
 'and',
 '/',
 'or',
 'in',
 'small',
 'groups',
 'to',
 'develop',
 'their',
 'presentation',
 'and',
 'writing',
 'skills',
 '.',
 'moreover',
 ',',
 'students',
 'give',
 'and',
 'receive',
 'constructive',
 'feedback',
 'on',
 'their',
 'work',
 'and',
 'rev',
 '##ise',
 'it',
 'accordingly',
 '.',
 'working',
 'life',
 'skills',
 ':',
 'varies',
 'according',
 'to',
 'each',
 'integration',
 'project',
 

In [113]:
# Map every token string in `tokenized_text` to its integer id in the tokenizer's vocabulary.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
indexed_tokens

[101,
 2023,
 2607,
 13999,
 2517,
 1998,
 8700,
 4807,
 6481,
 1998,
 9942,
 2008,
 2024,
 12711,
 2000,
 2658,
 1998,
 3834,
 5682,
 1998,
 2003,
 2012,
 1996,
 2168,
 2051,
 6377,
 2007,
 1037,
 4180,
 2607,
 1012,
 2517,
 8518,
 1998,
 8700,
 8518,
 2097,
 2490,
 1996,
 4180,
 2607,
 1998,
 2022,
 4321,
 4340,
 2011,
 2049,
 5918,
 1012,
 2802,
 2023,
 2607,
 1010,
 2493,
 2147,
 14258,
 1998,
 1013,
 2030,
 1999,
 2235,
 2967,
 2000,
 4503,
 2037,
 8312,
 1998,
 3015,
 4813,
 1012,
 9308,
 1010,
 2493,
 2507,
 1998,
 4374,
 26157,
 12247,
 2006,
 2037,
 2147,
 1998,
 7065,
 5562,
 2009,
 11914,
 1012,
 2551,
 2166,
 4813,
 1024,
 9783,
 2429,
 2000,
 2169,
 8346,
 2622,
 1012,
 102]

In [111]:
# One entry per token, every entry set to 1 (the whole text is a single segment).
# NOTE(review): Hugging Face tokenizers emit 0 for single-sequence inputs — confirm
# that 1 is intended before relying on this list as token_type_ids.
segments_ids = [1] * len(tokenized_text)

In [114]:
import torch

# Convert both id lists to tensors and add a leading dimension of size 1.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensors = torch.tensor(segments_ids).unsqueeze(0)

In [115]:
from transformers import BertModel

In [116]:
# Load the pretrained 'bert-base-uncased' weights and ask the model to return
# the hidden states of all layers.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Switch to evaluation mode: the notebook only runs feed-forward passes.
model.eval()

Downloading (…)"pytorch_model.bin";: 100%|██████████| 440M/440M [00:05<00:00, 77.3MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [117]:
# Forward pass without gradient tracking.
with torch.no_grad():

    # Bind the inputs by keyword: the second positional parameter of
    # BertModel.forward is `attention_mask`, not `token_type_ids`.
    # `segments_tensors` is all ones, i.e. a mask that lets every token be
    # attended to; `token_type_ids` is left at its default.
    outputs = model(input_ids=tokens_tensor, attention_mask=segments_tensors)

    # Evaluating the model returns a different number of objects depending on
    # how it was configured in the `from_pretrained` call. Because
    # `output_hidden_states = True` was set there, the third item holds the
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/docs/transformers/model_doc/bert
    hidden_states = outputs[2]

In [118]:
# Number of hidden-state tensors returned by the model (13 in the recorded run).
len(hidden_states)

13

In [119]:
# Number of token ids in the example sequence (99 in the recorded run).
len(indexed_tokens)

99

In [120]:
# Combine the per-layer hidden states into a single tensor; stacking adds a
# new leading dimension that indexes the layers.
token_embeddings = torch.stack(tuple(hidden_states))

token_embeddings.shape

torch.Size([13, 1, 99, 768])

In [121]:
# Drop dimension 1 (the batch dimension, of size 1 here).
token_embeddings = token_embeddings.squeeze(dim=1)

token_embeddings.shape

torch.Size([13, 99, 768])

In [124]:
# Sentence vector: the mean over all token vectors of the second-to-last layer.
# Stacked, `hidden_states` is [13 x 1 x 99 x 768] for this example.
second_to_last_layer = hidden_states[-2]

# Index 0 picks the only sequence in the batch: `token_vecs` is [99 x 768].
token_vecs = second_to_last_layer[0]

# Average the token vectors into a single 768-dimensional vector.
sentence_embedding = token_vecs.mean(dim=0)
sentence_embedding.shape

torch.Size([768])