In [17]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel

In [18]:
'''
Step 1: Padding and tokenizing input sequence
'''
#example text data to be tokenized and padded 
text = "Data science defines the intersectionality between computer science, statistics, and domain expertise." 

#DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

#tokenize
tokens = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True, padding='max_length', return_tensors='pt')
'''
add_special_tokens: adds special tokens start of sequence and end of sequence, which is needed for DistilBERT
return_tensors: returns pytorch tensors
'''

#print tokenized and padded input
tokens

tensor([[  101,  2951,  2671, 11859,  1996,  6840, 23732,  2090,  3274,  2671,
          1010,  6747,  1010,  1998,  5884, 11532,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [19]:
'''
Step 2: Implement DistilBERT encoding layer to obtain contextual embeddings for sequence
'''
#DistilBERT model
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

#forward pass
with torch.no_grad():
    outputs = model(tokens, attention_mask = (tokens != 0).float())
    
contextual_embeddings = outputs.last_hidden_state
contextual_embeddings.shape

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([1, 128, 768])

In [20]:
class CustomTextComplexityLayer(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CustomTextComplexityLayer, self).__init__()
        
        # Fully connected layers
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)  # Output is a single value

        # Sigmoid activation to constrain the output between 0 and 1
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Pass through fully connected layers with ReLU activation
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        
        # Apply sigmoid activation to get a value between 0 and 1
        x = self.sigmoid(x)
        
        return x

In [21]:
#Combining previous steps to define the architecture
class TextComplexityScoringModel(nn.Module):
    def __init__(self, distilbert_model_name, hidden_size):
        super(TextComplexityScoringModel, self).__init__()

        # DistilBERT encoding layer
        self.distilbert = DistilBertModel.from_pretrained(distilbert_model_name)

        # Custom text complexity scoring layer
        self.custom_layer = CustomTextComplexityLayer(input_size=hidden_size, hidden_size=hidden_size)

    def forward(self, input_ids, attention_mask):
        # DistilBERT forward pass
        with torch.no_grad():
            outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        
        contextual_embeddings = outputs.last_hidden_state

        # Custom text complexity scoring layer forward pass
        complexity_score = self.custom_layer(contextual_embeddings)
        
        return complexity_score

In [22]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel

# Define the text complexity scoring model
class TextComplexityScoringModel(nn.Module):
    def __init__(self, distilbert_model_name, hidden_size, finetune_encoder = True):
        super(TextComplexityScoringModel, self).__init__()

        self.finetune_encoder = finetune_encoder
        self.distilbert = DistilBertModel.from_pretrained(distilbert_model_name)
        self.custom_layer = CustomTextComplexityLayer(input_size=hidden_size, hidden_size=hidden_size)

    def forward(self, input_ids, attention_mask):
        # DistilBERT forward pass
        if self.finetune_encoder:
            outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        else:
            with torch.no_grad():
                outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        
        contextual_embeddings = outputs.last_hidden_state

        # Custom text complexity scoring layer forward pass
        complexity_score = self.custom_layer(contextual_embeddings)
        
        return complexity_score

# Define the custom text complexity scoring layer
class CustomTextComplexityLayer(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CustomTextComplexityLayer, self).__init__()
        
        #connect layers
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)  # Output is a single value, 2nd parameter

        #sigmoid function
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Pass through fully connected layers with ReLU activation
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        
        # Apply sigmoid activation to get a value between 0 and 1
        x = self.sigmoid(x)
        
        return x

# # Example text data
# text = "Data science defines the intersectionality between computer science, statistics, and domain expertise."

# DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# # Tokenize and prepare input
# tokens = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True, padding='max_length', return_tensors='pt') #parameters can be changed
# attention_mask = (tokens != 0).float()  #attention mask

# Initialize the text complexity scoring model
model = TextComplexityScoringModel('distilbert-base-uncased', hidden_size=768)  # Hidden size matches DistilBERT

# # Forward pass through the model to obtain complexity score
# complexity_score = model(tokens, attention_mask)

# # Print the complexity score
# print(complexity_score)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
example_string = example_string["Text"][2].split()

In [None]:
last_50_words = ' '.join(example_string[-50:])
first_50_words = ' '.join(example_string[:50])

#middle words
middle_start = max(0, len(example_string) // 2 - 150)
middle_end = min(len(example_string), len(example_string) // 2 + 150)

middle_300_words = ' '.join(example_string[middle_start:middle_end])

middle_300_words

'fields count as STEM. Some STEM definitions include the , such as , economics, and anthropology. Most sources, however, consider these separate categories. U.S. Immigration and Customs Enforcement maintains a , which includes the four basic subjects above, along with architecture, psychology, digital communication, and some pharmaceutical and social sciences. Notably, fields like , , and are excluded from this list. to STEM, only officially recognized by the U.S. government in 2019. Many also and should be considered STEM. ACT Inc. includes many health and medical fields in its , giving doctors, nurses, and dentists the designation of STEM professionals. What Is a STEM Major? A STEM major is any major in a recognized STEM field. Note that colleges may have different definitions of what areas of study constitute a STEM major. Most undergraduate STEM programs culminate in a , though others may lead to a bachelor of applied science, a bachelor of engineering, or a bachelor of architectur

In [None]:
first_50_words

"STEM jobs are in high demand but suffer from a lack of qualified candidates. STEM is necessary for growing the economy and staying globally competitive. You've likely heard the term STEM, but what does it stand for? STEM is an acronym for science, technology, engineering, and math. These four fields"

In [None]:
last_50_words

"up to management positions, while others to conduct research. The BLS has identified , though this is not an exhaustive list. The following table presents some of the most popular STEM careers, as well as each job's median salary and projected employment outlook. Salary & Job Outlook for Popular STEM"

In [23]:
def get_parts_of_text(content):
    words = content.split()  # Splitting the content into words
    
    # First 50 words
    x = " ".join(words[:50])
    
    # Middle 300 words
    middle_index = len(words) // 2  # Integer division to get middle index
    y_start = max(0, middle_index - 150)  # Ensure index is not negative
    y_end = middle_index + 150
    y = " ".join(words[y_start:y_end])
    
    # Last 50 words
    z = " ".join(words[-50:])
    
    return x, y, z

In [None]:
first_50_words, middle_300_words, last_50_words = get_parts_of_text(example_string) 
domain = example_string["domain"][2]
content_length = len(text.split())

In [None]:
test_example = f"""
General information:
Domain of Content:{domain}
Content Length:{content_length}

Text Snippets:
First 50 words of content: 
{first_50_words}
Middle 300 words of content:
{middle_300_words}
Last 50 words of content:
{last_50_words}
"""

In [6]:
# Tokenize and prepare input
tokens = tokenizer.encode(test_example, add_special_tokens=True, max_length=400, truncation=True, padding='max_length', return_tensors='pt') #parameters can be changed
attention_mask = (tokens != 0).float()  #attention mask

# Initialize the text complexity scoring model
model = TextComplexityScoringModel('distilbert-base-uncased', hidden_size=768)  # Hidden size matches DistilBERT

# Forward pass through the model to obtain complexity score
complexity_score = model(tokens, attention_mask)

# Print the complexity score
print(complexity_score)

NameError: name 'test_example' is not defined

In [24]:
import pandas as pd

In [25]:
small_df1 = pd.read_csv("1_6thfilteredscores.csv")
small_df2 = pd.read_csv("ordered_part_4.csv")

In [26]:
big_df = pd.concat([small_df1, small_df2])

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch import FloatTensor
from nltk import word_tokenize

In [28]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [29]:
class CustomDataset(Dataset):
    def __init__(self,dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        row = self.dataframe.iloc[index]

        text = row["Text"]
        domain = row["Domain"]
        label = row["GPTeval"]

        first_50_words, middle_300_words, last_50_words = get_parts_of_text(text)

        first_50_tokens = tokenizer.encode(first_50_words, add_special_tokens=False, max_length=50, truncation=True)
        middle_300_tokens = tokenizer.encode(middle_300_words, add_special_tokens=False, max_length=300, truncation=True)
        last_50_tokens = tokenizer.encode(last_50_words, add_special_tokens=False, max_length=50, truncation=True)

        domain_tokens = tokenizer.encode(f"Domain of Content: {domain}", add_special_tokens=False)
        length_tokens = tokenizer.encode(f"Content Length: {len(word_tokenize(text))}", add_special_tokens=False)

        all_tokens = ([tokenizer.cls_token_id] + 
                  domain_tokens + [tokenizer.sep_token_id] +
                  length_tokens + [tokenizer.sep_token_id] +
                  first_50_tokens + [tokenizer.sep_token_id] + 
                  middle_300_tokens + [tokenizer.sep_token_id] + 
                  last_50_tokens + [tokenizer.sep_token_id])
        
        max_len = 512
        padding_len = max_len - len(all_tokens)
        all_tokens = all_tokens + ([tokenizer.pad_token_id] * padding_len) 
        attention_mask = (tokens != 0).float()

        inputs_tensor = torch.tensor(all_tokens, dtype=torch.long)
        labels_tensor =torch.FloatTensor([label])

        return inputs_tensor, labels_tensor, attention_mask    

In [30]:
train_df, val_df = train_test_split(big_df, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

In [31]:
model = model.to(device)  # Move model to GPU

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

In [32]:
num_epochs = 20
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels, attention_mask) in enumerate(train_loader, 0):
        inputs, labels, attention_mask = inputs.to(device), labels.to(device), attention_mask.to(device)
        predictions = model(inputs, attention_mask)
        loss = criterion(predictions, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 99:  # Print average loss every 100 mini-batches
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}")
            running_loss = 0.0

    # Step the scheduler based on the validation loss
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
        print(f"[{epoch + 1}, {i + 1}] loss: {val_loss / len(val_loader):.3f}")     
print("Finished Training")

RuntimeError: shape '[256, 1, 1, 512]' is invalid for input of size 32768