In [28]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel

In [26]:
# Step 1: Tokenize the input sequence and pad it to a fixed length.

# Example text to be tokenized and padded.
text = "Data science defines the intersectionality between computer science, statistics, and domain expertise."

# DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode the text into a tensor of token ids.
#   add_special_tokens: adds the start-of-sequence and end-of-sequence tokens,
#                       which DistilBERT needs
#   max_length / truncation: cut inputs that are longer than 128 tokens
#   padding='max_length': pad shorter inputs up to 128 tokens
#   return_tensors='pt': return PyTorch tensors
tokens = tokenizer.encode(
    text,
    add_special_tokens=True,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

# Show the tokenized and padded input ids.
tokens

tensor([[  101,  2951,  2671, 11859,  1996,  6840, 23732,  2090,  3274,  2671,
          1010,  6747,  1010,  1998,  5884, 11532,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [33]:
# Step 2: Run the DistilBERT encoder to obtain contextual embeddings for the sequence.

# Pretrained DistilBERT model
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Forward pass without gradient tracking. Padding positions are masked out by
# comparing against the tokenizer's own pad id instead of a hard-coded 0, so the
# mask stays correct if a tokenizer with a different pad id is swapped in.
with torch.no_grad():
    outputs = model(
        tokens,
        attention_mask=(tokens != tokenizer.pad_token_id).float(),
    )

# One embedding vector per token position of the padded input.
contextual_embeddings = outputs.last_hidden_state
contextual_embeddings.shape

torch.Size([1, 128, 768])

In [29]:
class CustomTextComplexityLayer(nn.Module):
    """Two-layer feed-forward head whose output is squashed by a sigmoid.

    The input is projected to ``hidden_size`` features, passed through ReLU,
    reduced to a single feature and finally passed through a sigmoid so the
    result lies between 0 and 1.

    Args:
        input_size: Size of the last dimension of the input tensor.
        hidden_size: Number of features produced by the first linear layer.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()

        # Submodules, listed in the order in which forward() applies them.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=1)  # single output value
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(fc2(relu(fc1(x))))``.

        The linear layers act on the last dimension of ``x``; the result keeps
        the leading dimensions of ``x`` and has a last dimension of size 1.
        """
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

In [None]:
#Combining previous steps to define the architecture
class TextComplexityScoringModel(nn.Module):
    """DistilBERT encoder followed by the custom text complexity scoring head.

    Args:
        distilbert_model_name: Name or path handed to
            ``DistilBertModel.from_pretrained``.
        hidden_size: Width of the scoring head's hidden layer.
    """

    def __init__(self, distilbert_model_name, hidden_size):
        super().__init__()

        # DistilBERT encoding layer
        self.distilbert = DistilBertModel.from_pretrained(distilbert_model_name)

        # Custom text complexity scoring layer. Its input width has to equal
        # the encoder's embedding width, so take that from the encoder config
        # instead of reusing ``hidden_size``; otherwise any ``hidden_size``
        # other than the encoder width fails with a shape mismatch in forward().
        self.custom_layer = CustomTextComplexityLayer(
            input_size=self.distilbert.config.dim,
            hidden_size=hidden_size,
        )

    def forward(self, input_ids, attention_mask):
        """Score every token position of the input.

        Args:
            input_ids: Passed to the encoder as its first argument.
            attention_mask: Forwarded to the encoder as ``attention_mask``.

        Returns:
            The scoring head applied to the encoder's ``last_hidden_state``,
            i.e. one value per token position (padding positions included),
            not one value per sequence.
        """
        # The encoder runs without gradient tracking, so no gradients flow
        # into DistilBERT from this forward pass.
        with torch.no_grad():
            outputs = self.distilbert(input_ids, attention_mask=attention_mask)

        contextual_embeddings = outputs.last_hidden_state

        # NOTE(review): if a single score per text is intended, the embeddings
        # need to be pooled (e.g. a masked mean) before the head. Left as-is
        # here so the returned shape does not change for existing callers.
        return self.custom_layer(contextual_embeddings)

In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel

# Define the text complexity scoring model
class TextComplexityScoringModel(nn.Module):
    """DistilBERT encoder followed by the custom text complexity scoring head.

    Args:
        distilbert_model_name: Name or path handed to
            ``DistilBertModel.from_pretrained``.
        hidden_size: Width of the scoring head's hidden layer.
    """

    def __init__(self, distilbert_model_name, hidden_size):
        super().__init__()

        self.distilbert = DistilBertModel.from_pretrained(distilbert_model_name)

        # The head's input width has to equal the encoder's embedding width,
        # so take it from the encoder config instead of reusing
        # ``hidden_size``; otherwise any ``hidden_size`` other than the encoder
        # width fails with a shape mismatch in forward().
        self.custom_layer = CustomTextComplexityLayer(
            input_size=self.distilbert.config.dim,
            hidden_size=hidden_size,
        )

    def forward(self, input_ids, attention_mask):
        """Score every token position of the input.

        Args:
            input_ids: Passed to the encoder as its first argument.
            attention_mask: Forwarded to the encoder as ``attention_mask``.

        Returns:
            The scoring head applied to the encoder's ``last_hidden_state``,
            i.e. one value per token position (padding positions included),
            not one value per sequence.
        """
        # DistilBERT forward pass, run without gradient tracking so no
        # gradients flow into the encoder.
        with torch.no_grad():
            outputs = self.distilbert(input_ids, attention_mask=attention_mask)

        contextual_embeddings = outputs.last_hidden_state

        # NOTE(review): if a single score per text is intended, the embeddings
        # need to be pooled (e.g. a masked mean) before the head. Left as-is
        # here so the returned shape does not change for existing callers.
        return self.custom_layer(contextual_embeddings)

# Define the custom text complexity scoring layer
class CustomTextComplexityLayer(nn.Module):
    """Feed-forward scoring head: Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_size: Size of the last dimension of the input tensor.
        hidden_size: Number of features produced by the first linear layer.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()

        # Layers, in the order forward() applies them.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=1)  # output is a single value
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply the head to ``x`` and return a value between 0 and 1.

        The linear layers act on the last dimension of ``x``; the result keeps
        the leading dimensions of ``x`` and has a last dimension of size 1.
        """
        activated = self.relu(self.fc1(x))
        logits = self.fc2(activated)
        return self.sigmoid(logits)

# Example text data
text = "Data science defines the intersectionality between computer science, statistics, and domain expertise."

# DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize and pad/truncate to a fixed length (these parameters can be changed).
tokens = tokenizer.encode(
    text,
    add_special_tokens=True,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

# Attention mask: 1.0 for real tokens, 0.0 for padding. The tokenizer's own pad
# id is used instead of a hard-coded 0 so the mask stays correct if a tokenizer
# with a different pad id is swapped in.
attention_mask = (tokens != tokenizer.pad_token_id).float()

# Initialize the text complexity scoring model
model = TextComplexityScoringModel('distilbert-base-uncased', hidden_size=768)  # Hidden size matches DistilBERT

# Forward pass through the model to obtain complexity score
complexity_score = model(tokens, attention_mask)

# Print the complexity score.
# NOTE(review): the scoring head is constructed just above and is not trained
# anywhere in the visible code, so these values come from randomly initialised
# weights and will differ between runs.
print(complexity_score)

In [55]:
import pandas as pd

# Load the source CSV into a DataFrame.
# NOTE(review): despite its name, `example_string` holds the whole table at this
# point, not a string; the following cell rebinds it to something else.
example_string = pd.read_csv("some_csv.csv")

In [56]:
# Take entry 2 of the "Text" column and split it on whitespace into a list of
# words (assumes that entry is a string; TODO confirm against the CSV).
# NOTE(review): this rebinds `example_string` from the loaded table to the word
# list, so this cell cannot be re-run without re-running the CSV-loading cell.
example_string = example_string["Text"][2].split()

In [58]:
# Opening and closing excerpts: the first and last 50 words of the document.
first_50_words = " ".join(example_string[:50])
last_50_words = " ".join(example_string[-50:])

# Middle excerpt: up to 150 words on either side of the midpoint, with both
# ends clamped to the bounds of the word list.
middle_start = max(len(example_string) // 2 - 150, 0)
middle_end = min(len(example_string) // 2 + 150, len(example_string))
middle_300_words = " ".join(example_string[middle_start:middle_end])

# Show the middle excerpt.
middle_300_words

'fields count as STEM. Some STEM definitions include the , such as , economics, and anthropology. Most sources, however, consider these separate categories. U.S. Immigration and Customs Enforcement maintains a , which includes the four basic subjects above, along with architecture, psychology, digital communication, and some pharmaceutical and social sciences. Notably, fields like , , and are excluded from this list. to STEM, only officially recognized by the U.S. government in 2019. Many also and should be considered STEM. ACT Inc. includes many health and medical fields in its , giving doctors, nurses, and dentists the designation of STEM professionals. What Is a STEM Major? A STEM major is any major in a recognized STEM field. Note that colleges may have different definitions of what areas of study constitute a STEM major. Most undergraduate STEM programs culminate in a , though others may lead to a bachelor of applied science, a bachelor of engineering, or a bachelor of architectur

In [59]:
# Display the excerpt built from the first 50 words for inspection.
first_50_words

"STEM jobs are in high demand but suffer from a lack of qualified candidates. STEM is necessary for growing the economy and staying globally competitive. You've likely heard the term STEM, but what does it stand for? STEM is an acronym for science, technology, engineering, and math. These four fields"

In [60]:
# Display the excerpt built from the last 50 words for inspection.
last_50_words

"up to management positions, while others to conduct research. The BLS has identified , though this is not an exhaustive list. The following table presents some of the most popular STEM careers, as well as each job's median salary and projected employment outlook. Salary & Job Outlook for Popular STEM"

In [62]:
# Combine the three excerpts into one test input. They are joined with a space
# because plain `+` concatenation fuses the last word of one excerpt with the
# first word of the next (e.g. "...four fields" + "fields count..." became
# "fieldsfields"), which corrupts the tokens at both seams.
test_example = ' '.join([first_50_words, middle_300_words, last_50_words])

In [65]:
# Tokenize and pad/truncate to a fixed length (these parameters can be changed).
tokens = tokenizer.encode(
    test_example,
    add_special_tokens=True,
    max_length=400,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

# Attention mask: 1.0 for real tokens, 0.0 for padding. The tokenizer's own pad
# id is used instead of a hard-coded 0 so the mask stays correct if a tokenizer
# with a different pad id is swapped in.
attention_mask = (tokens != tokenizer.pad_token_id).float()

# Initialize the text complexity scoring model
model = TextComplexityScoringModel('distilbert-base-uncased', hidden_size=768)  # Hidden size matches DistilBERT

# Forward pass through the model to obtain complexity score
complexity_score = model(tokens, attention_mask)

# Print the complexity score.
# NOTE(review): the scoring head is constructed just above and is not trained
# anywhere in the visible code, so these values come from randomly initialised
# weights (hence the cluster around 0.5) and will differ between runs.
print(complexity_score)

tensor([[[0.5135],
         [0.4757],
         [0.4970],
         [0.5060],
         [0.4983],
         [0.5006],
         [0.5242],
         [0.4954],
         [0.5154],
         [0.5056],
         [0.5309],
         [0.5080],
         [0.5118],
         [0.4981],
         [0.5021],
         [0.5196],
         [0.4784],
         [0.5045],
         [0.4920],
         [0.5028],
         [0.5062],
         [0.5147],
         [0.5317],
         [0.4984],
         [0.5144],
         [0.4984],
         [0.5125],
         [0.5210],
         [0.5059],
         [0.5128],
         [0.5124],
         [0.5015],
         [0.4689],
         [0.4921],
         [0.4982],
         [0.4774],
         [0.5049],
         [0.4947],
         [0.4945],
         [0.5184],
         [0.4901],
         [0.4911],
         [0.5006],
         [0.5087],
         [0.4869],
         [0.5017],
         [0.4830],
         [0.4780],
         [0.4998],
         [0.5149],
         [0.4728],
         [0.5022],
         [0.