In [1]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel

# Define the text complexity scoring model
class TextComplexityScoringModel(nn.Module):
    def __init__(self, distilbert_model_name, hidden_size, finetune_encoder = True):
        super(TextComplexityScoringModel, self).__init__()

        self.finetune_encoder = finetune_encoder
        self.distilbert = DistilBertModel.from_pretrained(distilbert_model_name)
        self.custom_layer = CustomTextComplexityLayer()

    def forward(self, input_ids, attention_mask):
        # DistilBERT forward pass
        if self.finetune_encoder:
            #outputs = self.distilbert(input_ids[:,0,:], attention_mask=attention_mask[:, 0, :], output_hidden_states=True)

            outputs = self.distilbert(input_ids, attention_mask=attention_mask, output_hidden_states=True)

        else:
            with torch.no_grad():
                #outputs = self.distilbert(input_ids[:, 0, :], attention_mask=attention_mask[:, 0, :], output_hidden_states=True)

                outputs = self.distilbert(input_ids, attention_mask=attention_mask, output_hidden_states=True)

        contextual_embeddings = outputs.last_hidden_state[:, -1, :]

        # Custom text complexity scoring layer forward pass
        complexity_score = self.custom_layer(contextual_embeddings)
        
        return complexity_score

class CustomTextComplexityLayer(nn.Module):
    def __init__(self):
        super(CustomTextComplexityLayer, self).__init__()
        self.fc1 = nn.Linear(768, 256)
        self.batch_norm1 = nn.BatchNorm1d(256)  # Batch Normalization
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.batch_norm1(x)  # Batch Normalization
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

# # Example text data
# text = "Data science defines the intersectionality between computer science, statistics, and domain expertise."

# DistilBERT tokenizer


# tokens = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True, padding='max_length', return_tensors='pt')

# # Tokenize and prepare input
# tokens = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True, padding='max_length', return_tensors='pt') #parameters can be changed
# attention_mask = (tokens != 0).float()  #attention mask

# Initialize the text complexity scoring model
model = TextComplexityScoringModel('distilbert-base-uncased', hidden_size= 768)  # Hidden size matches DistilBERT

# # Forward pass through the model to obtain complexity score
# complexity_score = model(tokens, attention_mask)

# # Print the complexity score
# print(complexity_score)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def get_parts_of_text(content):
    words = content.split()  # Splitting the content into words
    
    # First 50 words
    x = " ".join(words[:50])
    
    # Middle 300 words
    middle_index = len(words) // 2  # Integer division to get middle index
    y_start = max(0, middle_index - 150)  # Ensure index is not negative
    y_end = middle_index + 150
    y = " ".join(words[y_start:y_end])
    
    # Last 50 words
    z = " ".join(words[-50:])
    
    return x, y, z

In [1]:
import pandas as pd

In [5]:
#test
#
#
#

#normalized values, reset index
normalized_big_df = pd.read_csv("normalized_test_data.csv")

In [2]:
small_df1 = pd.read_csv("1_6thfilteredscores.csv")
small_df2 = pd.read_csv("ordered_part_4.csv")

In [3]:
big_df = pd.concat([small_df1, small_df2])

In [4]:
condition = big_df['GPTeval'] < 0
big_df.loc[condition, 'GPTeval'] = (big_df.loc[condition, 'GPTeval'] + 1) / 2
condition = big_df['GPTeval'] > 1
big_df.loc[condition, 'GPTeval'] = big_df.loc[condition, 'GPTeval'] * 0.1

# big_df.to_csv("big_df1.csv")

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch import FloatTensor
import nltk
from nltk import word_tokenize

In [7]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")  # Use the first available GPU (index 0)
else:
    device = torch.device("cpu")  # Use CPU if no GPU is available

print(f"Using device: {device}")

Using device: cuda:0


In [8]:
class CustomDataset(Dataset):
    def __init__(self,dataframe):
        self.dataframe = dataframe
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.max_len = 128
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        row = self.dataframe.iloc[index]

        text = row["Text"]
        domain = row["Domain"]
        label = row["GPTeval"]

        first_50_words, middle_300_words, last_50_words = get_parts_of_text(text)

        test_example = f"""
        Domain of Content:{domain}
        Content Length:{len(text.split())}

        Text Snippets:
        First 50 words of content: 
        {first_50_words}
        Middle 300 words of content:
        {middle_300_words}
        Last 50 words of content:
        {last_50_words}
        """

        text_inputs = self.tokenizer.encode_plus(
            text=test_example,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        ids = text_inputs["input_ids"]
        mask = text_inputs["attention_mask"]

        labels_tensor = torch.FloatTensor([label])

        return {
                    "ids": ids,
                    "masks": mask,
                    "labels": labels_tensor,
                }
        


In [9]:
train_df, val_df = train_test_split(normalized_big_df, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [10]:
model = model.to(device)  # Move model to GPU

# Define loss function and optimizer
criterion = nn.MSELoss()
#optimizer = optim.SGD(model.parameters(), lr = 0.01, weight_decay=1e-4) #potentially use this
optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay=1e-4)

In [11]:
num_epochs = 20
for epoch in range(num_epochs):
    optimizer.step()
    print(epoch)
    running_loss = 0.0
    for i, batch_data_dict in enumerate(train_loader, 0):
        batch_ids = batch_data_dict["ids"].to(device)
        batch_masks = batch_data_dict["masks"].to(device)
        batch_labels = batch_data_dict["labels"].to(device)
        predictions = model(batch_ids, batch_masks)
        loss = criterion(predictions, batch_labels)
        optimizer.zero_grad()
        if not torch.isnan(loss):
            loss.backward()
        print(model.distilbert.embeddings.word_embeddings.weight.grad)
        running_loss += loss.item()
        if i % 100 == 99:  # Print average loss every 100 mini-batches
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}")
            running_loss = 0.0

    # Step the scheduler based on the validation loss
    val_loss = 0.0
    with torch.no_grad():
        for batch_data_dict in val_loader:
            batch_ids = batch_data_dict["ids"].to(device)
            batch_masks = batch_data_dict["masks"].to(device)
            batch_labels = batch_data_dict["labels"].to(device)
            outputs = model(batch_ids, batch_masks)
            loss = criterion(outputs, batch_labels)
            val_loss += loss.item()
        print(f"[{epoch + 1}, {i + 1}] loss: {val_loss / len(val_loader):.3f}")     
print("Finished Training")

0


ValueError: too many values to unpack (expected 3)

In [44]:
print(big_df['GPTeval'].isna().sum())

0


In [5]:
big_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Domain,Title,URL,Text,GPTeval
0,1.0,0,gyandhan,What Does STEM Stand for in Education & School?,https://www.gyandhan.com/blogs/stem-education-...,STEM Education System: A Complete Guide | Gyan...,0.857
1,2.0,0,invent,What Does STEM Stand for in Education & School?,https://www.invent.org/blog/trends-stem/value-...,The Benefits of STEM Education for Children Si...,0.68
2,4.0,0,blog.teachmint,What Does STEM Stand for in Education & School?,https://blog.teachmint.com/stem-education/,What is STEM Education? All You Need to Know |...,0.68
3,5.0,0,liysf.org,What Does STEM Stand for in Education & School?,https://www.liysf.org.uk/blog/what-is-stem-edu...,What Is Stem Education? A Beginner's Guide - L...,0.57
4,6.0,0,education.gov,What Does STEM Stand for in Education & School?,https://www.education.gov.au/australian-curric...,Introductory material - What is STEM? - Depart...,0.68


In [6]:
big_df = big_df.dropna(subset=['GPTeval'])

In [7]:
big_df.loc[(big_df['GPTeval'] > 1) & (big_df['GPTeval'] < 10), 'GPTeval'] = big_df['GPTeval'] / 10

In [8]:
big_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Domain,Title,URL,Text,GPTeval
0,1.0,0,gyandhan,What Does STEM Stand for in Education & School?,https://www.gyandhan.com/blogs/stem-education-...,STEM Education System: A Complete Guide | Gyan...,0.8570
1,2.0,0,invent,What Does STEM Stand for in Education & School?,https://www.invent.org/blog/trends-stem/value-...,The Benefits of STEM Education for Children Si...,0.6800
2,4.0,0,blog.teachmint,What Does STEM Stand for in Education & School?,https://blog.teachmint.com/stem-education/,What is STEM Education? All You Need to Know |...,0.6800
3,5.0,0,liysf.org,What Does STEM Stand for in Education & School?,https://www.liysf.org.uk/blog/what-is-stem-edu...,What Is Stem Education? A Beginner's Guide - L...,0.5700
4,6.0,0,education.gov,What Does STEM Stand for in Education & School?,https://www.education.gov.au/australian-curric...,Introductory material - What is STEM? - Depart...,0.6800
...,...,...,...,...,...,...,...
1334,,0,turbofuture,What Is The Purpose Of The Central Processing ...,https://turbofuture.com/computers/What-are-the...,What Are the Main Functions of a CPU? - TurboF...,0.7520
1335,,0,totalphase,What Is The Purpose Of The Central Processing ...,https://www.totalphase.com/blog/2022/08/what-i...,What is a CPU and What Does it Do? - Total Pha...,0.8750
1336,,0,learnlearn,What Is The Purpose Of The Central Processing ...,https://learnlearn.uk/alevelcs/the-cpu/,The CPU - A Level Computer Science Navigation ...,0.8370
1337,,0,totalphase,What Is The Purpose Of The Central Processing ...,https://www.totalphase.com/blog/2022/08/what-i...,What is a CPU and What Does it Do? - Total Pha...,0.8750


In [9]:
# Assuming df is your original DataFrame
normalized_big_df = big_df.copy()  # Make a copy of the original DataFrame

# Normalize the "GPTeval" column in normalized_big_df
min_value = normalized_big_df['GPTeval'].min()
max_value = normalized_big_df['GPTeval'].max()
normalized_big_df['GPTeval'] = (normalized_big_df['GPTeval'] - min_value) / (max_value - min_value)

# Drop the "Unnamed: 0.1" and "Unnamed: 0" columns
normalized_big_df = normalized_big_df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

# Reset the index to start from 0
normalized_big_df.reset_index(drop=True, inplace=True)

normalized_big_df.to_csv("normalized_test_data.csv")

In [10]:
normalized_big_df

Unnamed: 0,Domain,Title,URL,Text,GPTeval
0,gyandhan,What Does STEM Stand for in Education & School?,https://www.gyandhan.com/blogs/stem-education-...,STEM Education System: A Complete Guide | Gyan...,0.8570
1,invent,What Does STEM Stand for in Education & School?,https://www.invent.org/blog/trends-stem/value-...,The Benefits of STEM Education for Children Si...,0.6800
2,blog.teachmint,What Does STEM Stand for in Education & School?,https://blog.teachmint.com/stem-education/,What is STEM Education? All You Need to Know |...,0.6800
3,liysf.org,What Does STEM Stand for in Education & School?,https://www.liysf.org.uk/blog/what-is-stem-edu...,What Is Stem Education? A Beginner's Guide - L...,0.5700
4,education.gov,What Does STEM Stand for in Education & School?,https://www.education.gov.au/australian-curric...,Introductory material - What is STEM? - Depart...,0.6800
...,...,...,...,...,...
2051,turbofuture,What Is The Purpose Of The Central Processing ...,https://turbofuture.com/computers/What-are-the...,What Are the Main Functions of a CPU? - TurboF...,0.7520
2052,totalphase,What Is The Purpose Of The Central Processing ...,https://www.totalphase.com/blog/2022/08/what-i...,What is a CPU and What Does it Do? - Total Pha...,0.8750
2053,learnlearn,What Is The Purpose Of The Central Processing ...,https://learnlearn.uk/alevelcs/the-cpu/,The CPU - A Level Computer Science Navigation ...,0.8370
2054,totalphase,What Is The Purpose Of The Central Processing ...,https://www.totalphase.com/blog/2022/08/what-i...,What is a CPU and What Does it Do? - Total Pha...,0.8750


In [None]:
big_df

In [11]:
normalized_big_df["GPTeval"][130]

0.62

In [95]:
normalized_big_df.to_csv("normalized_test_data.csv")

In [1]:
import pandas as pd

In [2]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel

# Define the text complexity scoring model
class TextComplexityScoringModel(nn.Module):
    def __init__(self, distilbert_model_name, hidden_size, finetune_encoder = True):
        super(TextComplexityScoringModel, self).__init__()

        self.finetune_encoder = finetune_encoder
        self.distilbert = DistilBertModel.from_pretrained(distilbert_model_name)
        self.custom_layer = CustomTextComplexityLayer()
        
        if self.finetune_encoder:
            for param in self.distilbert.parameters():
                param.requires_grad = True
        else:
            for param in self.distilbert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        # DistilBERT forward pass
        if self.finetune_encoder:
            #outputs = self.distilbert(input_ids[:,0,:], attention_mask=attention_mask[:, 0, :], output_hidden_states=True)
  
        
            
            outputs = self.distilbert(input_ids, attention_mask=attention_mask, output_hidden_states=True)

            
        else:
            with torch.no_grad():
                #outputs = self.distilbert(input_ids[:, 0, :], attention_mask=attention_mask[:, 0, :], output_hidden_states=True)
       
                outputs = self.distilbert(input_ids, attention_mask=attention_mask, output_hidden_states=True)

                
                
        contextual_embeddings = outputs.last_hidden_state[:, -1, :]

        # Custom text complexity scoring layer forward pass
        complexity_score = self.custom_layer(contextual_embeddings)
        
        
        
        return complexity_score

class CustomTextComplexityLayer(nn.Module):
    def __init__(self):
        super(CustomTextComplexityLayer, self).__init__()
        self.fc1 = nn.Linear(768, 256)
        self.batch_norm1 = nn.BatchNorm1d(256)  # Batch Normalization
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.batch_norm1(x)  # Batch Normalization
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

# # Example text data
# text = "Data science defines the intersectionality between computer science, statistics, and domain expertise."

# DistilBERT tokenizer


# tokens = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True, padding='max_length', return_tensors='pt')

# # Tokenize and prepare input
# tokens = tokenizer.encode(text, add_special_tokens=True, max_length=128, truncation=True, padding='max_length', return_tensors='pt') #parameters can be changed
# attention_mask = (tokens != 0).float()  #attention mask

# Initialize the text complexity scoring model
model = TextComplexityScoringModel('distilbert-base-uncased', hidden_size= 768)  # Hidden size matches DistilBERT

# # Forward pass through the model to obtain complexity score
# complexity_score = model(tokens, attention_mask)

# # Print the complexity score
# print(complexity_score)










def get_parts_of_text(content):
    words = content.split()  # Splitting the content into words
    
    # First 50 words
    x = " ".join(words[:50])
    
    # Middle 300 words
    middle_index = len(words) // 2  # Integer division to get middle index
    y_start = max(0, middle_index - 150)  # Ensure index is not negative
    y_end = middle_index + 150
    y = " ".join(words[y_start:y_end])
    
    # Last 50 words
    z = " ".join(words[-50:])
    
    return x, y, z



class CustomDataset(Dataset):
    def __init__(self,dataframe):
        self.dataframe = dataframe
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.max_len = 128
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        row = self.dataframe.iloc[index]

        text = row["Text"]
        domain = row["Domain"]
        label = row["GPTeval"]

        first_50_words, middle_300_words, last_50_words = get_parts_of_text(text)

        test_example = f"""
        Domain of Content:{domain}
        Content Length:{len(text.split())}

        Text Snippets:
        First 50 words of content: 
        {first_50_words}
        Middle 300 words of content:
        {middle_300_words}
        Last 50 words of content:
        {last_50_words}
        """

        text_inputs = self.tokenizer.encode_plus(
            text= test_example,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        ids = text_inputs["input_ids"]
        mask = text_inputs["attention_mask"]

        labels_tensor = torch.FloatTensor([label])

        return {
                    "ids": ids,
                    "masks": mask,
                    "labels": labels_tensor,
                }
        
model = model.to(device)  # Move model to GPU

# # Define loss function and optimizer
# criterion = nn.MSELoss()
# #optimizer = optim.SGD(model.parameters(), lr = 0.01, weight_decay=1e-4) #potentially use this
# optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay=1e-4)

# model.train()
# for epoch in range(num_epochs):
#     print("Epoch:", epoch)
#     running_loss = 0.0
    
#     for i, batch_data_dict in enumerate(train_loader, 0):
        
#         # Zero the parameter gradients
#         optimizer.zero_grad()
        
#         # Get the inputs and move them to the device
#         batch_ids = batch_data_dict["ids"].squeeze(1).to(device)  # Remove dimension of size 1

#         batch_masks = batch_data_dict["masks"].squeeze(1).to(device)  # Remove dimension of size 1
#         batch_labels = batch_data_dict["labels"].to(device)
        
#         print(batch_ids.shape, batch_masks.shape, batch_labels.shape)  # Debug Step 1

#         # Forward pass
#         predictions = model(batch_ids, batch_masks)
        
#         print(predictions.shape, predictions)  # Debug Step 2

#         # Compute the loss
#         loss = criterion(predictions, batch_labels)
        
#         print(loss.item())  # Debug Step 3

#         # Backward pass and optimization
#         if not torch.isnan(loss):
#             loss.backward()
#             if any(param.grad is None for param in model.parameters()):
#                 print("Gradients not updated for some parameters!")
            
#             optimizer.step()

#         print(model.distilbert.embeddings.word_embeddings.weight.grad)  # Debug Step 4
        
#         # Print statistics
#         running_loss += loss.item()
#         if i % 100 == 99:  # Print every 100 mini-batches
#             print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}")
#             running_loss = 0.0

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'Dataset' is not defined

In [None]:
model.train()
for epoch in range(num_epochs):
    print("Epoch:", epoch)
    running_loss = 0.0
    
    for i, batch_data_dict in enumerate(train_loader, 0):
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Get the inputs and move them to the device
        batch_ids = batch_data_dict["ids"].squeeze(1).to(device)  # Remove dimension of size 1

        batch_masks = batch_data_dict["masks"].squeeze(1).to(device)  # Remove dimension of size 1
        batch_labels = batch_data_dict["labels"].to(device)
        
        print(batch_ids.shape, batch_masks.shape, batch_labels.shape)  # Debug Step 1

        # Forward pass
        predictions = model(batch_ids, batch_masks)
        
        print(predictions.shape, predictions)  # Debug Step 2

        # Compute the loss
        loss = criterion(predictions, batch_labels)
        
        print(loss.item())  # Debug Step 3

        # Backward pass and optimization
        if not torch.isnan(loss):
            loss.backward()
            optimizer.step()

        print(model.distilbert.embeddings.word_embeddings.weight.grad)  # Debug Step 4
        
        # Print statistics
        running_loss += loss.item()
        if i % 100 == 99:  # Print every 100 mini-batches
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}")
            running_loss = 0.0

In [136]:
print(batch_ids.shape, batch_masks.shape, batch_labels.shape)


torch.Size([32, 1, 128]) torch.Size([32, 1, 128]) torch.Size([32, 1])


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class SimpleTextComplexityModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(SimpleTextComplexityModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.fc1 = nn.Linear(embed_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x, dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return {
            'text': torch.tensor(self.texts[idx], dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Initialize model, loss, and optimizer
vocab_size = 5000  # Example vocab size
embed_size = 100  # Example embedding size

model = SimpleTextComplexityModel(vocab_size, embed_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example data
texts = [[1, 2, 3], [2, 3, 4], [4, 5, 6]]  # Replace with actual tokenized texts
labels = [0.2, 0.5, 0.8]  # Replace with actual complexity labels

# Create DataLoader
dataset = CustomDataset(texts, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for i, batch in enumerate(dataloader):
        texts = batch['text']
        labels = batch['label'].view(-1, 1)

        # Forward pass
        outputs = model(texts)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}], Loss: {loss.item():.4f}')


In [1]:
import pandas as pd

In [2]:
normalized_big_df = pd.read_csv("normalized_test_data.csv")

In [3]:
# Assuming normalized_big_df contains your data
# Extract the texts and complexity labels
texts = normalized_big_df['Text'].tolist()
labels = normalized_big_df['GPTeval'].tolist()

# # Initialize a list to store the transformed data
# transformed_data = []

# # Iterate through the texts and labels
# for text, label in zip(texts, labels):
#     # Split the text into tokens
#     tokens = text.split()
    
#     # Calculate the lengths of the beginning, middle, and end sections
#     total_tokens = len(tokens)
#     beg_length = int(total_tokens * label)  # Length of the "beginning"
#     mid_length = int(total_tokens * (1 - label) / 2)  # Length of the "middle"
#     end_length = total_tokens - beg_length - mid_length  # Length of the "end"
    
#     # Create the "beginning," "middle," and "end" sections
#     beg_tokens = tokens[:beg_length]
#     mid_tokens = tokens[beg_length: beg_length + mid_length]
#     end_tokens = tokens[-end_length:]
    
#     # Append the sections to the transformed_data list
#     transformed_data.append([beg_tokens, mid_tokens, end_tokens])



In [4]:
domains = normalized_big_df['Domain'].tolist()

In [5]:
def get_parts_of_text(content):
    words = content.split()  # Splitting the content into words
    
    # First 50 words
    x = " ".join(words[:50])
    
    # Middle 300 words
    middle_index = len(words) // 2  # Integer division to get middle index
    y_start = max(0, middle_index - 150)  # Ensure index is not negative
    y_end = middle_index + 150
    y = " ".join(words[y_start:y_end])
    
    # Last 50 words
    z = " ".join(words[-50:])
    
    return x, y, z


In [6]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Initialize DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1).to(device)

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        content_len = len(text.split())

        first_50_words, middle_300_words, last_50_words = get_parts_of_text(text)

        test_example = f"""
        Total Content Length:{content_len}

        Text Snippets:
        First 50 words of content:
        {first_50_words}
        Middle 300 words of content:
        {middle_300_words}
        Last 50 words of content:
        {last_50_words}
        """

        encoding = self.tokenizer.encode_plus(
            test_example,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'text': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

# Splitting data
train_texts, temp_texts, train_labels, temp_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

# DataLoader
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_len=512)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_len=512)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Loss & Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = torch.nn.MSELoss()

# Training Loop with Early Stopping
n_epochs = 10
best_val_loss = float('inf')
counter = 0

for epoch in range(n_epochs):
    model.train()
    for i, batch in enumerate(train_dataloader):
        input_ids = batch['text'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].view(-1, 1).to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)[0]
        loss = loss_fn(outputs.to(device), labels)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        print(f"Epoch: {epoch}, Loss:  {loss.item()}")
    
    # Validation Loss
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(val_dataloader):
            input_ids = batch['text'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].view(-1, 1).to(device)
            outputs = model(input_ids, attention_mask=attention_mask)[0]
            loss = loss_fn(outputs.to(device), labels)
            val_loss += loss.item()
    
    val_loss /= len(val_dataloader)
    print(f"Validation Loss after Epoch {epoch}: {val_loss}")
    
    # Check for Early Stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
    else:
        counter += 1


cuda:0


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

Epoch: 0, Loss:  0.5176093578338623
Epoch: 0, Loss:  0.37415120005607605
Epoch: 0, Loss:  0.2319658249616623
Epoch: 0, Loss:  0.11264555156230927
Epoch: 0, Loss:  0.04937262460589409
Epoch: 0, Loss:  0.053924813866615295
Epoch: 0, Loss:  0.07074303925037384
Epoch: 0, Loss:  0.0738525241613388
Epoch: 0, Loss:  0.06737595796585083
Epoch: 0, Loss:  0.039350394159555435
Epoch: 0, Loss:  0.061157990247011185
Epoch: 0, Loss:  0.060907959938049316
Epoch: 0, Loss:  0.08023978024721146
Epoch: 0, Loss:  0.05687671899795532
Epoch: 0, Loss:  0.05101495981216431
Epoch: 0, Loss:  0.053335197269916534
Epoch: 0, Loss:  0.05308596044778824
Epoch: 0, Loss:  0.06703256070613861
Epoch: 0, Loss:  0.03021511435508728
Epoch: 0, Loss:  0.05303748697042465
Epoch: 0, Loss:  0.041867610067129135
Epoch: 0, Loss:  0.059417903423309326
Epoch: 0, Loss:  0.035764455795288086
Validation Loss after Epoch 0: 0.04216879084706306
Epoch: 1, Loss:  0.04588444158434868
Epoch: 1, Loss:  0.050002146512269974
Epoch: 1, Loss:  0

In [7]:
model_save_path = "C:/Users/William Zhang/OneDrive/Desktop/Search_Sense_Model"
model.save_pretrained(model_save_path)

In [15]:
# Define a function to make predictions
def make_prediction(text, model, tokenizer):

    content_len = len(text.split())
    first_50_words, middle_300_words, last_50_words = get_parts_of_text(text)

    text_final_input = f"""
    Total Content Length:{content_len}

    Text Snippets:
    First 50 words of content:
    {first_50_words}
    Middle 300 words of content:
    {middle_300_words}
    Last 50 words of content:
    {last_50_words}
    """
    encoding = tokenizer.encode_plus(
        text_final_input,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    # Set model to evaluation mode and make prediction
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)[0]
    
    # Convert output to label
    predicted_score = output.item()
    
    return predicted_score

In [9]:
import WebScrape

[nltk_data] Downloading package punkt to C:\Users\William
[nltk_data]     Zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\William
[nltk_data]     Zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\William
[nltk_data]     Zhang\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
hi = WebScrape.get_df()

Scraping...:   0%|          | 0/1 [00:00<?, ?it/s]

Error: Failed to retrieve content for:Error: Failed to retrieve content for: https://www.seldon.io/supervised-vs-unsupervised-learning-explained#:~:text=Supervised%20machine%20learning%20is%20generally,the%20need%20for%20labelled%20data.
Exception: HTTPSConnectionPool(host='www.seldon.io', port=443): Max retries exceeded with url: /supervised-vs-unsupervised-learning-explained (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002AF03182CE0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
 https://www.seldon.io/supervised-vs-unsupervised-learning-explained#:~:text=Supervised%20machine%20learning%20is%20generally,the%20need%20for%20labelled%20data.
Exception: HTTPSConnectionPool(host='www.seldon.io', port=443): Max retries exceeded with url: /supervised-vs-unsupervised-learning-explained (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002AF03183940>: Failed to establish a new connection: [Errno 1

Scraping...: 100%|██████████| 1/1 [00:08<00:00,  8.74s/it]


Grabbed search results
34


In [11]:
preprocess_df = hi.copy()

In [12]:
#remove yt videos
preprocess_df = preprocess_df[~preprocess_df['Domain'].str.contains('youtube')]
#remove null or empty values
preprocess_df.dropna(subset=["Text"], inplace=True)
#remove entries with less than 25 words
preprocess_df = preprocess_df[preprocess_df["Text"].apply(lambda x: len(x.split()) >= 25)]

In [19]:
preprocess_df["URL"][18]

'https://www.codecademy.com/article/machine-learning-supervised-vs-unsupervised'

In [13]:
preprocess_df["URL"][2]

'https://www.alteryx.com/glossary/supervised-vs-unsupervised-learning'

In [20]:
score = make_prediction(preprocess_df["Text"][18], model, tokenizer)

In [21]:
score

0.348219096660614

In [212]:
preprocess_df["URL"][29]

'https://www.linkedin.com/advice/0/what-differences-similarities-between-supervised'

In [213]:
score = make_prediction(preprocess_df["Text"][29], model, tokenizer)

In [214]:
score

0.9222419261932373

In [217]:
preprocess_df

Unnamed: 0,Domain,Title,URL,Text
1,seldon,Supervised and Unsupervised Learning compared ...,https://www.seldon.io/supervised-vs-unsupervis...,Supervised vs Unsupervised Learning Explained ...
2,seldon,Supervised and Unsupervised Learning compared ...,https://www.seldon.io/supervised-vs-unsupervis...,Supervised vs Unsupervised Learning Explained ...
3,geeksforgeeks,Supervised and Unsupervised Learning compared ...,https://www.geeksforgeeks.org/supervised-unsup...,Supervised and Unsupervised learning - Geeksfo...
4,simplilearn,Supervised and Unsupervised Learning compared ...,https://www.simplilearn.com/tutorials/machine-...,Supervised and Unsupervised Learning in (Machi...
6,alteryx,Supervised and Unsupervised Learning compared ...,https://www.alteryx.com/glossary/supervised-vs...,Supervised vs. Unsupervised Learning; Which Is...
7,v7labs,Supervised and Unsupervised Learning compared ...,https://www.v7labs.com/blog/supervised-vs-unsu...,Supervised vs. Unsupervised Learning [Differen...
8,cs.stackexchange,Supervised and Unsupervised Learning compared ...,https://cs.stackexchange.com/questions/2907/wh...,data mining - What exactly is the difference b...
10,techtarget,Supervised and Unsupervised Learning compared ...,https://www.techtarget.com/searchenterpriseai/...,Comparing Supervised vs. Unsupervised Learning...
11,towardsdatascience,Supervised and Unsupervised Learning compared ...,https://towardsdatascience.com/supervised-vs-u...,Supervised vs. Unsupervised Learning | by Devi...
16,tutorialforbeginner,Supervised and Unsupervised Learning compared ...,https://tutorialforbeginner.com/supervised-vs-...,Supervised vs Unsupervised Machine Learning | ...
