In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
!pwd


/home/orolol


In [3]:
df = pd.read_csv('workspace/learningLabAgency/data/train.csv')
df.head()

# df = df.head(128)

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [4]:
# Clean the data, remove stop words, lowercase, remove punctuation, etc.
from nltk.corpus import stopwords
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from textblob import TextBlob

nltk.download('stopwords')

stopwords = set(stopwords.words('english'))

def clean_data(line):
    line = line.lower()
    line = line.translate(str.maketrans('', '', string.punctuation))
    line = ' '.join([word for word in line.split() if word not in stopwords])
    return line

df['full_text'] = df['full_text'].apply(clean_data)

vectorizer = TfidfVectorizer()
vectorizer.fit_transform(df['full_text'])

idf_values = vectorizer.idf_
word_idf_dict = dict(zip(vectorizer.get_feature_names_out(), idf_values))

def calculate_idf_feature(text, word_idf_dict):
    words = text.split()
    idf_values = [word_idf_dict.get(word, 0) for word in words]
    return np.mean(idf_values)

df['text_length'] = df['full_text'].apply(lambda x: len(x))

df['idf_feature'] = df['full_text'].apply(lambda x: calculate_idf_feature(x, word_idf_dict))

df['sentiment'] = df['full_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

df['unique_word_count'] = df['full_text'].apply(lambda x: len(set(x.split())))

most_common_word = pd.Series(' '.join(df['full_text']).split()).value_counts().idxmax()
df['most_common_word_freq'] = df['full_text'].apply(lambda x: x.split().count(most_common_word))


df.head()

[nltk_data] Downloading package stopwords to /home/orolol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,essay_id,full_text,score,text_length,idf_feature,sentiment,unique_word_count,most_common_word_freq
0,000d118,many people car live thing dont know use car a...,3,1641,4.238982,0.234616,177,0
1,000fe60,scientist nasa discussing face mars explaining...,3,882,3.550714,0.109615,89,3
2,001ab80,people always wish technology seen movies best...,4,1888,3.491977,0.172277,162,0
3,001bdc0,heard venus planet without almost oxygen earth...,4,1889,3.824,0.13221,170,4
4,002ba53,dear state senator letter argue favor keeping ...,3,1456,4.582345,0.131378,103,0


In [5]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

normalized_features = scaler.fit_transform(df[['text_length', 'sentiment', 'unique_word_count', 'most_common_word_freq', 'idf_feature']])

df[['text_length', 'sentiment', 'unique_word_count', 'most_common_word_freq', 'idf_feature']] = normalized_features

df.head()

Unnamed: 0,essay_id,full_text,score,text_length,idf_feature,sentiment,unique_word_count,most_common_word_freq
0,000d118,many people car live thing dont know use car a...,3,0.588805,1.178515,0.904791,1.19792,-0.806105
1,000fe60,scientist nasa discussing face mars explaining...,3,-0.72809,-0.749151,-0.204794,-0.68699,0.135805
2,001ab80,people always wish technology seen movies best...,4,1.01736,-0.913658,0.351432,0.876628,-0.806105
3,001bdc0,heard venus planet without almost oxygen earth...,4,1.019095,0.016255,-0.004233,1.047984,0.449775
4,002ba53,dear state senator letter argue favor keeping ...,3,0.267823,2.140191,-0.011619,-0.387118,-0.806105


In [6]:
from gensim import corpora, models

num_topics = 20

texts = df['full_text'].apply(lambda x: x.split())

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=2)

df['topics'] = df['full_text'].apply(lambda x: dict(lda_model[dictionary.doc2bow(x.split())]))
for i in range(num_topics):  # Assuming you have 10 topics
    df[f'topic_{i}'] = df['topics'].apply(lambda x: x.get(i, 0))

# Now you can drop the original 'topics' column
df = df.drop(columns=['topics'])

df.head()


Unnamed: 0,essay_id,full_text,score,text_length,idf_feature,sentiment,unique_word_count,most_common_word_freq,topic_0,topic_1,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,000d118,many people car live thing dont know use car a...,3,0.588805,1.178515,0.904791,1.19792,-0.806105,0.0,0.0,...,0.0,0.0,0.0,0.715285,0.0,0.0,0.0,0.0,0.0,0.0
1,000fe60,scientist nasa discussing face mars explaining...,3,-0.72809,-0.749151,-0.204794,-0.68699,0.135805,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,001ab80,people always wish technology seen movies best...,4,1.01736,-0.913658,0.351432,0.876628,-0.806105,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.945746,0.0,0.0,0.0
3,001bdc0,heard venus planet without almost oxygen earth...,4,1.019095,0.016255,-0.004233,1.047984,0.449775,0.0,0.209764,...,0.0,0.0,0.0,0.0,0.748647,0.0,0.0,0.0,0.0,0.0
4,002ba53,dear state senator letter argue favor keeping ...,3,0.267823,2.140191,-0.011619,-0.387118,-0.806105,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.994689,0.0


In [7]:
import torch
from torch import nn
from transformers import AutoModel

class TransformerScorer(nn.Module):
    def __init__(self, transformer_model_name):
        super(TransformerScorer, self).__init__()
        self.transformer = AutoModel.from_pretrained(transformer_model_name)
        in_features = 5 + num_topics 
        self.classifier = nn.Sequential(
            nn.Linear(in_features ,in_features),
            nn.BatchNorm1d(in_features),
            nn.ReLU(),
            nn.Linear(in_features ,32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32 ,16),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Linear(16 ,8),
            nn.BatchNorm1d(8),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(8, 1)
        )

        
    def forward(self, input_ids, attention_mask, extra_features):
        logits = self.classifier(extra_features)
        
        return logits

model_name = "bert-base-uncased" 

In [8]:


from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_embed = BertModel.from_pretrained('bert-base-uncased')



In [9]:
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Define the device


class EssayDataset(Dataset):
    
    def __init__(self, df, tokenizer, model):
        self.df = df
        self.tokenizer = tokenizer
        self.model = model

        # Tokenize and embed the entire dataset at once
        self.input_ids, self.attention_masks, self.extra_features = self.tokenize_and_embed(df)

    def tokenize_and_embed(self, df):
        input_ids = []
        attention_masks = []
        extra_features = []
        
        texts = df['full_text'].values

        for idx, text in enumerate(texts):
            tokens = self.tokenizer.tokenize(text)
            if len(tokens) > 512:
                tokens = tokens[:512]
            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            if len(token_ids) < 512:
                token_ids += [0] * (512 - len(token_ids))
            attention_mask = [1 if token_id != 0 else 0 for token_id in token_ids]
            input_ids.append(token_ids)
            attention_masks.append(attention_mask)

            extra_features.append([ df['text_length'][idx], df['idf_feature'][idx], df['sentiment'][idx], df['unique_word_count'][idx], df['most_common_word_freq'][idx]])
            
            for i in range(num_topics):
                extra_features[-1].append(df[f'topic_{i}'][idx])
            

        # Convert lists to tensors
        input_ids = torch.tensor(input_ids).to(device)
        attention_masks = torch.tensor(attention_masks).to(device)
        extra_features = torch.tensor(extra_features).to(device)

        return input_ids, attention_masks, extra_features

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.df['score'][idx], self.attention_masks[idx], self.extra_features[idx]
        
#dataset = EssayDataset(df, tokenizer, model_embed)
#dataloader = DataLoader(dataset, batch_size=8, shuffle=True)



In [10]:


#model = TransformerScorer(model_name)
#model = nn.DataParallel(model)
#model = model.to(device)

#criterion = nn.MSELoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001)


In [11]:
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

def quadratic_weighted_kappa(y_true, y_pred):
    """
    Calculates the Quadratic Weighted Kappa
    y_true: array, true labels
    y_pred: array, predicted labels
    """
    o = sk_confusion_matrix(y_true, y_pred)
    n = o.sum()
    row_sums = o.sum(axis=1)
    col_sums = o.sum(axis=0)
    
    expected = np.outer(row_sums, col_sums) / n
    weight_matrix = np.zeros_like(o, dtype=float)
    num_ratings = o.shape[0]
    for i in range(num_ratings):
        for j in range(num_ratings):
            weight_matrix[i, j] = float((i - j) ** 2) / (num_ratings - 1) ** 2
            
    kappa = 1 - (np.sum(weight_matrix * o) / np.sum(weight_matrix * expected))
    return kappa
""" 
print("Strat training")
losses = []
mean_loss = 0
for epoch in range(100):
    running_loss = 0.0
    all_labels = []
    all_outputs_rounded = []
    
    for i, data in enumerate(dataloader, 0):
        
        inputs, labels, mask, features = data
        inputs = inputs.to(device)
        mask = mask.to(device)
        labels = labels.to(device)
        # inputs are in Size([2, 1, 512, 768]) but we need them in Size([2, 512, 768])
        inputs = inputs.squeeze(1)
        
        inputs = inputs.long()
        mask = mask.long()
        features = features.float()
        
        optimizer.zero_grad()
        outputs = model(inputs, mask, features)
        outputs = outputs.squeeze(1)
        outputs_rounded = outputs.detach().round().long()
        outputs_rounded = torch.clamp(outputs_rounded, min=0, max=6)
        
        all_labels.append(labels.cpu().detach().numpy())
        all_outputs_rounded.append(outputs_rounded.cpu().detach().numpy())

        loss = criterion(outputs, labels.float())
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        if len(losses) > 20:
            mean_loss = np.mean(losses[-20:])
        if i % 20 == 0:
            print(f"Epoch {epoch + 1}, Batch {i + 1} loss: {loss.item()} mean : {mean_loss}")
            print(outputs_rounded)

    all_labels = np.concatenate(all_labels)
    all_outputs_rounded = np.concatenate(all_outputs_rounded)
    kappa = quadratic_weighted_kappa(all_labels, all_outputs_rounded)

    print(f"Epoch {epoch + 1} kappa: {kappa}")
            
print('Finished Training') """

' \nprint("Strat training")\nlosses = []\nmean_loss = 0\nfor epoch in range(100):\n    running_loss = 0.0\n    all_labels = []\n    all_outputs_rounded = []\n    \n    for i, data in enumerate(dataloader, 0):\n        \n        inputs, labels, mask, features = data\n        inputs = inputs.to(device)\n        mask = mask.to(device)\n        labels = labels.to(device)\n        # inputs are in Size([2, 1, 512, 768]) but we need them in Size([2, 512, 768])\n        inputs = inputs.squeeze(1)\n        \n        inputs = inputs.long()\n        mask = mask.long()\n        features = features.float()\n        \n        optimizer.zero_grad()\n        outputs = model(inputs, mask, features)\n        outputs = outputs.squeeze(1)\n        outputs_rounded = outputs.detach().round().long()\n        outputs_rounded = torch.clamp(outputs_rounded, min=0, max=6)\n        \n        all_labels.append(labels.cpu().detach().numpy())\n        all_outputs_rounded.append(outputs_rounded.cpu().detach().numpy()

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score

X = df.drop(columns=['score', 'full_text'])
X.set_index('essay_id', inplace=True)
y = df['score']
X.head()

Unnamed: 0_level_0,text_length,idf_feature,sentiment,unique_word_count,most_common_word_freq,topic_0,topic_1,topic_2,topic_3,topic_4,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
essay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000d118,0.588805,1.178515,0.904791,1.19792,-0.806105,0.0,0.0,0.280929,0.0,0.0,...,0.0,0.0,0.0,0.715285,0.0,0.0,0.0,0.0,0.0,0.0
000fe60,-0.72809,-0.749151,-0.204794,-0.68699,0.135805,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001ab80,1.01736,-0.913658,0.351432,0.876628,-0.806105,0.0,0.0,0.0,0.039235,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.945746,0.0,0.0,0.0
001bdc0,1.019095,0.016255,-0.004233,1.047984,0.449775,0.0,0.209764,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.748647,0.0,0.0,0.0,0.0,0.0
002ba53,0.267823,2.140191,-0.011619,-0.387118,-0.806105,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.994689,0.0


In [15]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import numpy as np

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define the parameter grid
param_grid = {
    'n_estimators': np.arange(100, 2000, 100),
    'learning_rate': np.linspace(0.001, 0.5, 100),
    'max_depth': np.arange(3, 10, 1),
    'colsample_bytree': np.linspace(0.3, 0.8, 6),
    'subsample': np.linspace(0.4, 1, 5),
    'gamma': np.linspace(0, 0.5, 5),
    'reg_alpha': np.linspace(0, 1, 10),
    'reg_lambda': np.linspace(0, 1, 10),
}

# Initialize an XGBoost Regressor
regressor = XGBRegressor(random_state=42)

# Initialize a RandomizedSearchCV object
random_search = RandomizedSearchCV(regressor, param_grid, n_iter=500, scoring='neg_mean_squared_error', cv=5, random_state=42, verbose=2, n_jobs=-1)

# Perform the random search
random_search.fit(X_train, y_train)

# Print the best parameters
print(random_search.best_params_)

# Now you can use regressor to make predictions

y_pred = random_search.predict(X_test)

outputs_rounded = y_pred.round()

kappa = quadratic_weighted_kappa(y_test, outputs_rounded)
kappa



Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV] END colsample_bytree=0.4, gamma=0.375, learning_rate=0.2681414141414141, max_depth=6, n_estimators=500, reg_alpha=0.0, reg_lambda=0.7777777777777777, subsample=0.55; total time=   2.3s
[CV] END colsample_bytree=0.4, gamma=0.375, learning_rate=0.2681414141414141, max_depth=6, n_estimators=500, reg_alpha=0.0, reg_lambda=0.7777777777777777, subsample=0.55; total time=   2.2s
[CV] END colsample_bytree=0.4, gamma=0.375, learning_rate=0.2681414141414141, max_depth=6, n_estimators=500, reg_alpha=0.0, reg_lambda=0.7777777777777777, subsample=0.55; total time=   2.3s
[CV] END colsample_bytree=0.4, gamma=0.375, learning_rate=0.2681414141414141, max_depth=6, n_estimators=500, reg_alpha=0.0, reg_lambda=0.7777777777777777, subsample=0.55; total time=   2.4s
[CV] END colsample_bytree=0.4, gamma=0.375, learning_rate=0.2681414141414141, max_depth=6, n_estimators=500, reg_alpha=0.0, reg_lambda=0.7777777777777777, subsample=0.55; total

0.7578972606845292

0.7248128327269463