In [12]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import torch
from transformers import CamembertModel, CamembertTokenizer
from tqdm import tqdm
from pathlib import Path

import csv
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from nltk.corpus import stopwords 

In [16]:
def text_embeddings(X, key='text', device='cuda'):
    tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-large")
    camembert = CamembertModel.from_pretrained("camembert/camembert-large")
    camembert.to(device)

    camembert.eval()
    encoded_features = []
    
    for i, sample in tqdm(enumerate(X[key])):
        tokenized_sentence = tokenizer.tokenize(sample)
        # 1-hot encode and add special starting and end tokens 
        encoded_sentence = tokenizer.encode(tokenized_sentence)
        # [5, 133, 22, 1250, 16, 12034, 14324, 81, 76, 6]
        encoded_sentence = torch.tensor(encoded_sentence).unsqueeze(0).to(device)
        encoded_features.append(camembert(encoded_sentence).last_hidden_state.squeeze().detach().cpu())
        torch.cuda.empty_cache()
        print(camembert(encoded_sentence)[0].shape, camembert(encoded_sentence)[1].shape)
        return 0
    # Feed tokens to Camembert as a torch tensor (batch dim 1)
    X[key] = encoded_features

In [14]:
data_name = 'camembert_embeddings.csv'

train_data = pd.read_csv(f"./retweet-prediction-challenge/{data_name}")
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.7, test_size=0.3, random_state=42)

X_train = X_train.drop(['retweets_count'], axis=1)
X_test = X_test.drop(['retweets_count'], axis=1)

In [9]:
max_s = 0
for i in tqdm(range(200000)):
    t = torch.load(f'/shared/personal/vladtom/camembert_embeddings/tensor{i}.pt')
    max_s = max(max_s, t.shape[0])
print(max_s)

  0%|                                                                                                        | 213/200000 [00:16<4:18:28, 12.88it/s]


KeyboardInterrupt: 

In [17]:
text_embeddings(X_train)

Some weights of the model checkpoint at camembert/camembert-large were not used when initializing CamembertModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
0it [00:00, ?it/s]

torch.Size([1, 228, 1024]) torch.Size([1, 1024])





0

In [3]:
reg = GradientBoostingRegressor()
reg.fit(X_train, y_train)

ValueError: could not convert string to float: 'tensor([[-0.4199,  0.4755,  0.2826,  ...,  0.1686,  0.0554, -0.0446],\n        [-0.1891, -0.0699, -0.0751,  ..., -0.2142,  0.2878,  0.1569],\n        [-0.0892,  0.2355,  0.1951,  ..., -0.0784, -0.0446,  0.3815],\n        ...,\n        [ 0.0657,  0.0208,  0.0318,  ...,  0.1672,  0.1030, -0.0978],\n        [ 0.0378, -0.0120,  0.2405,  ...,  0.0196,  0.1191, -0.0277],\n        [-0.8140, -0.3506, -0.4044,  ..., -0.4606,  0.3278, -0.2896]])'

In [None]:
text_transform(X_test)

In [None]:
y_pred = reg.predict(X_test)
y_pred = [int(value) if value >= 0 else 0 for value in y_pred]

print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))

In [4]:


train_data = pd.read_csv("./retweet-prediction-challenge/train.csv")
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweets_count'], stratify=train_data['retweets_count'], train_size=0.7, test_size=0.3, random_state=42)

X_train = X_train.drop(['retweets_count'], axis=1)
X_test = X_test.drop(['retweets_count'], axis=1)

# You can examine the available features using X_train.head()
# X_train.head()

# We set up an Tfidf Vectorizer that will use the top 100 tokens from the tweets. We also remove stopwords.
# To do that we have to fit our training dataset and then transform both the training and testing dataset. 
vectorizer = TfidfVectorizer(max_features=100, stop_words=stopwords.words('french'))
X_train = vectorizer.fit_transform(X_train['text'])
X_test = vectorizer.transform(X_test['text'])

# Now we can train our model. Here we chose a Gradient Boosting Regressor and we set our loss function 
# reg = GradientBoostingRegressor() #reg = RandomForestRegressor() #
reg = LinearRegression()

# We fit our model using the training data
reg.fit(X_train, y_train)
# And then we predict the values for our testing set
y_pred = reg.predict(X_test)
# We want to make sure that all predictions are non-negative integers
y_pred = [int(value) if value >= 0 else 0 for value in y_pred]

print("Prediction error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))

Prediction error: 26.9448446666855


Some weights of the model checkpoint at camembert/camembert-large were not used when initializing CamembertModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
237it [00:14, 16.51it/s]


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 15.74 GiB total capacity; 7.83 GiB already allocated; 5.31 MiB free; 7.83 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [22]:
X_train['text']

237569    1
278298    1
330283    1
294777    1
246705    1
         ..
85573     1
219726    1
276852    1
77441     1
82816     1
Name: text, Length: 247778, dtype: int64

In [13]:
from torch.utils.data import Dataset
class FolderDataset(Dataset):
    def __init__(self, folder):
        self.files = os.listdir(folder)
        self.folder = folder
    def __len__(self):
        return len(self.files)
    def __getitem__(self, idx):
        return torch.load(f"{self.folder}/{self.files[idx]}")
    
save_path = Path('/shared/personal/vladtom/camembert_embeddings')
ds = FolderDataset(save_path)

In [15]:
for i in range(10):
    print(ds[i].shape)

torch.Size([26, 1024])
torch.Size([16, 1024])
torch.Size([13, 1024])
torch.Size([22, 1024])
torch.Size([22, 1024])
torch.Size([31, 1024])
torch.Size([12, 1024])
torch.Size([13, 1024])
torch.Size([7, 1024])
torch.Size([15, 1024])


In [24]:
import glob
mylist = [f for f in glob.glob("*tokenize*", recursive=True)]
mylist

[]

In [None]:
# GradientBoostingRegressor Prediction error: 26.315271539019314