# Install and import necessary modules

In [None]:
!pip install simpletransformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m101.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.8 MB/s[0m et

In [None]:
import random
import pickle
import logging
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs

## Mount Google Drive in Colab

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Change the working directory so the relative file paths used later
# (e.g. "train.csv") resolve inside this project folder.
%cd "/content/drive/MyDrive/Telecom/Airbus/Intimacy"

## Set seed and logger

In [None]:
# Apply one fixed seed to every random number generator in use:
# Python's `random`, NumPy, and PyTorch (default generator and all CUDA devices).
seed = 56
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

# Put cuDNN in deterministic mode and switch its benchmark mode off.
cudnn = torch.backends.cudnn
cudnn.deterministic = True
cudnn.benchmark = False

# Log at INFO level overall, but only WARNING and above for the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Load data

In [None]:
# Load the training CSV and shuffle all of its rows (fixed random_state, so the
# shuffled order is the same on every run).
df = pd.read_csv("train.csv").sample(frac=1, random_state=0)
train_texts = list(df["text"].values)
train_labels = list(df["label"].values)

# Load the translated training texts from their pickle file.
# NOTE(review): the texts above are shuffled and are later paired with these
# translations by position -- confirm the pickle was saved in the same shuffled order.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
with open('translated_train.pickle', 'rb') as pickle_file:
    tr_train_texts = pickle.load(pickle_file)

# Pre-processing

In [None]:
# Combine each original text with its translation, joined by ' </s></s> '.
# The pairing is positional, so the two lists must be the same length; a longer
# translation list would otherwise be silently truncated and hide a misalignment.
if len(train_texts) != len(tr_train_texts):
    raise ValueError(
        f"Expected one translation per training text, got {len(train_texts)} "
        f"texts and {len(tr_train_texts)} translations"
    )
train_texts = [
    text + ' </s></s> ' + translation
    for text, translation in zip(train_texts, tr_train_texts)
]

# Create train_df from the combined texts and their labels
train_df = pd.DataFrame({"text": train_texts, "labels": train_labels})

# Split into train (80%) and test (20%) sets; no separate validation set is created.
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=0)

# Initialize model

In [None]:
epochs = 4
max_seq_length = 162
# Use the GPU only when one is actually present. Hard-coding True makes the
# model constructor fail on a CPU-only runtime, so fall back to CPU and warn.
use_cuda = torch.cuda.is_available()
if not use_cuda:
    logging.warning("CUDA is not available; the model will run on CPU.")

# Define model arguments. regression=True (together with num_labels=1 below)
# configures the model to output one continuous value per input.
model_args = ClassificationArgs(
    num_train_epochs=epochs,
    overwrite_output_dir=True,
    no_save=False,
    max_seq_length=max_seq_length,
    regression=True,
)

# Initialize the model from the pretrained Twitter XLM-RoBERTa checkpoint
model = ClassificationModel(
    "xlmroberta",
    "cardiffnlp/twitter-xlm-roberta-base",
    args=model_args,
    use_cuda=use_cuda,
    num_labels=1,
)

# other possibilities
#"xlmroberta", "xlm-roberta-base", args=model_args, use_cuda=True, num_labels=1
#"bert", "bert-base-multilingual-cased", args=model_args, use_cuda=True, num_labels=1

# Train and test the model

In [None]:
# Fine-tune the model on the training split
model.train_model(train_df)

# Evaluate on the test set
test_texts = list(test_df["text"].values)
test_labels = list(test_df["labels"].values)

predictions, _ = model.predict(test_texts)
# r2_score expects (y_true, y_pred) and is not symmetric in its arguments,
# so the ground-truth labels must be passed first.
r2 = r2_score(test_labels, predictions)
print(f"Test R2 Score: {r2}")


In [None]:
# Save the underlying Hugging Face model and its tokenizer to the 'model_intimacy' directory.
for artifact in (model.model, model.tokenizer):
    artifact.save_pretrained('model_intimacy')
# Save the model configuration to the same directory.
model.config.save_pretrained('model_intimacy/')