In [None]:
# First install then restart the kernel
#using for training_args ImportError: Using the `Trainer` with `PyTorch` requires `accelerate`: Run `pip install --upgrade accelerate`
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [65]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Create sequence dataset for GPT2 from Sanjagh dataset

In [None]:
import pandas as pd 
import numpy as np

In [None]:
df1 = pd.read_csv ('/content/drive/MyDrive/Recommendation/taskfullshare.csv')
df1 = df1.loc[:, ~df1.columns.str.contains('^Unnamed')]
df1.head()

Unnamed: 0,Init,service,user
0,1655018492726,328,1275352
1,1655018824405,92,1275354
2,1655018831538,12,1275355
3,1655018842708,149,1274893
4,1655018856558,42,8604


convert Init to datetime

In [None]:
df1['Init'] = pd.to_datetime(df1['Init'],unit='ms')
df1=df1.sort_values(by='Init',ascending=False)
df1.head(2)

Unnamed: 0,Init,service,user
1681001,2023-04-09 18:23:57.300,395,89038
1681000,2023-04-09 18:23:01.152,42,216925


Create date and time columns for deleting the duplicated data where the users apply one service during one day

In [None]:
df1 = df1.sort_values(by='Init')
# Extract date and time components
df1['date'] = df1['Init'].dt.date
df1['time'] = df1['Init'].dt.time
df1.head()

Unnamed: 0,Init,service,user,date,time
40557,2017-06-13 11:00:31.553,0,127,2017-06-13,11:00:31.553000
40555,2017-06-13 12:49:36.030,0,130,2017-06-13,12:49:36.030000
40554,2017-06-13 13:04:06.644,0,132,2017-06-13,13:04:06.644000
40597,2017-06-13 13:24:43.878,0,130,2017-06-13,13:24:43.878000
40629,2017-06-14 07:35:55.062,0,134,2017-06-14,07:35:55.062000


remove duplicated

In [None]:
df1= df1.drop_duplicates(subset=['date', 'service', 'user'])
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482958 entries, 40557 to 1681000
Data columns (total 5 columns):
 #   Column   Non-Null Count    Dtype         
---  ------   --------------    -----         
 0   Init     1482958 non-null  datetime64[ns]
 1   service  1482958 non-null  int64         
 2   user     1482958 non-null  int64         
 3   date     1482958 non-null  object        
 4   time     1482958 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 67.9+ MB


In [None]:
df1=df1.drop(['Init','time'],axis=1)
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482958 entries, 40557 to 1681000
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   service  1482958 non-null  int64 
 1   user     1482958 non-null  int64 
 2   date     1482958 non-null  object
dtypes: int64(2), object(1)
memory usage: 45.3+ MB


change the type of the features to appropriate types for sequence

In [None]:
df1["service"] = df1["service"].apply(lambda x: f"S_{x}")
df1["user"] = df1["user"].apply(lambda x: f"U_{x}")
df1["date"] = df1["date"].apply(lambda x:  f"D_{x}")
df1.head()

Unnamed: 0,service,user,date
40557,S_0,U_127,D_2017-06-13
40555,S_0,U_130,D_2017-06-13
40554,S_0,U_132,D_2017-06-13
40629,S_0,U_134,D_2017-06-14
40553,S_4,U_134,D_2017-06-14


In [None]:
ratings_group = df1.sort_values(by=["date"]).groupby("user")

ratings_data = pd.DataFrame(
    data={
        "user": list(ratings_group.user.apply(list)),
        "service": list(ratings_group.service.apply(list)),
        "date": list(ratings_group.date.apply(list)),
    }
)

Make sliding window that contains four sequence item in each row 

In [None]:
sequence_length = 4
step_size = 3


def create_sequences(values, window_size, step_size):
    sequences = []
    start_index = 0
    while True:
        end_index = start_index + window_size
        seq = values[start_index:end_index]
        if len(seq) < window_size:
            seq = values[-window_size:]
            if len(seq) == window_size:
                sequences.append(seq)
            break
        sequences.append(seq)
        start_index += step_size
    return sequences


ratings_data.service = ratings_data.service.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

ratings_data.date = ratings_data.date.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)
ratings_data.user = ratings_data.user.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)



In [None]:
ratings_data.head(2)

Unnamed: 0,user,service,date
0,"[[U_0, U_0, U_0, U_0], [U_0, U_0, U_0, U_0], [...","[[S_155, S_54, S_54, S_25], [S_25, S_15, S_95,...","[[D_2017-12-01, D_2018-01-04, D_2018-01-06, D_..."
1,"[[U_1, U_1, U_1, U_1], [U_1, U_1, U_1, U_1], [...","[[S_79, S_4, S_92, S_92], [S_92, S_92, S_94, S...","[[D_2017-07-06, D_2018-01-26, D_2018-02-25, D_..."


In [None]:
# Find the rows with empty lists
ratings_data[ratings_data["service"].str.len() == 0]

Unnamed: 0,user,service,date
2,[],[],[]
3,[],[],[]
4,[],[],[]
5,[],[],[]
7,[],[],[]
...,...,...,...
426020,[],[],[]
426022,[],[],[]
426023,[],[],[]
426024,[],[],[]


In [None]:
# Remove rows with empty lists 
ratings_data=ratings_data[~ratings_data["service"].str.len().eq(0)]

In [None]:
ratings_data.head()

Unnamed: 0,user,service,date
0,"[[U_0, U_0, U_0, U_0], [U_0, U_0, U_0, U_0], [...","[[S_155, S_54, S_54, S_25], [S_25, S_15, S_95,...","[[D_2017-12-01, D_2018-01-04, D_2018-01-06, D_..."
1,"[[U_1, U_1, U_1, U_1], [U_1, U_1, U_1, U_1], [...","[[S_79, S_4, S_92, S_92], [S_92, S_92, S_94, S...","[[D_2017-07-06, D_2018-01-26, D_2018-02-25, D_..."
6,"[[U_1000022, U_1000022, U_1000022, U_1000022],...","[[S_139, S_172, S_4, S_126], [S_126, S_52, S_8...","[[D_2021-09-08, D_2021-09-08, D_2021-09-08, D_..."
9,"[[U_100003, U_100003, U_100003, U_100003], [U_...","[[S_335, S_339, S_516, S_504], [S_516, S_504, ...","[[D_2019-02-05, D_2019-02-06, D_2019-11-12, D_..."
11,"[[U_1000040, U_1000040, U_1000040, U_1000040],...","[[S_20, S_20, S_46, S_87], [S_20, S_20, S_46, ...","[[D_2021-09-08, D_2021-09-13, D_2021-09-26, D_..."


In [None]:
ratings_data_item = ratings_data[["user", "service"]].explode([
    "service",'user'], ignore_index=True
)

ratings_data_rating = ratings_data[["date"]].explode("date", ignore_index=True)
ratings_data_transformed = pd.concat([ratings_data_item, ratings_data_rating], axis=1)

ratings_data_transformed.service = ratings_data_transformed.service.str.join(',')

ratings_data_transformed.date = ratings_data_transformed.date.apply(
    lambda x: ",".join([str(v) for v in x])
)
ratings_data_transformed.user = ratings_data_transformed.user.apply(
    lambda x: ",".join([str(v) for v in x])
)

ratings_data_transformed.rename(
    columns={'user':'sequences_user',"service": "sequence_item_ids", "date": "sequence_Init"},
    inplace=True,
)


In [None]:
ratings_data_transformed

Unnamed: 0,sequences_user,sequence_item_ids,sequence_Init
0,"U_0,U_0,U_0,U_0","S_155,S_54,S_54,S_25","D_2017-12-01,D_2018-01-04,D_2018-01-06,D_2018-..."
1,"U_0,U_0,U_0,U_0","S_25,S_15,S_95,S_92","D_2018-01-07,D_2018-02-15,D_2018-02-25,D_2018-..."
2,"U_0,U_0,U_0,U_0","S_92,S_92,S_92,S_133","D_2018-02-27,D_2018-03-05,D_2018-03-13,D_2018-..."
3,"U_0,U_0,U_0,U_0","S_133,S_0,S_15,S_0","D_2018-03-15,D_2018-04-07,D_2018-04-16,D_2018-..."
4,"U_0,U_0,U_0,U_0","S_0,S_155,S_92,S_155","D_2018-04-18,D_2018-04-25,D_2018-05-03,D_2018-..."
...,...,...,...
321491,"U_99996,U_99996,U_99996,U_99996","S_44,S_55,S_20,S_44","D_2019-02-05,D_2019-02-10,D_2019-02-26,D_2020-..."
321492,"U_999978,U_999978,U_999978,U_999978","S_139,S_172,S_4,S_84","D_2021-09-08,D_2021-09-08,D_2021-12-22,D_2021-..."
321493,"U_999978,U_999978,U_999978,U_999978","S_139,S_172,S_4,S_84","D_2021-09-08,D_2021-09-08,D_2021-12-22,D_2021-..."
321494,"U_999988,U_999988,U_999988,U_999988","S_43,S_43,S_43,S_43","D_2021-09-08,D_2021-11-29,D_2021-11-30,D_2021-..."


Create target column with the last services that used in each rows

In [None]:
ratings_data_transformed['target']=ratings_data_transformed['sequence_item_ids'].apply(lambda sentence: sentence.split(',')[-1].split(',')[-1])
ratings_data_transformed.head()

Unnamed: 0,sequences_user,sequence_item_ids,sequence_Init,target
0,"U_0,U_0,U_0,U_0","S_155,S_54,S_54,S_25","D_2017-12-01,D_2018-01-04,D_2018-01-06,D_2018-...",S_25
1,"U_0,U_0,U_0,U_0","S_25,S_15,S_95,S_92","D_2018-01-07,D_2018-02-15,D_2018-02-25,D_2018-...",S_92
2,"U_0,U_0,U_0,U_0","S_92,S_92,S_92,S_133","D_2018-02-27,D_2018-03-05,D_2018-03-13,D_2018-...",S_133
3,"U_0,U_0,U_0,U_0","S_133,S_0,S_15,S_0","D_2018-03-15,D_2018-04-07,D_2018-04-16,D_2018-...",S_0
4,"U_0,U_0,U_0,U_0","S_0,S_155,S_92,S_155","D_2018-04-18,D_2018-04-25,D_2018-05-03,D_2018-...",S_155


Create sequences column and join three columns sequences_user,sequence_item_ids,sequence_Init as a sentence so each row contain 4 sentences with this format: user_0 used service_155 in date D_2018-01-06

In [None]:
ratings_data_transformed['sequence'] = ratings_data_transformed.apply(lambda row: ", ".join([f"{u} {s} {d}" for u, s, d in zip(row['sequences_user'].split(','), row['sequence_item_ids'].split(','), row['sequence_Init'].split(','))]), axis=1)


In [None]:
df=ratings_data_transformed.drop(['sequences_user','sequence_item_ids','sequence_Init'],axis=1)
df.head()

Unnamed: 0,target,sequence
0,S_25,"U_0 S_155 D_2017-12-01, U_0 S_54 D_2018-01-04,..."
1,S_92,"U_0 S_25 D_2018-01-07, U_0 S_15 D_2018-02-15, ..."
2,S_133,"U_0 S_92 D_2018-02-27, U_0 S_92 D_2018-03-05, ..."
3,S_0,"U_0 S_133 D_2018-03-15, U_0 S_0 D_2018-04-07, ..."
4,S_155,"U_0 S_0 D_2018-04-18, U_0 S_155 D_2018-04-25, ..."


In [None]:
df.to_csv('/content/drive/MyDrive/Recommendation/GPT2_Sequence_final1.csv')

In [None]:
df1=pd.read_csv('/content/drive/MyDrive/Recommendation/GPT2_Sequence_final.csv')
df1= df1.loc[:, ~df1.columns.str.contains('^Unnamed')]

df1.head()

# GPT2


## Install and import libraries for GPT2

In [66]:
!pip install transformers

In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments,GPT2Config, AdamW
import torch
from torch.utils.data import Dataset, DataLoader

In [68]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load GPT2 model and tokenizer

In [69]:
# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Initialize the GPT-2 model
model_name = 'DistilGPT2' 
modell = GPT2LMHeadModel.from_pretrained(model_name).to(device)

In [None]:
print(modell)

## Read dataset and create pytorch dataset and dataloader

In [None]:
df2=pd.read_csv('/content/drive/MyDrive/Recommendation/GPT2_Sequence_final1.csv')
df2= df2.loc[:, ~df2.columns.str.contains('^Unnamed')]

df2.head()

Unnamed: 0,target,sequence
0,S_25,"U_0 S_155 D_2017-12-01, U_0 S_54 D_2018-01-04,..."
1,S_92,"U_0 S_25 D_2018-01-07, U_0 S_15 D_2018-02-15, ..."
2,S_133,"U_0 S_92 D_2018-02-27, U_0 S_92 D_2018-03-05, ..."
3,S_0,"U_0 S_133 D_2018-03-15, U_0 S_0 D_2018-04-07, ..."
4,S_155,"U_0 S_0 D_2018-04-18, U_0 S_155 D_2018-04-25, ..."


In [None]:
# Split the data into training and validation sets
train_data = df2.sample(frac=0.8, random_state=42)
val_data = df2.drop(train_data.index)


### Try to solve the GPU limitation problem with generator

In [None]:
def sequence_generator(data):
    sequences = data['sequence'].tolist()
    targets = data['target'].tolist()
    for sequence, target in zip(sequences, targets):
        encoded_sequence = tokenizer.batch_encode_plus(
            sequence,
            truncation=True,
            padding='longest',
            max_length=140,
            return_tensors='pt'
        )
        yield {
            'input_ids': encoded_sequence['input_ids'].squeeze(),
            'attention_mask': encoded_sequence['attention_mask'].squeeze(),
            'targets': target
        }

In [None]:
train_generator1 = sequence_generator(train_data)
val_generator1 = sequence_generator(val_data)

In [None]:
class GPT2Dataset(Dataset):
    def __init__(self, data_generator, length):
        self.data_generator = data_generator
        self.length = length

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        return next(self.data_generator)


In [None]:
train_dataset = GPT2Dataset(train_generator1, length=len(train_data))
val_dataset = GPT2Dataset(val_generator1, length=len(val_data))

## Training and generating

In [None]:
training_args = TrainingArguments(
    output_dir='./content/drive/MyDrive/Recommendation/GPT2_model',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=5000,
    # evaluation_strategy='epoch',
    eval_steps=1000,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
)


In [None]:
torch.cuda.empty_cache()

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }
    

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
optimizer = AdamW(modell.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
trainer = Trainer(
    model=modell,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)  
)




In [None]:
trainer.train()


Step,Training Loss
5000,0.4874
10000,0.4806
15000,0.4748
20000,0.4718
25000,0.4697
30000,0.4662
35000,0.4648
40000,0.4639
45000,0.4629
50000,0.4609


TrainOutput(global_step=64300, training_loss=0.46792622093093894, metrics={'train_runtime': 7649.065, 'train_samples_per_second': 33.625, 'train_steps_per_second': 8.406, 'total_flos': 7488570175266816.0, 'train_loss': 0.46792622093093894, 'epoch': 1.0})

# Save and load model

In [None]:
output_dir = '/content/drive/MyDrive/Recommendation/GPT2_model'  
trainer.save_model(output_dir)

In [71]:
# Load the model
model_G = GPT2LMHeadModel.from_pretrained(output_dir).to(device)


## Recommender system

In [72]:
gen_kwargs = {
    "min_length":-1,
    "top_k": 0.0,
    "top_p": 3.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

In [73]:

def recommend(text):
    input_tokens = tokenizer.encode_plus(text, truncation=True, padding=True, max_length=70, return_tensors="pt")

    input_ids = input_tokens['input_ids'].to(device)
    attention_mask = input_tokens['attention_mask'].to(device)

    generated_services = model_G.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=150,
        num_return_sequences=1,
        **gen_kwargs
    )

    generated_services = generated_services.to(device)

    recommended_services = tokenizer.decode(generated_services[0], skip_special_tokens=True)

    print('recommender system:',recommended_services)


In [74]:
text="U_0 S_155 D_2017-12-01"
recommend(text)

recommender system: U_0 S_155 D_2017-12-010, U_0S_4 D_2019-02-03, U_0 S_137 D_2019-02-07, U_0 S_137 D_2019-03-01, U_0 S_52 D_2019-05-22, U_0 S_19 D_2020-04-04, U_15600 S_9 D_2020-


In [75]:
text="U_127 S_155 D_2017-12-01"
recommend(text)

recommender system: U_127 S_155 D_2017-12-018, U_ 127S_133 D_2018-09-28, U_ 127S_16 D_2022-01-13, U_ 127S_55 D_2022-01-18, U_ 127S_121 D_2022-02-29, U_ 127S_4 D_2022-03-13, U_7 S_7 D_2023-03-24, U


In [78]:
text="U_1275352 S_155 D_2017-12-01"
recommend(text)

recommender system: U_1275352 S_155 D_2017-12-016, U_2528 S_73 D_2020-08-25, U_2528 S_171 D_2020-10-12, U_2528 S_171 D_2020-12-21, U_2528 S_73 D_2021-07-16, U_2528 S_15 D_2021-09-26, 


In [79]:
text="U_8604 S_155 D_2017-12-01"
recommend(text)

recommender system: U_8604 S_155 D_2017-12-017, U_86888 S_139 D_2019-01-29, U_86888 S_117 D_2019-02-14, U_86888 S_143 D_2019-02-15, U_86888 S_44 D_2019-02-23, U_86888 S_20 D_2019-02


In [81]:
text="U_134 S_155 D_2017-12-01"
recommend(text)

recommender system: U_134 S_155 D_2017-12-017, U_98497 S_143 D_2019-05-23, U_98497 S_46 D_2019-05-25, U_98497 S_337 D_2019-06-01, U_98497 S_509 D_2019-06-16, U_98497 S_92 D_2019-06-
