# Splitting interactions into train and test sets

This notebook splits the interactions into train and test sets once, per user, and exports both, so that all models are trained and evaluated on the same train and test data.

In [11]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [4]:
# Load the sampled user-book interaction table.
# NOTE(review): the "5to15" in the file name presumably means users with 5-15 reviews — confirm.
df_interactions_sampled = pd.read_csv('df_interactions_reviews_5to15.csv')

In [5]:
# (rows, columns) of the loaded interaction table
df_interactions_sampled.shape

(176341, 7)

In [6]:
# Preprocessing: replace the raw user ids with integer codes 0..n_users-1.
# The column is overwritten in place, so the fitted `userid_encoder`
# (its `classes_`) is the only remaining link back to the original ids.
userid_encoder = LabelEncoder()
userid_encoder.fit(df_interactions_sampled['user_id'])
df_interactions_sampled['user_id'] = userid_encoder.transform(df_interactions_sampled['user_id'])

In [7]:
# Preview the first three rows of the interaction table
df_interactions_sampled.head(3)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,481,47693,5,2012-06-05 16:34:47+00:00,,4508,0.574139
1,481,30118,4,2012-06-05 02:44:17+00:00,,4509,0.527973
2,481,144974,5,2012-06-05 02:33:59+00:00,,4509,0.574139


In [8]:
# Function to split train/test per user
def split_train_test(df, n_test=2):
    """Hold out the last ``n_test`` rows of every user as that user's test set.

    "Last" means last in the row order of ``df`` as passed in. The function
    does not sort by any date column, so order ``df`` beforehand if a
    chronological hold-out is wanted.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table containing a ``user_id`` column. Not modified.
    n_test : int, default 2
        Number of trailing rows per user placed in the test set. Users with
        ``n_test`` rows or fewer end up entirely in the test set.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Both keep the columns of ``df``, get a fresh 0..n-1 index, and are
        ordered by ascending ``user_id`` with each user's original row order
        preserved. Rows whose ``user_id`` is missing appear in neither frame.
        An empty ``df`` yields two empty frames.

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError(f"n_test must be non-negative, got {n_test}")

    # A groupby on `user_id` skips rows with a missing key; keep that explicit.
    has_user = df['user_id'].notna().to_numpy()
    # Stable sort: users in ascending order, original row order kept per user.
    ordered = df[has_user].sort_values('user_id', kind='mergesort')

    # 0 for a user's last row, 1 for the one before it, and so on. Converted to
    # a plain array so the selection below is positional: repeated index labels
    # in `df` can no longer drop extra training rows.
    rows_from_end = ordered.groupby('user_id').cumcount(ascending=False).to_numpy()
    is_test = rows_from_end < n_test

    train_df = ordered[~is_test].reset_index(drop=True)
    test_df = ordered[is_test].reset_index(drop=True)

    return train_df, test_df

# Apply the split: each user's last 2 rows (in the table's current row order) become test data.
# NOTE(review): the rows previewed in this notebook look newest-first within a user, which
# would make the held-out rows the OLDEST interactions — confirm this is intended.
train_df, test_df = split_train_test(df_interactions_sampled, n_test=2)

In [9]:
# Report (rows, columns) of each split
train_shape = train_df.shape
test_shape = test_df.shape
print("Training data shape:", train_shape)
print("Testing data shape:", test_shape)

Training data shape: (134745, 7)
Testing data shape: (41596, 7)


In [10]:
# Spot-check the split for encoded user ids 0-4: row counts and train/test share
for uid in range(5):
    n_train = int((train_df['user_id'] == uid).sum())
    n_held_out = int((test_df['user_id'] == uid).sum())
    n_all = n_train + n_held_out

    train_share = n_train / n_all
    test_share = n_held_out / n_all

    print(f"\nUser {uid}:")
    print(f"  Train samples: {n_train}")
    print(f"  Test samples: {n_held_out}")
    print(f"  Train ratio: {train_share:.2f}")
    print(f"  Test ratio: {test_share:.2f}")


User 0:
  Train samples: 7
  Test samples: 2
  Train ratio: 0.78
  Test ratio: 0.22

User 1:
  Train samples: 10
  Test samples: 2
  Train ratio: 0.83
  Test ratio: 0.17

User 2:
  Train samples: 7
  Test samples: 2
  Train ratio: 0.78
  Test ratio: 0.22

User 3:
  Train samples: 12
  Test samples: 2
  Train ratio: 0.86
  Test ratio: 0.14

User 4:
  Train samples: 7
  Test samples: 2
  Train ratio: 0.78
  Test ratio: 0.22


# Export to CSV

In [145]:
# Write the training split to CSV; index=False leaves the row index out of the file
train_df.to_csv('train_interactions.csv', index=False)

In [146]:
# Write the test split to CSV; index=False leaves the row index out of the file
test_df.to_csv('test_interactions.csv', index=False)

## Check for interaction books missing from the books table

In [148]:
# Load the books table and keep its book_id column as the reference list of known ids
df_books_final = pd.read_csv('df_books_final.csv')
books = df_books_final['book_id']

In [149]:
# Preview the first three rows of the training split
train_df.head(3)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,157993,5,2016-07-18 19:34:48+00:00,,3004,0.574139
1,0,359079,4,2014-07-16 19:28:57+00:00,,3737,0.527973
2,0,41684,4,2014-07-16 13:45:50+00:00,,3738,0.527973


In [150]:
# Distinct book ids across BOTH exported splits (train ids first, then any
# test-only ids), so a book that appears only in the test set is checked too.
interaction_books = pd.concat([train_df['book_id'], test_df['book_id']]).unique()

In [None]:
# Flag every id in interaction_books that has no match in books, then keep those ids
not_in_books = np.isin(interaction_books, books, invert=True)
missing_books = interaction_books[not_in_books]

# Number of unmatched book ids (0 means every id was found in books)
print(len(missing_books))

0
