# In [11]:
import re
import string
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import requests

#from google.colab import drive
#drive.mount('/content/drive')
#train_path = '/content/drive/MyDrive/customer_service/train.csv'
#test_path = '/content/drive/MyDrive/customer_service/test.csv'

# Raw GitHub locations of the customer-service training and test CSV files.
train_url = (
    'https://raw.githubusercontent.com/Onatparagus/DI725_Assignment1/'
    'refs/heads/main/starter_code/data/customer_service/train.csv'
)
test_url = (
    'https://raw.githubusercontent.com/Onatparagus/DI725_Assignment1/'
    'refs/heads/main/starter_code/data/customer_service/test.csv'
)

def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address to fetch with an HTTP GET request.
        file_name: Path of the local file to create or overwrite.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status; otherwise the server's error page would
    # be written to disk as if it were the requested data file.
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)
# Fetch both CSV splits into the current working directory.
download_file(url=train_url, file_name='train.csv')
download_file(url=test_url, file_name='test.csv')

# Load the downloaded, comma-separated files into data frames.
train_df = pd.read_csv('train.csv', sep=",")
test_df = pd.read_csv('test.csv', sep=",")

# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token as
# the padding token, since tokenize_function pads every input to a fixed length.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def clean_text(text):
    """Normalize a conversation string before tokenization.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Drop punctuation first: removing a free-standing mark (e.g. " - ") or a
    # leading/trailing one leaves extra whitespace behind, which the
    # following steps then normalize.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs
    return text.strip()

# Store a normalized copy of every conversation in "cleaned_conversation".
train_df['cleaned_conversation'] = train_df['conversation'].map(clean_text)
test_df['cleaned_conversation'] = test_df['conversation'].map(clean_text)

def tokenize_function(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The input is padded or truncated to exactly 512 tokens, and the encoding
    (including the attention mask) is requested as PyTorch tensors.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
        "return_attention_mask": True,
    }
    return tokenizer(text, **encoding_options)

# Encode every cleaned conversation; each row's tokenizer output is stored in
# the "tokens" column.
train_df['tokens'] = train_df['cleaned_conversation'].apply(tokenize_function)
test_df['tokens'] = test_df['cleaned_conversation'].apply(tokenize_function)

# Convert the sentiment strings to integer class ids. The encoder is fitted on
# the training labels and then reused unchanged for the test labels.
label_encoder = LabelEncoder()
train_df['sentiment_label'] = label_encoder.fit_transform(train_df['customer_sentiment'])
test_df['sentiment_label'] = label_encoder.transform(test_df['customer_sentiment'])

# Drop every column except the integer label and the tokens.
train_df = train_df[['sentiment_label', 'tokens']]
test_df = test_df[['sentiment_label', 'tokens']]

# Hold out 10% of the training rows for validation, stratified by label and
# with a fixed seed so the split is reproducible.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=725,
    stratify=train_df['sentiment_label'],
)