# Initialisation

## Packages

In [None]:
import pandas as pd
import numpy as np
import emoji
import os
from tqdm import tqdm
import pickle

from transformers import AutoTokenizer, AutoModel
from transformers import LlamaTokenizer, LlamaModel
import torch

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [None]:
# Random seed passed as `random_state` to the train/validation/test splits below.
SEED = 19260817

# Data Wrangling

## Data Import

In [None]:
# Read the raw workbook once and pull out the three sheets.
# Passing a list to `sheet_name` makes pandas return a dict keyed by sheet
# name, so the file is opened and parsed a single time instead of three.
raw_sheets = pd.read_excel(
    "../data/raw/小助手数据采集.xlsx",
    sheet_name=["Sheet1", "Sheet2", "Sheet3"],
)
df1 = raw_sheets["Sheet1"]
df2 = raw_sheets["Sheet2"]
df3 = raw_sheets["Sheet3"]

## Function Definition

In [None]:
def remove_emoji(text):
    """Strip every emoji from *text*; non-string values are returned untouched."""
    if isinstance(text, str):
        # Each emoji is substituted with the empty string.
        return emoji.replace_emoji(text, replace='')
    return text

## Preprocessing

In [None]:
# Formatting and renaming columns.
# For every sheet: drop the row labelled 0 (presumably a sub-header row --
# TODO confirm) and rename the sheet-specific period column to a shared
# "Time" name so the sheets line up when concatenated.
df1 = df1.drop(index=0).rename(columns={"2021.3-2022.3": "Time"})
df2 = df2.drop(index=0).rename(columns={"2022.4-2023.3": "Time"})
df3 = df3.drop(index=0).rename(columns={"2023.4-2024.3": "Time"})

## Aggregation

In [None]:
# Stack the three sheets vertically and renumber the rows from 0.
yearly_frames = [df1, df2, df3]
concated_df = pd.concat(yearly_frames, ignore_index=True)
concated_df

## Formatting

In [None]:
# Give the auto-generated "Unnamed: k" columns descriptive names.
column_names = {
    "Unnamed: 1": "PYQ_Text",
    "Unnamed: 2": "Title",
    "Unnamed: 3": "Views",
    "Unnamed: 4": "reposted",
}
renamed_df = concated_df.rename(columns=column_names)
renamed_df

In [None]:
# Missing entries in the "reposted" column are replaced with 0.
reposted_column = renamed_df["reposted"]
renamed_df["reposted"] = reposted_column.fillna(0)

In [None]:
# how to handle instances with null text?
# Keep the filtered rows in a variable and make them the cell's value.
# Previously the filter was a bare expression followed by another line, so its
# result was computed and thrown away without ever being shown.
null_text_rows = renamed_df[renamed_df["PYQ_Text"].isna()]
null_text_rows

## Export and read back in

In [None]:
# Persist the cleaned frame (without the index column).
cleaned_csv_path = "../data/curated/cleaned_df.csv"
renamed_df.to_csv(cleaned_csv_path, index=False)

In [None]:
# Reload the cleaned data from disk; later cells work on `df`.
cleaned_csv_source = "../data/curated/cleaned_df.csv"
df = pd.read_csv(cleaned_csv_source)
df

## Feature Engineering

In [None]:
# Remove the "Time" column; it is not used as a feature.
df = df.drop(columns=['Time'])
df

In [None]:
# Build emoji-free copies of the title and the post text, then discard the
# raw columns so only the cleaned versions remain.
for raw_col, clean_col in (
    ('Title', 'Title_without_emoji'),
    ('PYQ_Text', 'PYQ_Text_without_emoji'),
):
    df[clean_col] = df[raw_col].apply(remove_emoji)
df = df.drop(columns=['PYQ_Text', 'Title'])
df

In [None]:
# Replace newlines and tabs with a single space in both cleaned text columns.
for text_col in ('PYQ_Text_without_emoji', 'Title_without_emoji'):
    df[text_col] = df[text_col].str.replace('\n', ' ').str.replace('\t', ' ')

df

In [None]:
# Label type 2: percentile rank of the view count (`rank(pct=True)`).
view_counts = df['Views']
df['QuantileLabel'] = view_counts.rank(pct=True)
df

In [None]:
# Replace missing text with an empty string so the tokenizer always gets a str.
# Only the text columns are filled: a frame-wide fillna('') would also write ''
# into numeric columns such as Views and QuantileLabel, turning them into
# object dtype and putting strings into the label vector used downstream.
text_columns = ['Title_without_emoji', 'PYQ_Text_without_emoji']
df[text_columns] = df[text_columns].fillna('')

## Export

In [None]:
# Persist the engineered frame (without the index column).
engineered_csv_path = '../data/curated/df_engineered.csv'
df.to_csv(engineered_csv_path, index=False)

# Get embeddings

## Import Model

In [None]:
# Hugging Face model id; used to load the tokenizer and to name the output directory.
MODEL = "shenzhi-wang/Llama3-8B-Chinese-Chat"

In [None]:
# The model id contains "/", which would be read as a path separator, so it is
# swapped for "_" before being used as a directory name.
model_save_name = "_".join(MODEL.split("/"))

model_output_dir = f"../data/curated/{model_save_name}"
os.makedirs(model_output_dir, exist_ok=True)

In [None]:
# Hugging Face access token, read from the environment so that no credential is
# stored in the notebook. The token that used to be hard-coded here has been
# exposed and should be revoked. When HF_TOKEN is unset, `token=None` lets
# transformers fall back to a locally cached login, if any.
token = os.environ.get("HF_TOKEN")  # TODO: use cssa account to get new permanent token

# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=token)

# Load the pre-trained LLaMA weights from the SAME checkpoint as the tokenizer.
# The previous code loaded 'google-bert/bert-base-chinese' into LlamaModel,
# which matches neither the architecture nor the tokenizer's vocabulary.
model = LlamaModel.from_pretrained(MODEL, token=token)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

## Define Functions

In [None]:
def get_embeddings(text_list: list) -> np.ndarray:
    """Embed each text as the mean of its last-layer token vectors.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Every text
    is tokenized and passed through the model on its own (batch of one).

    Args:
        text_list: Sequence of strings to embed.

    Returns:
        A 2-D array with one row per input text and one column per hidden
        dimension. A single input still yields a 2-D array of one row. An
        empty input yields an empty array of shape ``(0,)``.
    """
    all_embeddings = []

    for text in tqdm(text_list):
        inputs = tokenizer(text, return_tensors='pt')
        # Move tensors to the model's device; `token_type_ids` is not passed
        # on to the model.
        inputs = {key: val.to(device) for key, val in inputs.items() if key != 'token_type_ids'}

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        embeddings = outputs.last_hidden_state.cpu().numpy()
        # Average over the token axis: (1, seq_len, hidden) -> (1, hidden).
        all_embeddings.append(embeddings.mean(axis=1))

    if not all_embeddings:
        return np.empty((0,))

    # Concatenate along the batch axis instead of `np.array(...).squeeze()`:
    # squeeze would also drop the row axis when only one text is given.
    return np.concatenate(all_embeddings, axis=0)

In [None]:
def to_numpy_and_save(embeddings: list, filename: str):
    """Write *embeddings* to *filename* in NumPy ``.npy`` format."""
    np.save(file=filename, arr=embeddings)

## Get Embeddings

In [None]:
# Pull the two cleaned text columns out as NumPy arrays for embedding.
pyq_text = df['PYQ_Text_without_emoji'].to_numpy()
title_text = df['Title_without_emoji'].to_numpy()

In [None]:
# Embed the post texts first, then the titles (map is consumed in that order).
pyq_text_embeddings, title_embeddings = map(get_embeddings, (pyq_text, title_text))

In [None]:
# Store both raw embedding matrices under the model-specific directory.
for matrix, npy_path in (
    (pyq_text_embeddings, f'../data/curated/{model_save_name}/pyq_text_embeddings.npy'),
    (title_embeddings, f'../data/curated/{model_save_name}/title_embeddings.npy'),
):
    to_numpy_and_save(matrix, npy_path)

# Train Test Split

In [None]:
# Split row positions 70 / 15 / 15 into train / validation / test.
# The number of rows is taken from `df` rather than hard-coded, so the split
# keeps covering every row when the dataset grows or shrinks.
n_samples = len(df)
train_id, val_test_id = train_test_split(range(n_samples), test_size=0.3, random_state=SEED)
# Halve the held-out 30% into validation and test.
val_id, test_id = train_test_split(val_test_id, test_size=0.5, random_state=SEED)

In [None]:
# Index the embedding matrices and the label vector with each split's row ids.
split_ids = (train_id, val_id, test_id)
quantile_labels = df['QuantileLabel'].values

train_pyq_text_embeddings, val_pyq_text_embeddings, test_pyq_text_embeddings = (
    pyq_text_embeddings[ids] for ids in split_ids
)
train_title_embeddings, val_title_embeddings, test_title_embeddings = (
    title_embeddings[ids] for ids in split_ids
)
train_labels, val_labels, test_labels = (
    quantile_labels[ids] for ids in split_ids
)

In [None]:
# Display the training post-text embeddings for a visual check.
train_pyq_text_embeddings

# Dimension Reduction

## Function Definition

In [None]:
def get_pca(embeddings, n_components):
    """Return a PCA with *n_components* components fitted on *embeddings*."""
    # `fit` returns the fitted estimator itself.
    return PCA(n_components=n_components).fit(embeddings)

In [None]:
def get_dataframe(embeddings: np.array, feature_type: str) -> pd.DataFrame:
    """Wrap a 2-D array in a DataFrame with columns ``<feature_type>_<i>``."""
    n_features = embeddings.shape[1]
    column_labels = []
    for position in range(n_features):
        column_labels.append(f'{feature_type}_{position}')
    return pd.DataFrame(embeddings, columns=column_labels)

## Transform

In [None]:
# Fit one 32-component PCA per text field on the training rows only, then
# project all three splits with that fitted model.
n_pca_components = 32
pyq_text_pca = get_pca(train_pyq_text_embeddings, n_pca_components)
title_pca = get_pca(train_title_embeddings, n_pca_components)

train_pyq_text_embeddings_pca, val_pyq_text_embeddings_pca, test_pyq_text_embeddings_pca = (
    pyq_text_pca.transform(split)
    for split in (train_pyq_text_embeddings, val_pyq_text_embeddings, test_pyq_text_embeddings)
)
train_title_embeddings_pca, val_title_embeddings_pca, test_title_embeddings_pca = (
    title_pca.transform(split)
    for split in (train_title_embeddings, val_title_embeddings, test_title_embeddings)
)

In [None]:
# Wrap every reduced matrix in a DataFrame with prefixed column names.
train_pyq_text_embeddings_pca_df, val_pyq_text_embeddings_pca_df, test_pyq_text_embeddings_pca_df = (
    get_dataframe(reduced, 'pyq_text')
    for reduced in (
        train_pyq_text_embeddings_pca,
        val_pyq_text_embeddings_pca,
        test_pyq_text_embeddings_pca,
    )
)
train_title_embeddings_pca_df, val_title_embeddings_pca_df, test_title_embeddings_pca_df = (
    get_dataframe(reduced, 'title')
    for reduced in (
        train_title_embeddings_pca,
        val_title_embeddings_pca,
        test_title_embeddings_pca,
    )
)

In [None]:
# Display the PCA-reduced training post-text features for a visual check.
train_pyq_text_embeddings_pca_df

In [None]:
# Datasets using BOTH feature groups: post-text components next to title
# components, with the quantile label as the last column.
train_dataset_both = pd.concat(
    [train_pyq_text_embeddings_pca_df, train_title_embeddings_pca_df], axis=1
).assign(label=train_labels)
val_dataset_both = pd.concat(
    [val_pyq_text_embeddings_pca_df, val_title_embeddings_pca_df], axis=1
).assign(label=val_labels)
test_dataset_both = pd.concat(
    [test_pyq_text_embeddings_pca_df, test_title_embeddings_pca_df], axis=1
).assign(label=test_labels)

for split_frame, csv_path in (
    (train_dataset_both, f'../data/curated/{model_save_name}/train_dataset_title_pyq.csv'),
    (val_dataset_both, f'../data/curated/{model_save_name}/val_dataset_title_pyq.csv'),
    (test_dataset_both, f'../data/curated/{model_save_name}/test_dataset_title_pyq.csv'),
):
    split_frame.to_csv(csv_path, index=False)

# Keep the fitted post-text PCA so the same projection can be reused later.
with open(f'../data/curated/{model_save_name}/pca_pyq_text.pkl', 'wb') as pca_file:
    pickle.dump(pyq_text_pca, pca_file)

In [None]:
# Title-only datasets: the title components plus the quantile label.
# `assign` returns a new frame, so the source PCA frames are left unchanged.
train_dataset_title = train_title_embeddings_pca_df.assign(label=train_labels)
val_dataset_title = val_title_embeddings_pca_df.assign(label=val_labels)
test_dataset_title = test_title_embeddings_pca_df.assign(label=test_labels)

for split_frame, csv_path in (
    (train_dataset_title, f'../data/curated/{model_save_name}/train_dataset_title.csv'),
    (val_dataset_title, f'../data/curated/{model_save_name}/val_dataset_title.csv'),
    (test_dataset_title, f'../data/curated/{model_save_name}/test_dataset_title.csv'),
):
    split_frame.to_csv(csv_path, index=False)

# Keep the fitted title PCA so the same projection can be reused later.
with open(f'../data/curated/{model_save_name}/pca_title.pkl', 'wb') as pca_file:
    pickle.dump(title_pca, pca_file)