# Initialisation

## Packages

In [1]:
import pandas as pd
import numpy as np
import emoji
import os
from tqdm import tqdm
import pickle

from transformers import AutoTokenizer, AutoModel
from transformers import LlamaTokenizer, LlamaModel
import torch

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 19260817

# Data Wrangling

## Data Import

In [3]:
# read raw data
df1 = pd.read_excel("../data/raw/小助手数据采集.xlsx", sheet_name="Sheet1")
df2 = pd.read_excel("../data/raw/小助手数据采集.xlsx", sheet_name="Sheet2")
df3 = pd.read_excel("../data/raw/小助手数据采集.xlsx", sheet_name="Sheet3")

## Function Definition

In [4]:
def remove_emoji(text):
    if not isinstance(text, str):
        return text
    return emoji.replace_emoji(text, replace='')

## Preprocessing

In [5]:
# Formatting and renaming columns
df1.drop(0,inplace=True)
df2.drop(0,inplace=True)
df3.drop(0,inplace=True)

# renmae for concat
df1 = df1.rename(columns={"2021.3-2022.3": "Time"})
df2 = df2.rename(columns={"2022.4-2023.3": "Time"})
df3 = df3.rename(columns={"2023.4-2024.3": "Time"})

## Aggregation

In [6]:
# Aggregate
concated_df = pd.concat([df1, df2, df3], ignore_index=True)
concated_df

Unnamed: 0,Time,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,2021-03-01 22:55:00,2021semester1迎新会即将到来啦～想认识新朋友，了解学生会，一起云逛校园吗？迎新会...,【CSSA迎新会】2021全新启航，遇见每一个最“牛”的你！,1570,
1,2021-03-03 15:00:00,想了解更多CSSA活动和福利，加入CSSA大家庭吗？3月4日墨尔本时间12-2点，我们在Ho...,【CSSA 3.4号线上Oweek】2021第一场大爬梯：点击解锁你的大学生活！,711,
2,2021-03-04 12:00:00,拿CSSA会员小黑卡就能免费拿一杯酸奶⁉️这个羊毛不薅，更待何时。CSSA联合一杯酸牛奶为了...,【CSSA折扣商家】小黑卡福利，出示小黑卡酸奶免费领！,807,1
3,2021-03-14 13:00:00,CSSA福利招聘宣讲会👀 👀 👀，第一期我们邀请了斩获大厂offer 的Janet学姐，来为...,【CSSA福利宣讲会】福利招聘宣讲会开始报名啦！求职萌新速戳！,793,2
4,2021-03-16 13:00:00,【CSSA推荐】新东方🇬🇧英国研究生申请🙋\n👉澳洲本科学历如何申请英国研究生？\n👉申请剑...,【CSSA推荐】疫情下的澳大利亚&英国研究生申请 “新攻略”,689,1
...,...,...,...,...,...
231,2024-02-19 20:00:00,急急急⚠️让我看看是哪个墨大新生还没找到靠谱组织❓2.21号11:00-14:30的Owee...,【CSSA活动预告】 - 新生来报道！24年oweek火热来袭！,823,
232,2024-02-24 22:08:00,👋各位小伙伴😊快来参加✨3月1日17:00-19:00✨墨尔本大学中国学生会举办的迎新活动 ...,【CSSA迎新会】 跃龙门，向未来,1280,
233,2024-03-03 21:41:00,"🔜3月15号学签快到期，还全然不知？\n🙈忘记续签, 一不小心成【黑民】\n‼【续签】常见Q...",【CSSA推荐】签证提醒 | 马上将有一大波人签证到期，高频问题Q&A！,559,
234,2024-03-11 19:16:00,👏👏大家期待已久的3月折扣商家来喽！！开学季来啦，同学们是否都准备好了嘛！小伙伴们的生活用品...,【CSSA会员福利】全新人气商家入驻！3月折扣商家来啦～,348,


## Formatting

In [7]:
# rename columns of concated_df
renamed_df = concated_df.rename(columns={"Unnamed: 1": "PYQ_Text", "Unnamed: 2": "Title", "Unnamed: 3": "Views", "Unnamed: 4": "reposted"})
renamed_df

Unnamed: 0,Time,PYQ_Text,Title,Views,reposted
0,2021-03-01 22:55:00,2021semester1迎新会即将到来啦～想认识新朋友，了解学生会，一起云逛校园吗？迎新会...,【CSSA迎新会】2021全新启航，遇见每一个最“牛”的你！,1570,
1,2021-03-03 15:00:00,想了解更多CSSA活动和福利，加入CSSA大家庭吗？3月4日墨尔本时间12-2点，我们在Ho...,【CSSA 3.4号线上Oweek】2021第一场大爬梯：点击解锁你的大学生活！,711,
2,2021-03-04 12:00:00,拿CSSA会员小黑卡就能免费拿一杯酸奶⁉️这个羊毛不薅，更待何时。CSSA联合一杯酸牛奶为了...,【CSSA折扣商家】小黑卡福利，出示小黑卡酸奶免费领！,807,1
3,2021-03-14 13:00:00,CSSA福利招聘宣讲会👀 👀 👀，第一期我们邀请了斩获大厂offer 的Janet学姐，来为...,【CSSA福利宣讲会】福利招聘宣讲会开始报名啦！求职萌新速戳！,793,2
4,2021-03-16 13:00:00,【CSSA推荐】新东方🇬🇧英国研究生申请🙋\n👉澳洲本科学历如何申请英国研究生？\n👉申请剑...,【CSSA推荐】疫情下的澳大利亚&英国研究生申请 “新攻略”,689,1
...,...,...,...,...,...
231,2024-02-19 20:00:00,急急急⚠️让我看看是哪个墨大新生还没找到靠谱组织❓2.21号11:00-14:30的Owee...,【CSSA活动预告】 - 新生来报道！24年oweek火热来袭！,823,
232,2024-02-24 22:08:00,👋各位小伙伴😊快来参加✨3月1日17:00-19:00✨墨尔本大学中国学生会举办的迎新活动 ...,【CSSA迎新会】 跃龙门，向未来,1280,
233,2024-03-03 21:41:00,"🔜3月15号学签快到期，还全然不知？\n🙈忘记续签, 一不小心成【黑民】\n‼【续签】常见Q...",【CSSA推荐】签证提醒 | 马上将有一大波人签证到期，高频问题Q&A！,559,
234,2024-03-11 19:16:00,👏👏大家期待已久的3月折扣商家来喽！！开学季来啦，同学们是否都准备好了嘛！小伙伴们的生活用品...,【CSSA会员福利】全新人气商家入驻！3月折扣商家来啦～,348,


In [8]:
# fill null value with 0 in repetition
renamed_df["reposted"] = renamed_df["reposted"].fillna(0)

  renamed_df["reposted"] = renamed_df["reposted"].fillna(0)


In [9]:
# how to handle instances with null text?
renamed_df[renamed_df["PYQ_Text"].isna()]
renamed_df

Unnamed: 0,Time,PYQ_Text,Title,Views,reposted
0,2021-03-01 22:55:00,2021semester1迎新会即将到来啦～想认识新朋友，了解学生会，一起云逛校园吗？迎新会...,【CSSA迎新会】2021全新启航，遇见每一个最“牛”的你！,1570,0
1,2021-03-03 15:00:00,想了解更多CSSA活动和福利，加入CSSA大家庭吗？3月4日墨尔本时间12-2点，我们在Ho...,【CSSA 3.4号线上Oweek】2021第一场大爬梯：点击解锁你的大学生活！,711,0
2,2021-03-04 12:00:00,拿CSSA会员小黑卡就能免费拿一杯酸奶⁉️这个羊毛不薅，更待何时。CSSA联合一杯酸牛奶为了...,【CSSA折扣商家】小黑卡福利，出示小黑卡酸奶免费领！,807,1
3,2021-03-14 13:00:00,CSSA福利招聘宣讲会👀 👀 👀，第一期我们邀请了斩获大厂offer 的Janet学姐，来为...,【CSSA福利宣讲会】福利招聘宣讲会开始报名啦！求职萌新速戳！,793,2
4,2021-03-16 13:00:00,【CSSA推荐】新东方🇬🇧英国研究生申请🙋\n👉澳洲本科学历如何申请英国研究生？\n👉申请剑...,【CSSA推荐】疫情下的澳大利亚&英国研究生申请 “新攻略”,689,1
...,...,...,...,...,...
231,2024-02-19 20:00:00,急急急⚠️让我看看是哪个墨大新生还没找到靠谱组织❓2.21号11:00-14:30的Owee...,【CSSA活动预告】 - 新生来报道！24年oweek火热来袭！,823,0
232,2024-02-24 22:08:00,👋各位小伙伴😊快来参加✨3月1日17:00-19:00✨墨尔本大学中国学生会举办的迎新活动 ...,【CSSA迎新会】 跃龙门，向未来,1280,0
233,2024-03-03 21:41:00,"🔜3月15号学签快到期，还全然不知？\n🙈忘记续签, 一不小心成【黑民】\n‼【续签】常见Q...",【CSSA推荐】签证提醒 | 马上将有一大波人签证到期，高频问题Q&A！,559,0
234,2024-03-11 19:16:00,👏👏大家期待已久的3月折扣商家来喽！！开学季来啦，同学们是否都准备好了嘛！小伙伴们的生活用品...,【CSSA会员福利】全新人气商家入驻！3月折扣商家来啦～,348,0


## Export and read back in

In [10]:
renamed_df.to_csv("../data/curated/cleaned_df.csv", index=False)

In [11]:
df = pd.read_csv("../data/curated/cleaned_df.csv")
df

Unnamed: 0,Time,PYQ_Text,Title,Views,reposted
0,2021-03-01 22:55:00,2021semester1迎新会即将到来啦～想认识新朋友，了解学生会，一起云逛校园吗？迎新会...,【CSSA迎新会】2021全新启航，遇见每一个最“牛”的你！,1570,0
1,2021-03-03 15:00:00,想了解更多CSSA活动和福利，加入CSSA大家庭吗？3月4日墨尔本时间12-2点，我们在Ho...,【CSSA 3.4号线上Oweek】2021第一场大爬梯：点击解锁你的大学生活！,711,0
2,2021-03-04 12:00:00,拿CSSA会员小黑卡就能免费拿一杯酸奶⁉️这个羊毛不薅，更待何时。CSSA联合一杯酸牛奶为了...,【CSSA折扣商家】小黑卡福利，出示小黑卡酸奶免费领！,807,1
3,2021-03-14 13:00:00,CSSA福利招聘宣讲会👀 👀 👀，第一期我们邀请了斩获大厂offer 的Janet学姐，来为...,【CSSA福利宣讲会】福利招聘宣讲会开始报名啦！求职萌新速戳！,793,2
4,2021-03-16 13:00:00,【CSSA推荐】新东方🇬🇧英国研究生申请🙋\n👉澳洲本科学历如何申请英国研究生？\n👉申请剑...,【CSSA推荐】疫情下的澳大利亚&英国研究生申请 “新攻略”,689,1
...,...,...,...,...,...
231,2024-02-19 20:00:00,急急急⚠️让我看看是哪个墨大新生还没找到靠谱组织❓2.21号11:00-14:30的Owee...,【CSSA活动预告】 - 新生来报道！24年oweek火热来袭！,823,0
232,2024-02-24 22:08:00,👋各位小伙伴😊快来参加✨3月1日17:00-19:00✨墨尔本大学中国学生会举办的迎新活动 ...,【CSSA迎新会】 跃龙门，向未来,1280,0
233,2024-03-03 21:41:00,"🔜3月15号学签快到期，还全然不知？\n🙈忘记续签, 一不小心成【黑民】\n‼【续签】常见Q...",【CSSA推荐】签证提醒 | 马上将有一大波人签证到期，高频问题Q&A！,559,0
234,2024-03-11 19:16:00,👏👏大家期待已久的3月折扣商家来喽！！开学季来啦，同学们是否都准备好了嘛！小伙伴们的生活用品...,【CSSA会员福利】全新人气商家入驻！3月折扣商家来啦～,348,0


## Feature Engineering

In [12]:
# delete drop features
df.drop(columns='Time', inplace=True)
df

Unnamed: 0,PYQ_Text,Title,Views,reposted
0,2021semester1迎新会即将到来啦～想认识新朋友，了解学生会，一起云逛校园吗？迎新会...,【CSSA迎新会】2021全新启航，遇见每一个最“牛”的你！,1570,0
1,想了解更多CSSA活动和福利，加入CSSA大家庭吗？3月4日墨尔本时间12-2点，我们在Ho...,【CSSA 3.4号线上Oweek】2021第一场大爬梯：点击解锁你的大学生活！,711,0
2,拿CSSA会员小黑卡就能免费拿一杯酸奶⁉️这个羊毛不薅，更待何时。CSSA联合一杯酸牛奶为了...,【CSSA折扣商家】小黑卡福利，出示小黑卡酸奶免费领！,807,1
3,CSSA福利招聘宣讲会👀 👀 👀，第一期我们邀请了斩获大厂offer 的Janet学姐，来为...,【CSSA福利宣讲会】福利招聘宣讲会开始报名啦！求职萌新速戳！,793,2
4,【CSSA推荐】新东方🇬🇧英国研究生申请🙋\n👉澳洲本科学历如何申请英国研究生？\n👉申请剑...,【CSSA推荐】疫情下的澳大利亚&英国研究生申请 “新攻略”,689,1
...,...,...,...,...
231,急急急⚠️让我看看是哪个墨大新生还没找到靠谱组织❓2.21号11:00-14:30的Owee...,【CSSA活动预告】 - 新生来报道！24年oweek火热来袭！,823,0
232,👋各位小伙伴😊快来参加✨3月1日17:00-19:00✨墨尔本大学中国学生会举办的迎新活动 ...,【CSSA迎新会】 跃龙门，向未来,1280,0
233,"🔜3月15号学签快到期，还全然不知？\n🙈忘记续签, 一不小心成【黑民】\n‼【续签】常见Q...",【CSSA推荐】签证提醒 | 马上将有一大波人签证到期，高频问题Q&A！,559,0
234,👏👏大家期待已久的3月折扣商家来喽！！开学季来啦，同学们是否都准备好了嘛！小伙伴们的生活用品...,【CSSA会员福利】全新人气商家入驻！3月折扣商家来啦～,348,0


In [13]:
# remove emoji from title
df['Title_without_emoji'] = df['Title'].apply(remove_emoji)
df['PYQ_Text_without_emoji'] = df['PYQ_Text'].apply(remove_emoji)
df['reposted'] = df['reposted'].astype(int)
df.drop(columns=['PYQ_Text', 'Title'], inplace=True)
df

Unnamed: 0,Views,reposted,Title_without_emoji,PYQ_Text_without_emoji
0,1570,0,【CSSA迎新会】2021全新启航，遇见每一个最“牛”的你！,2021semester1迎新会即将到来啦～想认识新朋友，了解学生会，一起云逛校园吗？迎新会...
1,711,0,【CSSA 3.4号线上Oweek】2021第一场大爬梯：点击解锁你的大学生活！,想了解更多CSSA活动和福利，加入CSSA大家庭吗？3月4日墨尔本时间12-2点，我们在Ho...
2,807,1,【CSSA折扣商家】小黑卡福利，出示小黑卡酸奶免费领！,拿CSSA会员小黑卡就能免费拿一杯酸奶这个羊毛不薅，更待何时。CSSA联合一杯酸牛奶为了让大...
3,793,2,【CSSA福利宣讲会】福利招聘宣讲会开始报名啦！求职萌新速戳！,CSSA福利招聘宣讲会 ，第一期我们邀请了斩获大厂offer 的Janet学姐，来为我们介...
4,689,1,【CSSA推荐】疫情下的澳大利亚&英国研究生申请 “新攻略”,"【CSSA推荐】新东方英国研究生申请\n澳洲本科学历如何申请英国研究生？\n申请剑桥牛津,G..."
...,...,...,...,...
231,823,0,【CSSA活动预告】 - 新生来报道！24年oweek火热来袭！,急急急让我看看是哪个墨大新生还没找到靠谱组织2.21号11:00-14:30的Oweek活动...
232,1280,0,【CSSA迎新会】 跃龙门，向未来,各位小伙伴快来参加3月1日17:00-19:00墨尔本大学中国学生会举办的迎新活动 是不是想...
233,559,0,【CSSA推荐】签证提醒 | 马上将有一大波人签证到期，高频问题Q&A！,"3月15号学签快到期，还全然不知？\n忘记续签, 一不小心成【黑民】\n【续签】常见Q&A大..."
234,348,0,【CSSA会员福利】全新人气商家入驻！3月折扣商家来啦～,大家期待已久的3月折扣商家来喽！！开学季来啦，同学们是否都准备好了嘛！小伙伴们的生活用品有置...


In [14]:
# drop '\n' and '\t' in PYQ_Text_without_emoji and Small Title
df['PYQ_Text_without_emoji'] = df['PYQ_Text_without_emoji'].str.replace('\n', ' ')
df['PYQ_Text_without_emoji'] = df['PYQ_Text_without_emoji'].str.replace('\t', ' ')
df['Title_without_emoji'] = df['Title_without_emoji'].str.replace('\n', ' ')
df['Title_without_emoji'] = df['Title_without_emoji'].str.replace('\t', ' ')

# fill null PYQ Text with nan
df.fillna('', inplace=True)

df

Unnamed: 0,Views,reposted,Title_without_emoji,PYQ_Text_without_emoji
0,1570,0,【CSSA迎新会】2021全新启航，遇见每一个最“牛”的你！,2021semester1迎新会即将到来啦～想认识新朋友，了解学生会，一起云逛校园吗？迎新会...
1,711,0,【CSSA 3.4号线上Oweek】2021第一场大爬梯：点击解锁你的大学生活！,想了解更多CSSA活动和福利，加入CSSA大家庭吗？3月4日墨尔本时间12-2点，我们在Ho...
2,807,1,【CSSA折扣商家】小黑卡福利，出示小黑卡酸奶免费领！,拿CSSA会员小黑卡就能免费拿一杯酸奶这个羊毛不薅，更待何时。CSSA联合一杯酸牛奶为了让大...
3,793,2,【CSSA福利宣讲会】福利招聘宣讲会开始报名啦！求职萌新速戳！,CSSA福利招聘宣讲会 ，第一期我们邀请了斩获大厂offer 的Janet学姐，来为我们介...
4,689,1,【CSSA推荐】疫情下的澳大利亚&英国研究生申请 “新攻略”,"【CSSA推荐】新东方英国研究生申请 澳洲本科学历如何申请英国研究生？ 申请剑桥牛津,G5院..."
...,...,...,...,...
231,823,0,【CSSA活动预告】 - 新生来报道！24年oweek火热来袭！,急急急让我看看是哪个墨大新生还没找到靠谱组织2.21号11:00-14:30的Oweek活动...
232,1280,0,【CSSA迎新会】 跃龙门，向未来,各位小伙伴快来参加3月1日17:00-19:00墨尔本大学中国学生会举办的迎新活动 是不是想...
233,559,0,【CSSA推荐】签证提醒 | 马上将有一大波人签证到期，高频问题Q&A！,"3月15号学签快到期，还全然不知？ 忘记续签, 一不小心成【黑民】 【续签】常见Q&A大汇总..."
234,348,0,【CSSA会员福利】全新人气商家入驻！3月折扣商家来啦～,大家期待已久的3月折扣商家来喽！！开学季来啦，同学们是否都准备好了嘛！小伙伴们的生活用品有置...


### Train Test Split

In [15]:
# Train val test split based on IDs, because we want to use this id for different blocks of data (i.e. embeddings, labels)
    # 70% train, 15% val, 15% test
train_id, val_test_id = train_test_split(range(236), test_size=0.3, random_state=SEED)
val_id, test_id = train_test_split(val_test_id, test_size=0.5, random_state=SEED)

### Get labels

In [16]:
# get the 'view' for each split to get labels
train_views = df.iloc[train_id]['Views']
val_views = df.iloc[val_id]['Views']
test_views = df.iloc[test_id]['Views']

In [17]:
def get_oos_labels(sorted_train_labels_df: pd.DataFrame, oos_views: list) -> list:

    """ Transforms views into ranks based on the training set views and ranking """

    oos_label_list = []
    for oos_view in oos_views:

        for i in range(len(sorted_train_labels_df)):
            if sorted_train_labels_df.iloc[i]['views'] > oos_view:
                if oos_view == sorted_train_labels_df.iloc[i-1]['views']:
                    oos_label_list.append(sorted_train_labels_df.iloc[i-1]['label'])
                else:
                    oos_label_list.append((sorted_train_labels_df.iloc[i]['label'] + sorted_train_labels_df.iloc[i-1]['label'])/2)
                break

    return oos_label_list

In [18]:
# get label type 2 which is raw rank value from 0 to 1
train_labels_df = pd.DataFrame({'label': train_views.rank(pct=True), 'views': train_views}, index=train_id)
train_labels_df

sorted_train_labels_df = train_labels_df.sort_values(by='label')

val_labels_df = pd.DataFrame({'label': get_oos_labels(sorted_train_labels_df, val_views), 'views': val_views}, index=val_id)
test_labels_df = pd.DataFrame({'label': get_oos_labels(sorted_train_labels_df, test_views), 'views': test_views}, index=test_id)

train_labels = train_labels_df['label']
val_labels = val_labels_df['label']
test_labels = test_labels_df['label']

## Export

In [19]:
df.to_csv('../data/curated/df_engineered.csv', index=False)

train_labels_df.to_csv('../data/curated/train_labels.csv')
val_labels_df.to_csv('../data/curated/val_labels.csv')
test_labels_df.to_csv('../data/curated/test_labels.csv')

# Get embeddings

## Import Model
built for huggingface

In [20]:
MODEL = "google-bert/bert-base-chinese" # change to use different huggingface model

In [21]:
model_save_name = MODEL.replace("/", "_")

os.makedirs(f"../data/curated/{model_save_name}", exist_ok=True)

In [22]:
token = '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' #TODO: use cssa account to get new permanant token

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_auth_token=token)

# Load pre-trained LLaMA model and tokenizer
model = LlamaModel.from_pretrained(MODEL)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



You are using a model of type bert to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.
Some weights of LlamaModel were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v_proj.weight', 'layers.10.input_layernorm.weight', 'layers.10.mlp.d

LlamaModel(
  (embed_tokens): Embedding(21128, 768, padding_idx=0)
  (layers): ModuleList(
    (0-11): 12 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=False)
        (o_proj): Linear(in_features=768, out_features=768, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=768, out_features=3072, bias=False)
        (up_proj): Linear(in_features=768, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=768, bias=False)
        (act_fn): GELUActivation()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)

## Define Functions

In [23]:
def get_embeddings(text_list: list) -> list:
    """ Converts a list of texts into embeddings using the LLaMA model """

    all_embeddings = []

    for text in tqdm(text_list):

        inputs = tokenizer(text, return_tensors='pt')
        inputs = {key: val.to(device) for key, val in inputs.items() if key != 'token_type_ids'}

        with torch.no_grad():
            outputs = model(**inputs)

        embeddings = outputs.last_hidden_state.cpu().numpy()
        all_embeddings.append(embeddings.mean(axis=1))

    return np.array(all_embeddings).squeeze()

In [24]:
def to_numpy_and_save(embeddings: list, filename: str):
    np.save(filename, embeddings)

## Get Embeddings

In [25]:
pyq_text = df['PYQ_Text_without_emoji'].values
title_text = df['Title_without_emoji'].values

In [26]:
pyq_text_embeddings = get_embeddings(pyq_text)
title_embeddings = get_embeddings(title_text)

  0%|          | 0/236 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


100%|██████████| 236/236 [00:35<00:00,  6.58it/s]
100%|██████████| 236/236 [00:25<00:00,  9.14it/s]


In [27]:
to_numpy_and_save(pyq_text_embeddings, f'../data/curated/{model_save_name}/pyq_text_embeddings.npy')
to_numpy_and_save(title_embeddings, f'../data/curated/{model_save_name}/title_embeddings.npy')

In [28]:
# split data into train, val, test
train_pyq_text_embeddings = pyq_text_embeddings[train_id]
train_title_embeddings = title_embeddings[train_id]
train_reposts = df.iloc[train_id]['reposted']

val_pyq_text_embeddings = pyq_text_embeddings[val_id]
val_title_embeddings = title_embeddings[val_id]
val_reposts = df.iloc[val_id]['reposted']

test_pyq_text_embeddings = pyq_text_embeddings[test_id]
test_title_embeddings = title_embeddings[test_id]
test_reposts = df.iloc[test_id]['reposted']

In [29]:
train_pyq_text_embeddings

array([[-1.0524526 , -0.46769485, -1.2005826 , ...,  0.7074122 ,
         0.64715666, -1.0549587 ],
       [-0.49512172,  0.13232645, -1.3600843 , ..., -0.5882755 ,
        -0.1787429 ,  0.09968549],
       [ 0.31487522, -0.22210331, -1.4995486 , ..., -0.68651456,
         0.03464007, -0.22756529],
       ...,
       [ 0.08745595,  0.29113987, -0.34062827, ...,  0.8530635 ,
         0.8422498 , -0.7430751 ],
       [ 0.20482461, -0.20028079, -1.2367352 , ...,  0.00574137,
         0.13669333, -0.1932376 ],
       [ 0.1987717 , -0.8489705 , -0.1770145 , ...,  2.0167353 ,
         0.3026738 , -1.2537236 ]], dtype=float32)

# Dimension Reduction

## Function Definition

In [30]:
def get_pca(embeddings: pd.DataFrame, n_components: int) -> PCA:
    """ Fit PCA and return """
    pca = PCA(n_components=n_components)
    pca.fit(embeddings)
    return pca

In [31]:
def get_dataframe(embeddings: np.array, feature_type: str, index: pd.Series) -> pd.DataFrame:
    """ Put embeddingsinto a dataframe and retain original index """
    return pd.DataFrame(embeddings, columns=[f'{feature_type}_{i}' for i in range(embeddings.shape[1])], index=index)

## Transform

In [32]:
# PCA Transform
pyq_text_pca = get_pca(train_pyq_text_embeddings, 32)
title_pca = get_pca(train_title_embeddings, 32)

train_pyq_text_embeddings_pca = pyq_text_pca.transform(train_pyq_text_embeddings)
train_title_embeddings_pca = title_pca.transform(train_title_embeddings)

val_pyq_text_embeddings_pca = pyq_text_pca.transform(val_pyq_text_embeddings)
val_title_embeddings_pca = title_pca.transform(val_title_embeddings)

test_pyq_text_embeddings_pca = pyq_text_pca.transform(test_pyq_text_embeddings)
test_title_embeddings_pca = title_pca.transform(test_title_embeddings)

In [33]:
# turn PCA transformed embeddings into dataframe
train_pyq_text_embeddings_pca_df = get_dataframe(train_pyq_text_embeddings_pca, 'pyq_text', train_id)
train_title_embeddings_pca_df = get_dataframe(train_title_embeddings_pca, 'title', train_id)

val_pyq_text_embeddings_pca_df = get_dataframe(val_pyq_text_embeddings_pca, 'pyq_text', val_id)
val_title_embeddings_pca_df = get_dataframe(val_title_embeddings_pca, 'title', val_id)

test_pyq_text_embeddings_pca_df = get_dataframe(test_pyq_text_embeddings_pca, 'pyq_text', test_id)
test_title_embeddings_pca_df = get_dataframe(test_title_embeddings_pca, 'title', test_id)

# Make Final Dataset and Output

In [35]:
# make final dataframe for model building, and export (both title and pyq text)
train_dataset_both = pd.concat([train_pyq_text_embeddings_pca_df, train_title_embeddings_pca_df, train_reposts], axis=1)
train_dataset_both['label'] = train_labels

val_dataset_both = pd.concat([val_pyq_text_embeddings_pca_df, val_title_embeddings_pca_df, val_reposts], axis=1)
val_dataset_both['label'] = val_labels

test_dataset_both = pd.concat([test_pyq_text_embeddings_pca_df, test_title_embeddings_pca_df, test_reposts], axis=1)
test_dataset_both['label'] = test_labels

train_dataset_both.to_csv(f'../data/curated/{model_save_name}/train_dataset_title_pyq.csv', index=False)
val_dataset_both.to_csv(f'../data/curated/{model_save_name}/val_dataset_title_pyq.csv', index=False)
test_dataset_both.to_csv(f'../data/curated/{model_save_name}/test_dataset_title_pyq.csv', index=False)

with open(f'../data/curated/{model_save_name}/pca_pyq_text.pkl', 'wb') as f:
    pickle.dump(pyq_text_pca, f)

In [36]:
# make final dataframe for model building, and export (only title)
train_dataset_title = pd.concat([train_title_embeddings_pca_df, train_reposts], axis=1)
train_dataset_title['label'] = train_labels

val_dataset_title = pd.concat([val_title_embeddings_pca_df, val_reposts], axis=1)
val_dataset_title['label'] = val_labels

test_dataset_title = pd.concat([test_title_embeddings_pca_df, test_reposts], axis=1)
test_dataset_title['label'] = test_labels

train_dataset_title.to_csv(f'../data/curated/{model_save_name}/train_dataset_title.csv', index=False)
val_dataset_title.to_csv(f'../data/curated/{model_save_name}/val_dataset_title.csv', index=False)
test_dataset_title.to_csv(f'../data/curated/{model_save_name}/test_dataset_title.csv', index=False)

with open(f'../data/curated/{model_save_name}/pca_title.pkl', 'wb') as f:
    pickle.dump(title_pca, f)