In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *

In [2]:
df1 = pd.read_csv('GVCEH Milestone 2 Labelling 1 - RawData.csv', usecols=['text', 'Relevant to Victoria', 'Relevant to Homelessness'])
df1 = df1.rename(columns={"Relevant to Victoria": "vic", "Relevant to Homelessness": "hl"})
df2 = pd.read_csv('GVCEH Milestone 2 Labelling 2 - RawData.csv', usecols=['text', 'Relevant to Victoria', 'Relevant to Homelessness'])
df2 = df2.rename(columns={"Relevant to Victoria": "vic", "Relevant to Homelessness": "hl"})
df1.head()

Unnamed: 0,vic,hl,text
0,Yes,No,@AnnaGreenwoodL1 @saanich Dawson Heights Housi...
1,No,No,It's Election Day and the polls are now open u...
2,No,No,Sidney Bulwer Michaelia Roger #ÂΩ©Á•® Bblythe Camp...
3,No,No,Me telling my parents I‚Äôm gonna spit on this o...
4,No,No,WRD Director Joy Langford shared water conserv...


In [3]:
df2.head()

Unnamed: 0,vic,hl,text
0,No,No,Inauguration of the new building for the Class...
1,No,No,#Dubai Destinations: Here is the easiest and m...
2,Yes,No,@fireladdyguy @otohp @bcliberals MY @saanich I...
3,Yes,No,@saanich @saanich if you can't collect garbage...
4,No,No,Pleased so many sensible and helpful recommend...


In [4]:
print(df1.shape)
df1.dropna(inplace=True)
df1 = df1[np.logical_and(df1.vic.isin(['Yes', 'No']), df1.hl.isin(['Yes', 'No']))]
df1.shape

(2116, 3)


(788, 3)

In [5]:
print(df2.shape)
df2.dropna(inplace=True)
df2 = df2[np.logical_and(df2.vic.isin(['Yes', 'No']), df2.hl.isin(['Yes', 'No']))]
df2.shape

(2688, 3)


(1219, 3)

In [6]:
df = pd.concat((df1, df2))
df.head()

Unnamed: 0,vic,hl,text
0,Yes,No,@AnnaGreenwoodL1 @saanich Dawson Heights Housi...
1,No,No,It's Election Day and the polls are now open u...
2,No,No,Sidney Bulwer Michaelia Roger #ÂΩ©Á•® Bblythe Camp...
3,No,No,Me telling my parents I‚Äôm gonna spit on this o...
4,No,No,WRD Director Joy Langford shared water conserv...


In [7]:
df.shape

(2007, 3)

In [8]:
df.vic.value_counts(), df.hl.value_counts()

(No     1595
 Yes     412
 Name: vic, dtype: int64,
 No     1808
 Yes     199
 Name: hl, dtype: int64)

In [9]:
label_map = dict(Yes=1, No=0)
label_map

{'Yes': 1, 'No': 0}

In [10]:
df.vic = df.vic.map(label_map)
df.hl = df.hl.map(label_map)

In [11]:
def clean_text(text):
  new_text = []
  for t in text.split(" "):
      # t = '@user' if t.startswith('@') and len(t) > 1 else t
      t = 'http' if t.startswith('http') else t
      new_text.append(t)
  return " ".join(new_text)

df.text = df.text.apply(clean_text)
df.text.iloc[:5]

0    @AnnaGreenwoodL1 @saanich Dawson Heights Housi...
1    It's Election Day and the polls are now open u...
2    Sidney Bulwer Michaelia Roger #ÂΩ©Á•® Bblythe Camp...
3    Me telling my parents I‚Äôm gonna spit on this o...
4    WRD Director Joy Langford shared water conserv...
Name: text, dtype: object

In [12]:
df = df.drop_duplicates()

In [13]:
df['label'] = np.logical_and(df.vic, df.hl)
df.label.value_counts()

False    1629
True      160
Name: label, dtype: int64

In [14]:
df = pd.concat((df[df.label == True], df[df.label == False].iloc[:300]))

In [15]:
df.hl.value_counts(), df.vic.value_counts()

(0    295
 1    165
 Name: hl, dtype: int64,
 0    273
 1    187
 Name: vic, dtype: int64)

In [16]:
from datasets import Dataset
dataset = Dataset.from_pandas(df
                              # , preserve_index=False #SF: Added this .. no luck
                             )

In [17]:
dataset

Dataset({
    features: ['vic', 'hl', 'text', 'label', '__index_level_0__'],
    num_rows: 460
})

In [18]:
dataset = dataset.class_encode_column('label')

Stringifying the column:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
dataset.features

{'vic': Value(dtype='int64', id=None),
 'hl': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['False', 'True'], id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [20]:
ds = dataset.train_test_split(test_size=0.25, 
                              shuffle=True,
                              stratify_by_column='label',
                              seed=42)

In [21]:
ds['train'][0]

{'vic': 0,
 'hl': 0,
 'text': '@wahlstedt007 Goodmorning Sidney,  I\'m so glad you got to see him. Hopefully it won\'t be to much longer and he can go home. Tell him your twitter friends say "hello" ü§ó',
 'label': 0,
 '__index_level_0__': 519}

In [22]:
import torch 

In [23]:
torch. __version__

'1.13.1'

In [24]:
torch.cuda.is_available()

False

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
# ds = ds.with_format("torch", device=device)
# ds['train'][0]

device(type='cpu')

In [26]:
from setfit import SetFitModel

model_id = "cambridgeltl/tweet-roberta-base-embeddings-v1" # "sentence-transformers/paraphrase-mpnet-base-v2"  # 'cardiffnlp/twitter-roberta-base-sep2022'
model = SetFitModel.from_pretrained(model_id,
                                    use_differentiable_head=True, 
                                    head_params={"out_features": 1},
                                    # multi_target_strategy="multi-output",
                                    )

No sentence-transformers model found with name /Users/sheilaflood/.cache/torch/sentence_transformers/cambridgeltl_tweet-roberta-base-embeddings-v1. Creating a new one with MEAN pooling.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Change `out_features` from 1 to 2 since we use `CrossEntropyLoss` for binary classification.


In [27]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitTrainer

trainer = SetFitTrainer(
    model=model,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    loss_class=CosineSimilarityLoss,
    metric='f1',
    batch_size=64,
    num_epochs=50,
    num_iterations=20,
    use_amp=True,
    # unique_pairs=True, # experiemental
    column_mapping={"text": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
)
trainer.model.model_body.max_seq_length = 64

In [28]:
# trainer.freeze() # Freeze the head
# trainer.train() # Train only the body

# Unfreeze the head and freeze the body -> head-only training
# trainer.unfreeze(keep_body_frozen=True)
# or
# Unfreeze the head and unfreeze the body -> end-to-end training
trainer.unfreeze(keep_body_frozen=False)
trainer.train(
    num_epochs=2, # The number of epochs to train the head or the whole model (body and head)
    max_length=64,
    batch_size=64,
    body_learning_rate=1e-5, # The body's learning rate
    learning_rate=1e-2, # The head's learning rate
    l2_weight=0.0, # Weight decay on **both** the body and head. If `None`, will use 0.01.
)

Applying column mapping to training dataset


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
metrics = trainer.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.9268292682926829}

In [30]:
test_x = np.array(ds['test']['text'])
test_y = np.array(ds['test']['label'])

In [31]:
preds = model(test_x).cpu().numpy()

In [32]:
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96        75
           1       0.90      0.95      0.93        40

    accuracy                           0.95       115
   macro avg       0.94      0.95      0.94       115
weighted avg       0.95      0.95      0.95       115



In [33]:
test_x[np.logical_and(preds == 1, test_y == 0)] # false positives

array(['@fairfield_iowa @snowmanandlila what is your problem. people who are unhoused don‚Äôt deserve dignity and communication in your eyes?',
       '@R35YYj @saanich From BC Local Govt:\nhttps://t.co/6ycNodJTEn\n\nStaff briefings, to further elected officials understanding of an issue, that do not constitute a material part of the decision-making process, would not typically be considered a municipal council or regional district board meeting.',
       '@RTodKelly An ambitious family unit in Tropical Canada  , I am poor and disabled but this kind of  world is just a few kms away.    The Gorge when I came for university could not even be swum in. It got cleaned up and in the last step of is as good  a specific type of oyster is back. http',
       '@TalktoARYZE @Magnatiles @LeonPlett I see 33 affordable rental units right there... breath life into this, and build it on Oak Bay Ave. Dibs 23rd floor micro loft.\n\nYou could call it, The Needle... as an homage to repairing the tweed tha

In [34]:
test_x[np.logical_and(preds == 0, test_y == 1)] # false negatives

array(['@jorymicah US Government has always been this way.\nLike in Canada. Schools create economic slaves. Put people in categories they create. Including criminalized people. Racism and poor bashing leads in the ugly, currently making the enemy people who use drugs and struggle with mental health.',
       'RT @VicBuilders: Just sent a reno project to VRBA builders via our popular Expression of Interest http Try it for your new home or renovation! #Saanich #OakBay #Langford #CSaan #ViewRoyal #Colwood #Sooke #NSaan #Esquimalt #Metchosin #HighlandsBC #VictoriaBC #‚Ä¶'],
      dtype='<U571')

In [36]:
PATH = '/Users/sheilaflood/SWB-GVCEH/models/relevance_model/rel_model.csv'
torch.save(trainer, PATH)

In [37]:
the_model = torch.load(PATH)

In [38]:
metrics = the_model.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.9268292682926829}

array(['Gonzales said his number one priority is curbing domestic violence. He also wants to continue progressive policies such as cite and release. #SanAntonio #SATX #SanAntonioTX #bexarcounty #2022Elections #Midterms2022\n\nhttps://t.co/jaFULOfteJ',
       '@ErikReed Holiday Inn Times Square on 8th Ave is really close to subways. The Westin Times Square is great &amp; close to subways too. Crowne Plaza HY 36 midtown is a nice &amp; typically affordable hotel, as is Double Tree Times Sq South in 8th Ave &amp; Fairfield Inn &amp; Suites Midtown Penn Station',
       '"the region needs rentals, townhouses, duplexes, triplexes and beyond to help ensure that it does not face another cycle of rapid price increases due to the lack of future inventory." http #Saanich #oakbay #csaan #colwood #viewroyal #langford #sidney #sooke',
       "Hospitality students from Camosun College dished out hot lunches to people at Victoria's Our Place Society on Thursday.  http",
       'Darien vs Ludlowe ~ Hi

In [43]:
array_filtered = test_x[preds == 1] # data to keep
df_filtered = pd.DataFrame(array_filtered, columns = ['Text'])
df_filtered

Unnamed: 0,Text
0,"""the region needs rentals, townhouses, duplexe..."
1,Hospitality students from Camosun College dish...
2,@dharmabrat @IslandFamily5 @d_brokenshire @Glo...
3,"""58% of Canadians believe that homelessness is..."
4,We just donated to Greater Victoria Coalition ...
5,Only 16% get that Government pol...
6,@Covid_Stinks Why would someone who lives in S...
7,@nsun_victoria @suziezed I‚Äôd be curious to kno...
8,"For those curious, most important Greater Vict..."
9,"Some 'light' research underway, prompted by a ..."
