In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import *

In [2]:
df1 = pd.read_csv('GVCEH Milestone 2 Labelling 1 - RawData.csv', usecols=['text', 'Relevant to Victoria', 'Relevant to Homelessness'])
df1 = df1.rename(columns={"Relevant to Victoria": "vic", "Relevant to Homelessness": "hl"})
df2 = pd.read_csv('GVCEH Milestone 2 Labelling 2 - RawData.csv', usecols=['text', 'Relevant to Victoria', 'Relevant to Homelessness'])
df2 = df2.rename(columns={"Relevant to Victoria": "vic", "Relevant to Homelessness": "hl"})
df1.head()

Unnamed: 0,vic,hl,text
0,Yes,No,@AnnaGreenwoodL1 @saanich Dawson Heights Housi...
1,No,No,It's Election Day and the polls are now open u...
2,No,No,Sidney Bulwer Michaelia Roger #ÂΩ©Á•® Bblythe Camp...
3,No,No,Me telling my parents I‚Äôm gonna spit on this o...
4,No,No,WRD Director Joy Langford shared water conserv...


In [3]:
df2.head()

Unnamed: 0,vic,hl,text
0,No,No,Inauguration of the new building for the Class...
1,No,No,#Dubai Destinations: Here is the easiest and m...
2,Yes,No,@fireladdyguy @otohp @bcliberals MY @saanich I...
3,Yes,No,@saanich @saanich if you can't collect garbage...
4,No,No,Pleased so many sensible and helpful recommend...


In [4]:
print(df1.shape)
df1.dropna(inplace=True)
df1 = df1[np.logical_and(df1.vic.isin(['Yes', 'No']), df1.hl.isin(['Yes', 'No']))]
df1.shape

(2116, 3)


(788, 3)

In [5]:
print(df2.shape)
df2.dropna(inplace=True)
df2 = df2[np.logical_and(df2.vic.isin(['Yes', 'No']), df2.hl.isin(['Yes', 'No']))]
df2.shape

(2688, 3)


(1219, 3)

In [6]:
df = pd.concat((df1, df2))
df.head()

Unnamed: 0,vic,hl,text
0,Yes,No,@AnnaGreenwoodL1 @saanich Dawson Heights Housi...
1,No,No,It's Election Day and the polls are now open u...
2,No,No,Sidney Bulwer Michaelia Roger #ÂΩ©Á•® Bblythe Camp...
3,No,No,Me telling my parents I‚Äôm gonna spit on this o...
4,No,No,WRD Director Joy Langford shared water conserv...


In [7]:
df.shape

(2007, 3)

In [8]:
df.vic.value_counts(), df.hl.value_counts()

(No     1595
 Yes     412
 Name: vic, dtype: int64,
 No     1808
 Yes     199
 Name: hl, dtype: int64)

In [9]:
label_map = dict(Yes=1, No=0)
label_map

{'Yes': 1, 'No': 0}

In [10]:
df.vic = df.vic.map(label_map)
df.hl = df.hl.map(label_map)

In [11]:
def clean_text(text):
  new_text = []
  for t in text.split(" "):
      # t = '@user' if t.startswith('@') and len(t) > 1 else t
      t = 'http' if t.startswith('http') else t
      new_text.append(t)
  return " ".join(new_text)

df.text = df.text.apply(clean_text)
df.text.iloc[:5]

0    @AnnaGreenwoodL1 @saanich Dawson Heights Housi...
1    It's Election Day and the polls are now open u...
2    Sidney Bulwer Michaelia Roger #ÂΩ©Á•® Bblythe Camp...
3    Me telling my parents I‚Äôm gonna spit on this o...
4    WRD Director Joy Langford shared water conserv...
Name: text, dtype: object

In [12]:
df = df.drop_duplicates()

In [13]:
df['label'] = np.logical_and(df.vic, df.hl)
df.label.value_counts()

False    1629
True      160
Name: label, dtype: int64

In [14]:
df = pd.concat((df[df.label == True], df[df.label == False].iloc[:300]))

In [15]:
df.hl.value_counts(), df.vic.value_counts()

(0    295
 1    165
 Name: hl, dtype: int64,
 0    273
 1    187
 Name: vic, dtype: int64)

In [16]:
from datasets import Dataset
dataset = Dataset.from_pandas(df
                              # , preserve_index=False #SF: Added this .. no luck
                             )

In [17]:
dataset

Dataset({
    features: ['vic', 'hl', 'text', 'label', '__index_level_0__'],
    num_rows: 460
})

In [18]:
dataset = dataset.class_encode_column('label')

Stringifying the column:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
dataset.features

{'vic': Value(dtype='int64', id=None),
 'hl': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['False', 'True'], id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [20]:
ds = dataset.train_test_split(test_size=0.25, 
                              shuffle=True,
                              stratify_by_column='label',
                              seed=42)

In [21]:
ds['train'][0]

{'vic': 0,
 'hl': 0,
 'text': '@wahlstedt007 Goodmorning Sidney,  I\'m so glad you got to see him. Hopefully it won\'t be to much longer and he can go home. Tell him your twitter friends say "hello" ü§ó',
 'label': 0,
 '__index_level_0__': 519}

In [22]:
import torch 

In [23]:
torch. __version__

'1.13.1'

In [24]:
torch.cuda.is_available()

False

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
# ds = ds.with_format("torch", device=device)
# ds['train'][0]

device(type='cpu')

In [26]:
from setfit import SetFitModel

model_id = "cambridgeltl/tweet-roberta-base-embeddings-v1" # "sentence-transformers/paraphrase-mpnet-base-v2"  # 'cardiffnlp/twitter-roberta-base-sep2022'
model = SetFitModel.from_pretrained(model_id,
                                    use_differentiable_head=True, 
                                    head_params={"out_features": 1},
                                    # multi_target_strategy="multi-output",
                                    )

No sentence-transformers model found with name /Users/sheilaflood/.cache/torch/sentence_transformers/cambridgeltl_tweet-roberta-base-embeddings-v1. Creating a new one with MEAN pooling.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Change `out_features` from 1 to 2 since we use `CrossEntropyLoss` for binary classification.


In [27]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitTrainer

trainer = SetFitTrainer(
    model=model,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    loss_class=CosineSimilarityLoss,
    metric='f1',
    batch_size=64,
    num_epochs=50,
    num_iterations=20,
    use_amp=True,
    # unique_pairs=True, # experiemental
    column_mapping={"text": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
)
trainer.model.model_body.max_seq_length = 64

In [28]:
# trainer.freeze() # Freeze the head
# trainer.train() # Train only the body

# Unfreeze the head and freeze the body -> head-only training
# trainer.unfreeze(keep_body_frozen=True)
# or
# Unfreeze the head and unfreeze the body -> end-to-end training
trainer.unfreeze(keep_body_frozen=False)
trainer.train(
    num_epochs=2, # The number of epochs to train the head or the whole model (body and head)
    max_length=64,
    batch_size=64,
    body_learning_rate=1e-5, # The body's learning rate
    learning_rate=1e-2, # The head's learning rate
    l2_weight=0.0, # Weight decay on **both** the body and head. If `None`, will use 0.01.
)

Applying column mapping to training dataset


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
metrics = trainer.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.9268292682926829}

In [30]:
test_x = np.array(ds['test']['text'])
test_y = np.array(ds['test']['label'])

In [31]:
preds = model(test_x).cpu().numpy()

In [32]:
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96        75
           1       0.90      0.95      0.93        40

    accuracy                           0.95       115
   macro avg       0.94      0.95      0.94       115
weighted avg       0.95      0.95      0.95       115



In [33]:
test_x[np.logical_and(preds == 1, test_y == 0)] # false positives

array(['@fairfield_iowa @snowmanandlila what is your problem. people who are unhoused don‚Äôt deserve dignity and communication in your eyes?',
       '@R35YYj @saanich From BC Local Govt:\nhttps://t.co/6ycNodJTEn\n\nStaff briefings, to further elected officials understanding of an issue, that do not constitute a material part of the decision-making process, would not typically be considered a municipal council or regional district board meeting.',
       '@RTodKelly An ambitious family unit in Tropical Canada  , I am poor and disabled but this kind of  world is just a few kms away.    The Gorge when I came for university could not even be swum in. It got cleaned up and in the last step of is as good  a specific type of oyster is back. http',
       '@TalktoARYZE @Magnatiles @LeonPlett I see 33 affordable rental units right there... breath life into this, and build it on Oak Bay Ave. Dibs 23rd floor micro loft.\n\nYou could call it, The Needle... as an homage to repairing the tweed tha

In [34]:
test_x[np.logical_and(preds == 0, test_y == 1)] # false negatives

array(['@jorymicah US Government has always been this way.\nLike in Canada. Schools create economic slaves. Put people in categories they create. Including criminalized people. Racism and poor bashing leads in the ugly, currently making the enemy people who use drugs and struggle with mental health.',
       'RT @VicBuilders: Just sent a reno project to VRBA builders via our popular Expression of Interest http Try it for your new home or renovation! #Saanich #OakBay #Langford #CSaan #ViewRoyal #Colwood #Sooke #NSaan #Esquimalt #Metchosin #HighlandsBC #VictoriaBC #‚Ä¶'],
      dtype='<U571')

In [36]:
PATH = '/Users/sheilaflood/SWB-GVCEH/models/relevance_model/rel_model.csv'
torch.save(trainer, PATH)

In [37]:
the_model = torch.load(PATH)

In [38]:
metrics = the_model.evaluate()
metrics

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.9268292682926829}

In [50]:
from huggingface_hub import notebook_login 
notebook_login()

Login successful
Your token has been saved to /Users/sheilaflood/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [55]:
# Save the model to hugging face repo 

# Push model to the Hub
trainer.push_to_hub("gvceh-setfit-rel-model2")

Cloning https://huggingface.co/sheesh021/gvceh-setfit-rel-model2 into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/476M [00:00<?, ?B/s]

Upload file model_head.pkl: 100%|##########| 7.31k/7.31k [00:00<?, ?B/s]

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/sheesh021/gvceh-setfit-rel-model2
   5a490de..5d54400  main -> main



'https://huggingface.co/sheesh021/gvceh-setfit-rel-model2/commit/5d54400fc8a624ca882e80cfe8e420d9eda5d7ab'

In [56]:
# Download from Hub and run inference
model_2 = SetFitModel.from_pretrained("gvceh-setfit-rel-model2")

In [58]:
# Download from Hub and run inference
model2 = SetFitModel.from_pretrained("gvceh-setfit-rel-model2")

In [67]:
results = model2(["i loved the spiderman movie!","ok", "BC housing is terrible!", "Homelessness in Seattle", "Where do I go from here ooo ooo ooo"])

In [75]:
for i in test_x:
    print(i)
    print(model2([i]))
    print("-----------------------")

Gonzales said his number one priority is curbing domestic violence. He also wants to continue progressive policies such as cite and release. #SanAntonio #SATX #SanAntonioTX #bexarcounty #2022Elections #Midterms2022

https://t.co/jaFULOfteJ
tensor([0])
-----------------------
@ErikReed Holiday Inn Times Square on 8th Ave is really close to subways. The Westin Times Square is great &amp; close to subways too. Crowne Plaza HY 36 midtown is a nice &amp; typically affordable hotel, as is Double Tree Times Sq South in 8th Ave &amp; Fairfield Inn &amp; Suites Midtown Penn Station
tensor([0])
-----------------------
"the region needs rentals, townhouses, duplexes, triplexes and beyond to help ensure that it does not face another cycle of rapid price increases due to the lack of future inventory." http #Saanich #oakbay #csaan #colwood #viewroyal #langford #sidney #sooke
tensor([1])
-----------------------
Hospitality students from Camosun College dished out hot lunches to people at Victoria's O

tensor([0])
-----------------------
@KathyHochul @breakfastclubam You talk about preventing crime when your bill is the reason violent repeat offenders are walking the street every day. This scum was arrested 25 times and was free to rape at least 3 women we know of. How dare you talk about Zeldin! Defend YOUR record!
https://t.co/Q9IVDkzCUu
tensor([0])
-----------------------
"Adding affordable homes and more rentals is by far the best way to ‚Äúmake Greater Victoria an even greater place to live,‚Äù according to 44% of survey respondents‚Äîby far the most popular priority this year."

https://t.co/OzP8eUtYVG http
tensor([1])
-----------------------
üì¢Join us on November 17, 2022, from 12-1 pm to discuss the living wage and how we can make life more affordable in our region. 

üîóhttps://t.co/tWKaMgzH3n

@CSPC_Victoria @WorkerSol_BC @UmbrellaSociety @BigWheelBurger #uwsvi #unitedway #freeregistration #free http
tensor([1])
-----------------------
The new constituency will consist o

tensor([1])
-----------------------
Join us this weekend at the Annual Crime Prevention Summit where we'll join the North Newstead Association in discussing crime prevention in St. Louis! 
When: Saturday, November 12th from 9am-noon. 
Where: O‚ÄôFallon Park YMCA
FB: http
tensor([0])
-----------------------
We lost a seat in our newly drawn district. Sean Patrick Maloney was a flawed candidate for sure but NY Post &amp; other media along with his opponent Mark Lawler and other Republicans inundated Rockland County with fear of Crime Rockland is the 3rd safest in America. People are dumb
tensor([0])
-----------------------
Greater Victoria's Salvation Army kettle donation program is in crisis after this week's winter storm cancelled in-person donation sites.  http
tensor([1])
-----------------------
Well I guess it‚Äôs Central Park it is I‚Äôm addicted to them burgers üçî
tensor([0])
-----------------------
@DavidCFarebroth @ArmedCarp @MonicaCrowley Now 74 Court Cases lost. Lawyers have

tensor([1])
-----------------------
NORTH TEXAS CAMP &amp; SCRIMMAGE 
‚≠êÔ∏èCAMP‚≠êÔ∏è  ‚≠êÔ∏èCAMP‚≠êÔ∏è  ‚≠êÔ∏èCAMP‚≠êÔ∏è  ‚≠êÔ∏èCAMP‚≠êÔ∏è

ü•éCAMP INFORMATIONü•é
(Saturday Evening) November 12, 2022
North Lakes Park       
6:30pm - 9:45pm

North Lakes Park 
2001 W Windsor Dr
Denton, TX 76201

Camp Registration Link: http http
tensor([0])
-----------------------
@TheBackPackPro1 @SeeSpringVic @HomeForHope @nsun_victoria @HousingLogement @Make_TheShift 247 places of worship in YYJ according to the yellow pages, do any of them open their doors. Are any of the organizations who "help" the homeless voluntary opening their doors tonight.
tensor([1])
-----------------------
BC Housing Minister Ravi Kahlon  announces changed for the Residential Tenancy Branch to improve the dispute resolution process. Budget will be increased 40% and up to 50 new full-time employees will be hired, including   doubling size of the Compliance unit.  #bcpoli
tensor([1])
-----------------------
This is Canada

In [107]:
all_text = df.text
all_results = model2(list(all_text))

In [112]:
all_results = all_results.numpy()

In [117]:
sum(all_results), len(all_results)

(170, 460)

In [118]:
df_kept = df[all_results == 1] # data to keep

In [119]:
df_kept

Unnamed: 0,vic,hl,text,label
111,1,1,the old Hardee‚Äôs on Fairfield is such an eyeso...,True
423,1,1,"RT @VicBuilders: ""Goodmanson &amp; Szpak will ...",True
1279,1,1,SIGNIFICANT funds have been raised for Victori...,True
1282,1,1,Critical updates from Courtroom #201 - Region ...,True
1598,1,1,Housing First works! And here in Victoria HF p...,True
...,...,...,...,...
507,1,0,@R35YYj @saanich From BC Local Govt:\nhttps://...,False
521,0,0,Local Indigenous scholars and activists share ...,False
537,0,1,@ChloeMcMpls hmm how ignorant ur statements on...,False
551,0,0,NEW regional service available!üëç\n\nIf you‚Äôre ...,False


In [120]:
df_kept.label.value_counts()

True     154
False     16
Name: label, dtype: int64

In [121]:
16/(154 + 16)

0.09411764705882353

In [43]:
array_filtered = test_x[preds == 1] # data to keep
df_filtered = pd.DataFrame(array_filtered, columns = ['Text'])
df_filtered

Unnamed: 0,Text
0,"""the region needs rentals, townhouses, duplexe..."
1,Hospitality students from Camosun College dish...
2,@dharmabrat @IslandFamily5 @d_brokenshire @Glo...
3,"""58% of Canadians believe that homelessness is..."
4,We just donated to Greater Victoria Coalition ...
5,Only 16% get that Government pol...
6,@Covid_Stinks Why would someone who lives in S...
7,@nsun_victoria @suziezed I‚Äôd be curious to kno...
8,"For those curious, most important Greater Vict..."
9,"Some 'light' research underway, prompted by a ..."
