# Usecase 11
## Text classification 
Using [Sitfit](https://github.com/huggingface/setfit/tree/main)

## Import Libraries

In [1]:
from setfit import SetFitModel, Trainer, SetFitTrainer, TrainingArguments, sample_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import re
from datasets import Dataset
from sentence_transformers.losses import CosineSimilarityLoss
#! pip show transformers

## Load the Data

In [2]:
df = pd.read_csv('Data/ar_reviews_100k.tsv', delimiter='\t')

In [3]:
df.head(2)

Unnamed: 0,label,text
0,Positive,ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...
1,Positive,أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...


In [4]:
df.shape

(99999, 2)

## Step 1:  Text Preprocessing

In [5]:
# remove Noise
df['text'] = df['text'].map(lambda x: re.sub(r'[\.\?\!\,،.\:\;\"]', '', x))

In [6]:
# Tokenization
df = df[df['label'].isin(['Positive', 'Negative'])]
df['text_length'] = df['text'].map(lambda a: len(a))
df = df[(df['text_length']<200) & (df['text_length']>10)]
df.shape

(41393, 3)

In [7]:
df.head(2)

Unnamed: 0,label,text,text_length
0,Positive,ممتاز نوعا ما النظافة والموقع والتجهيز والشاط...,55
4,Positive,ياسات جلوريا جزء لا يتجزأ من دبي فندق متكامل ...,72


In [8]:
df['label'].value_counts()

label
Positive    21316
Negative    20077
Name: count, dtype: int64

In [9]:
# shaping the data as model needs
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['label', 'text', 'text_length', '__index_level_0__'],
    num_rows: 41393
})

In [10]:
# Simulate the few-shot regime by sampling 8 examples per class
train_dataset = sample_dataset(dataset, label_column="label", num_samples=8)
eval_dataset = sample_dataset(dataset, label_column="label", num_samples=50)
test_dataset = sample_dataset(dataset, label_column="label", num_samples=50)

  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))
  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))
  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))


In [11]:
train_dataset

Dataset({
    features: ['label', 'text', 'text_length', '__index_level_0__'],
    num_rows: 16
})

In [12]:
eval_dataset

Dataset({
    features: ['label', 'text', 'text_length', '__index_level_0__'],
    num_rows: 100
})

In [13]:
test_dataset

Dataset({
    features: ['label', 'text', 'text_length', '__index_level_0__'],
    num_rows: 100
})

## Buliding the Model

In [14]:
# Load a SetFit model from Hub
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [15]:
args = TrainingArguments(
    batch_size=16,
    num_epochs=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy")

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

## Train the Model

In [16]:
# Train and evaluate
trainer.train()

***** Running training *****
  Num unique pairs = 144
  Batch size = 16
  Num epochs = 4
  Total optimization steps = 36


Epoch,Training Loss,Validation Loss,Embedding Loss,Rate
1,No log,No log,0.2601,1.7e-05
2,No log,No log,0.2501,1.1e-05
3,No log,No log,0.2552,6e-06
4,No log,No log,0.2552,0.0


  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

  0%|          | 0/319 [00:00<?, ?it/s]

Loading best SentenceTransformer model from step 18.


In [17]:
model.save_pretrained("model2")

In [18]:
model_t = SetFitModel.from_pretrained("model2")          

## Test the Model

In [19]:
preds = model_t.predict(test_dataset['text'])
print(preds)

['Positive' 'Positive' 'Positive' 'Negative' 'Negative' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Negative' 'Negative'
 'Positive' 'Positive' 'Negative' 'Positive' 'Positive' 'Positive'
 'Negative' 'Positive' 'Positive' 'Positive' 'Negative' 'Negative'
 'Negative' 'Positive' 'Positive' 'Negative' 'Positive' 'Positive'
 'Positive' 'Negative' 'Positive' 'Positive' 'Positive' 'Negative'
 'Positive' 'Negative' 'Positive' 'Negative' 'Positive' 'Negative'
 'Positive' 'Negative' 'Positive' 'Negative' 'Positive' 'Positive'
 'Negative' 'Negative' 'Negative' 'Positive' 'Negative' 'Negative'
 'Negative' 'Positive' 'Positive' 'Negative' 'Negative' 'Negative'
 'Negative' 'Negative' 'Negative' 'Positive' 'Positive' 'Positive'
 'Negative' 'Negative' 'Positive' 'Positive' 'Negative' 'Positive'
 'Negative' 'Positive' 'Positive' 'Negative' 'Positive' 'Negative'
 'Negative' 'Positive' 'Positive' 'Negative' 'Negative' 'Negative'
 'Positive' 'Positive' 'Negative' 'Negative' 'Negative' 'Posit

In [49]:
test_df = test_dataset.to_pandas()
test_df['pred'] = preds

In [50]:
test_df

Unnamed: 0,label,text,text_length,__index_level_0__,pred
0,Positive,استثنائي. قربه من الوحدة مو.,28,30455,Positive
1,Negative,صدمت في منتجع اراك . الخصوصية نوعا ما. الاثاث ...,187,66974,Positive
2,Positive,جيد. الموقع و الهدوء و تنسيق المواعيد. الغساله...,63,30577,Positive
3,Negative,جميلة . باختصار الدنيا ظريفة فعلا,33,88379,Positive
4,Negative,اسوأ فندق . لااااااااا انصح به. كل شي,37,74309,Negative
...,...,...,...,...,...
95,Positive,لابأس في الفندق لمدة يوم . قريب للمطار. توزيع ...,53,13717,Negative
96,Negative,اسوء فندق نزلت فيه فى المدينه . بعيد جدا عن مد...,199,90734,Negative
97,Positive,خدمه ممتازه . خدمه ممتازه. فطور غير مكتمل و نس...,93,795,Positive
98,Positive,لا بأس . اعجبني الباص.. والبوفيه المفتوحوالموا...,131,10995,Negative


## Evaluating the Model 

In [44]:
# our benchmark model
base_model = round(df['label'].value_counts()[1]/df.shape[0]*100, 2)
base_model

  base_model = round(df['label'].value_counts()[1]/df.shape[0]*100, 2)


48.51

In [45]:
metrics = trainer.evaluate(test_dataset)

***** Running evaluation *****


In [51]:
metrics

{'accuracy': 0.76}