# Usecase 11
## Text classification 
Using [Sitfit](https://github.com/huggingface/setfit/tree/main)

## Import Libraries

In [1]:
from setfit import SetFitModel, Trainer, SetFitTrainer, TrainingArguments, sample_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import re
from datasets import Dataset
from sklearn.metrics import f1_score
from sentence_transformers.losses import CosineSimilarityLoss
#! pip show transformers

## Load the Data

In [2]:
df = pd.read_csv('Data/ar_reviews_100k.tsv', delimiter='\t')

In [3]:
df.head(2)

Unnamed: 0,label,text
0,Positive,ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...
1,Positive,أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...


In [4]:
df.shape

(99999, 2)

## Step 1:  Text Preprocessing

In [4]:
# remove Noise
df['text'] = df['text'].map(lambda x: re.sub(r'[\.\?\!\,،.\:\;\"]', '', x))

In [5]:
df = df[df['label'].isin(['Positive', 'Negative'])]
df.head(2)

Unnamed: 0,label,text
0,Positive,ممتاز نوعا ما النظافة والموقع والتجهيز والشاط...
1,Positive,أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...


In [6]:
df['text_length'] = df['text'].map(lambda a: len(a))
df = df[(df['text_length']<200) & (df['text_length']>10)]
df.shape

(41393, 3)

In [7]:
df.head(2)

Unnamed: 0,label,text,text_length
0,Positive,ممتاز نوعا ما النظافة والموقع والتجهيز والشاط...,55
4,Positive,ياسات جلوريا جزء لا يتجزأ من دبي فندق متكامل ...,72


In [8]:
df['label'].value_counts()

label
Positive    21316
Negative    20077
Name: count, dtype: int64

In [9]:
# shaping the data as model needs
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['label', 'text', 'text_length', '__index_level_0__'],
    num_rows: 41393
})

In [10]:
# Simulate the few-shot regime by sampling 8 examples per class
train_dataset = sample_dataset(dataset, label_column="label", num_samples=8)
eval_dataset = sample_dataset(dataset, label_column="label", num_samples=50)
test_dataset = sample_dataset(dataset, label_column="label", num_samples=50)

  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))
  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))
  df = df.apply(lambda x: x.sample(min(num_samples, len(x)), random_state=seed))


In [11]:
train_dataset

Dataset({
    features: ['label', 'text', 'text_length', '__index_level_0__'],
    num_rows: 16
})

In [13]:
eval_dataset

Dataset({
    features: ['label', 'text', 'text_length', '__index_level_0__'],
    num_rows: 100
})

In [14]:
test_dataset

Dataset({
    features: ['label', 'text', 'text_length', '__index_level_0__'],
    num_rows: 100
})

## Buliding the Model

In [12]:
# Load a SetFit model from Hub
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [16]:
args = TrainingArguments(
    batch_size=16,
    num_epochs=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy")

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

## Train the Model

In [None]:
# Train and evaluate
trainer.train()

***** Running training *****
  Num unique pairs = 144
  Batch size = 16
  Num epochs = 4
  Total optimization steps = 36


Epoch,Training Loss,Validation Loss


  0%|          | 0/319 [00:00<?, ?it/s]

In [17]:
model.save_pretrained("model3")

In [13]:
model_t = SetFitModel.from_pretrained("model2")          

## Test the Model

In [14]:
preds = model_t.predict(test_dataset['text'])
print(preds)

['Positive' 'Positive' 'Positive' 'Negative' 'Negative' 'Positive'
 'Positive' 'Positive' 'Positive' 'Positive' 'Negative' 'Negative'
 'Positive' 'Positive' 'Negative' 'Positive' 'Positive' 'Positive'
 'Negative' 'Positive' 'Positive' 'Positive' 'Negative' 'Negative'
 'Negative' 'Positive' 'Positive' 'Negative' 'Positive' 'Positive'
 'Positive' 'Negative' 'Positive' 'Positive' 'Positive' 'Negative'
 'Positive' 'Negative' 'Positive' 'Negative' 'Positive' 'Negative'
 'Positive' 'Negative' 'Positive' 'Negative' 'Positive' 'Positive'
 'Negative' 'Negative' 'Negative' 'Positive' 'Negative' 'Negative'
 'Negative' 'Positive' 'Positive' 'Negative' 'Negative' 'Negative'
 'Negative' 'Negative' 'Negative' 'Positive' 'Positive' 'Positive'
 'Negative' 'Negative' 'Positive' 'Positive' 'Negative' 'Positive'
 'Negative' 'Positive' 'Positive' 'Negative' 'Positive' 'Negative'
 'Negative' 'Positive' 'Positive' 'Negative' 'Negative' 'Negative'
 'Positive' 'Positive' 'Negative' 'Negative' 'Negative' 'Posit

In [15]:
test_df = test_dataset.to_pandas()
test_df['pred'] = preds

In [16]:
test_df

Unnamed: 0,label,text,text_length,__index_level_0__,pred
0,Positive,جيد بطيئين في تسجيل الدخول للفندق,34,23968,Positive
1,Negative,مطعم عادي جدا وينقصه الدعاية و الطباخين المهرة,48,97259,Positive
2,Positive,رواية رائعة من كاتب شديد التميز والواقعية متفر...,89,8013,Positive
3,Negative,فقط محتواها العلمي رائع ولن أنسى معلومة التصر...,121,66814,Negative
4,Negative,روايه ساذجة ده اكتر توصيف قادر يعبر عن احساسي...,189,96343,Negative
...,...,...,...,...,...
95,Positive,مكان مناسب كل شي ====,22,5180,Positive
96,Negative,ضعيف الإحترافيه صفر في تعامل موظفي الإستقبال ...,103,97510,Negative
97,Positive,لا زلت اعشق الصن داي ب الشوكلت,30,16434,Negative
98,Positive,لا يو جد المسبح مسبح حيث لا يوجد مصبح الموقع...,63,11415,Positive


## Evaluating the Model 

In [18]:
# our benchmark model
base_model = round(df['label'].value_counts()[1]/df.shape[0]*100, 2)
base_model

  base_model = round(df['label'].value_counts()[1]/df.shape[0]*100, 2)


48.5

In [19]:
# Calculate F1 score
f1 = f1_score(test_df['label'], test_df['pred'],  pos_label='Positive')

print("F1 Score:", f1)

F1 Score: 0.6666666666666666
