# **Libraries**

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import pandas as pd
import shutil
import re

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


# **Copy files from the Input directory to the Working directory "Output"**

In [2]:
shutil.copy("/kaggle/input/mawqif-dataset/Mawqif_AllTargets_Train.csv", "/kaggle/working/Mawqif_AllTargets_Train.csv")
shutil.copy("/kaggle/input/mawqif-dataset/StanceEval.py", "/kaggle/working/StanceEval.py")

'/kaggle/working/StanceEval.py'

# **Read dataset**

In [3]:
data = pd.read_csv("/kaggle/working/Mawqif_AllTargets_Train.csv")
data

Unnamed: 0,ID,text,target,stance,stance:confidence,against_reason,favor_reason,none_reason,sarcasm,sarcasm:confidence,sentiment,sentiment:confidence,datetime,Date
0,1,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترن...,Women empowerment,Against,0.5116,A_Explicit,,,No,1.0000,Negative,1.0000,2021-01-16 03:19:19+00:00,16/01/2021
1,3,روح حلل محد يم تطعيم كورونا شف الحرم البارح م...,Covid Vaccine,,0.4003,,,Not clear,Yes,0.5990,Neutral,0.6180,2022-04-28 11:12:56+00:00,28/04/2022
2,4,هذا ما يُعرّف بـ'فوبيا المرأة المُتمكنة' آفة ف...,Women empowerment,Favor,0.8171,,F_Explicit,,Yes,0.8145,Negative,0.8251,2022-04-02 07:45:42+00:00,02/04/2022
3,6,#LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في الم...,Digital Transformation,Favor,1.0000,,F_Explicit,,No,1.0000,Positive,0.7531,2022-02-02 18:24:09+00:00,02/02/2022
4,7,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,Digital Transformation,Favor,0.7559,,F_Explicit,,No,1.0000,Neutral,0.8116,2022-03-27 10:36:04+00:00,27/03/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,4117,يا جماعة الخير انا و اهلي حاشتنا كورونا خلصنا ...,Covid Vaccine,,0.7116,,,Not clear,No,1.0000,Positive,1.0000,2021-04-23 19:02:50+00:00,23/04/2021
3498,4118,للاسف بعض مدراء الدوائر من العقليات القديمة س...,Digital Transformation,Favor,1.0000,,F_Explicit,,No,1.0000,Positive,1.0000,2020-03-25 16:33:35+00:00,25/03/2020
3499,4119,ليس عصر تمكين المرأة . وإنما عصر تدمير الأسرة...,Women empowerment,Against,1.0000,A_Explicit,,,No,1.0000,Negative,1.0000,2020-12-26 03:06:40+00:00,26/12/2020
3500,4120,ياجماعه الخير هل يعقل اليوم جهة خدمية تذهب لا...,Digital Transformation,Favor,1.0000,,F_Explicit,,No,1.0000,Positive,1.0000,2021-08-01 07:11:02+00:00,01/08/2021


# **Normalisation**

In [4]:
import re

def normalize_arabic(text):
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text

def replace_emojis(text):
    emojis = {
        "\U0001F601": "فرح",
        "\U0001F602": "فرح",
        "\U0001F603": "فرح",
        "\U0001F604": "فرح",
        "\U0001F605": "حزن",
        "\U0001F606": "فرح",
        "\U0001F607": "فرح",
        "\U0001F60C": "فرح",
        "\U0001F60D": "حب",
        "\U0001F60E": "حزن",
        "\U0001F611": "حب",
        "\U0001F612": "حزن",
        "\U0001F613": "حزن",
        "\U0001F615": "حزن",
        "\U0001F618": "حب",
        "\U0001F619": "حب",
        "\U0001F61A": "حب",
        "\U0001F61B": "حب",
        "\U0001F61C": "حب",
        "\U0001F61D": "حب",
        "\U0001F61E": "حب", 
        "\U0001F61F": "حب",
        "\U0001F620": "غضب",
        "\U0001F621": "غضب",
        "\U0001F622": "حزن",
        "\U0001F623": "فرح",
        "\U0001F624": "حزن",
        "\U0001F625": "حزن",
        "\U0001F626": "حزن",
        "\U0001F627": "غضب",
        "\U0001F628": "حزن",
        "\U0001F629": "حزن",
        "\U0001F62A": "حزن",
        "\U0001F62B": "حزن",
        "\U0001F62C": "حزن",
        "\U0001F62D": "حزن",
        "\U0001F62E": "حزن",
        "\U0001F62F": "حزن",
        "\U0001F630": "غضب",
        "\U0001F631": "حزن",
        "\U0001F632": "حزن",
        "\U0001F633": "حزن",
        "\U0001F634": "حزن وفرح",
        "\U0001F635": "حزن",
        "\U0001F636": "حزن",
        "\U0001F637": "فرح",
        "\U0001F638": "فرح",
        "\U0001F639": "فرح",
        "\U0001F63A": "فرح",
        "\U0001F63B": "فرح",
        "\U0001F63C": "فرح",
        "\U0001F63D": "فرح",
        "\U0001F63E": "فرح",
        "\U0001F63F": "فرح",
        "\U0001F640": "حزن",
        "\U0001F641": "حزن",
        "\U0001F642": "فرح",
        "\U0001F643": "فرح",
        "\U0001F644": "فرح",
        "\U0001F645": "فرح",
        "\U0001F646": "فرح",
        "\U0001F647": "فرح",
        "\U0001F648": "فرح",
        "\U0001F649": "فرح",
        "\U0001F64A": "فرح",
        "\U0001F64B": "فرح",
        "\U0001F64C": "فرح",
        "\U0001F64D": "فرح",
        "\U0001F64E": "فرح",
        "\U0001F64F": "فرح",
    }
    for emoji, arabic_equivalent in emojis.items():
        text = re.sub(emoji, arabic_equivalent, text)
    return text

# Applies normalization to texts
data['text'] = data['text'].apply(normalize_arabic)

# Replace emojis with their Arabic equivalent
data['text'] = data['text'].apply(replace_emojis)

print(data['text'])


0        عشان يلمع صورته ويعنني تمكين المراه ويصير ترن...
1        روح حلل محد يم تطعيم كورونا شف الحرم البارح م...
2       هذا ما يُعرّف بـ'فوبيا المراه المُتمكنه' افه ف...
3       #LEAP22  مءتمر يجمع اشهر وابرز المءثرين في الم...
4       خصوصيه البيانات وحمايتها في المنظمه مطلب ولكن ...
                              ...                        
3497    يا جماعه الخير انا و اهلي حاشتنا كورونا خلصنا ...
3498     للاسف بعض مدراء الدواءر من العقليات القديمه س...
3499     ليس عصر تمكين المراه . وانما عصر تدمير الاسره...
3500    ياجماعه الخير  هل يعقل اليوم جهه خدميه تذهب لا...
3501     التحول الالكتروني يقضي علي هذه الفوضي ويرتقي ...
Name: text, Length: 3502, dtype: object


# **Split data into training and testing sets (70% train, 30% test)**

In [5]:
textData = data['text'].astype(str)
stanceData = data['stance'].astype(str)

# Convert stances to uppercase
stanceData = [x.replace("nan", "None") for x in stanceData]
stanceData =[x.upper() for x in stanceData]  # Uppercase stances
print(data.head(5))

# Split data into training and testing sets (70% train, 30% test)
X_train, X_dev, y_train, y_dev = train_test_split(textData, stanceData, test_size=0.3, random_state=42)
test_ids = data.loc[X_dev.index, "ID"].tolist()  # Extract IDs for testing data
test_topics = data.loc[X_dev.index, "target"].tolist()  # Extract topics for testing data

   ID                                               text  \
0   1   عشان يلمع صورته ويعنني تمكين المراه ويصير ترن...   
1   3   روح حلل محد يم تطعيم كورونا شف الحرم البارح م...   
2   4  هذا ما يُعرّف بـ'فوبيا المراه المُتمكنه' افه ف...   
3   6  #LEAP22  مءتمر يجمع اشهر وابرز المءثرين في الم...   
4   7  خصوصيه البيانات وحمايتها في المنظمه مطلب ولكن ...   

                   target   stance  stance:confidence against_reason  \
0       Women empowerment  Against             0.5116     A_Explicit   
1           Covid Vaccine      NaN             0.4003            NaN   
2       Women empowerment    Favor             0.8171            NaN   
3  Digital Transformation    Favor             1.0000            NaN   
4  Digital Transformation    Favor             0.7559            NaN   

  favor_reason none_reason sarcasm  sarcasm:confidence sentiment  \
0          NaN         NaN      No              1.0000  Negative   
1          NaN   Not clear     Yes              0.5990   Neutral   
2 

In [6]:
# Load pre-trained model
model = SentenceTransformer("xlm-r-100langs-bert-base-nli-stsb-mean-tokens")

# Define models and corresponding tokenizers
# "xlm-r-bert-base-nli-stsb-mean-tokens": This model is based on the XLM-RoBERTa architecture and is
# fine-tuned for various tasks, including semantic textual similarity. It's pre-trained on a multilingual
#  corpus, which includes Arabic.

# "xlm-r-100langs-bert-base-nli-stsb-mean-tokens": Another variant of the XLM-RoBERTa model pre-trained
# on a multilingual corpus.

# "arabic-bert-base-v2": Specifically trained for Arabic language understanding tasks, this model could
# be useful for Arabic text classification.

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# **Train Model**

In [7]:
# Transform text data to embeddings
X_train_embeddings = model.encode(X_train.tolist())
X_dev_embeddings = model.encode(X_dev.tolist())

# Train a classifier (Logistic Regression)
classifier = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')

# Train the model
classifier.fit(X_train_embeddings, y_train)

Batches:   0%|          | 0/77 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

# **Evaluate the model**

In [8]:
from sklearn.metrics import classification_report

# Predict labels for test data
y_pred = classifier.predict(X_dev_embeddings)

report = classification_report(y_dev, y_pred, output_dict=True)

# Print classification report
print("Classification Report:")
print(classification_report(y_dev, y_pred))

# Save the predictions for evaluation
predictions_df = pd.DataFrame({'ID': test_ids, 'target': test_topics, 'stance': y_pred})
predictions_df.to_csv("/kaggle/working/predictions.csv", index=False)

Classification Report:
              precision    recall  f1-score   support

     AGAINST       0.60      0.68      0.63       296
       FAVOR       0.80      0.82      0.81       636
        NONE       0.43      0.25      0.32       119

    accuracy                           0.71      1051
   macro avg       0.61      0.58      0.59      1051
weighted avg       0.70      0.71      0.70      1051



In [9]:
# Write gold labels (IDs, topic, text, uppercase stance) to a separate file
with open("gold_labels.txt", "w", encoding="utf-8") as outfile:
    for i, (topic, text, stance) in enumerate(zip(test_topics, X_dev, y_dev)):
        outfile.write(f"{test_ids[i]}\t{topic}\t{text}\t{stance.upper()}\n")

# Write predicted labels (IDs, topic as target, text, prediction) to a separate file
with open("predictions.txt", "w", encoding="utf-8") as outfile:
    for i, (prediction, text) in enumerate(zip(y_pred.tolist(), X_dev)):
        outfile.write(f"{test_ids[i]}\t{test_topics[i]}\t{text}\t{prediction}\n")  # Use topic for target

print("Stance prediction results with IDs, topics, and uppercase stances saved to separate files.")

# Now run StanceEval.py
!python "/kaggle/working/StanceEval.py" gold_labels.txt predictions.txt > obtained_results.log

Stance prediction results with IDs, topics, and uppercase stances saved to separate files.


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
