Setting up the environment and loading data

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Downloading transformers-4.55.0-py3-none-any.whl (11.3 MB)
   ---------------------------------------- 0.0/11.3 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.3 MB 3.1 MB/s eta 0:00:04
   ---- ----------------------------------- 1.3/11.3 MB 3.7 MB/s eta 0:00:03
   ------- -------------------------------- 2.1/11.3 MB 3.8 MB/s eta 0:00:03
   ---------- ----------------------------- 2.9/11.3 MB 3.8 MB/s eta 0:00:03
   ------------- -------------------------- 3.7/11.3 MB 3.9 MB/s eta 0:00:02
   --------

In [7]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import pipeline

In [8]:
df = pd.read_csv("C:\\Users\\Laptop World\\Desktop\\Sentiment Analysis\\twitter_training.csv")

In [10]:
df = pd.read_csv('C:\\Users\\Laptop World\\Desktop\\Sentiment Analysis\\twitter_training.csv', header=None, names=['tweet_id', 'entity', 'sentiment', 'text'])

In [11]:
print("أول 5 صفوف من البيانات:")
print(df.head())

أول 5 صفوف من البيانات:
   tweet_id       entity sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [12]:
print("\nمعلومات عن البيانات:")
print(df.info())


معلومات عن البيانات:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   74682 non-null  int64 
 1   entity     74682 non-null  object
 2   sentiment  74682 non-null  object
 3   text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB
None


(Data Preprocessing)

In [13]:
# حذف الصفوف التي تحتوي على قيم مفقودة (NaN)
df.dropna(subset=['text', 'sentiment'], inplace=True)
print("\nحجم البيانات بعد حذف القيم المفقودة:", df.shape)


حجم البيانات بعد حذف القيم المفقودة: (73996, 4)


In [14]:
# تنظيف النصوص: حذف الرموز والروابط
def clean_text(text):
    text = str(text).lower() # تحويل النص إلى حروف صغيرة
    text = re.sub('\[.*?\]', '', text) # حذف أي شيء داخل أقواس مربعة
    text = re.sub('https?://\S+|www\.\S+', '', text) # حذف الروابط
    text = re.sub('<.*?>+', '', text) # حذف وسم HTML
    text = re.sub('\n', '', text) # حذف السطر الجديد
    text = re.sub('\w*\d\w*', '', text) # حذف الكلمات التي تحتوي على أرقام
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), '', text) # حذف علامات الترقيم
    return text

In [15]:
df['cleaned_text'] = df['text'].apply(clean_text)
print("\nأول 5 نصوص بعد التنظيف:")
print(df[['text', 'cleaned_text']].head())


أول 5 نصوص بعد التنظيف:
                                                text  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands 2 and i will murder ...   

                                        cleaned_text  
0  im getting on borderlands and i will murder yo...  
1  i am coming to the borders and i will kill you...  
2  im getting on borderlands and i will kill you all  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands  and i will murder y...  


(TF-IDF + Logistic Regression)

In [16]:
# تحويل تصنيفات المشاعر إلى أرقام
sentiment_mapping = {'Positive': 2, 'Negative': 0, 'Neutral': 1, 'Irrelevant': 3}
df['sentiment_encoded'] = df['sentiment'].map(sentiment_mapping)

In [17]:
# تقسيم البيانات إلى مجموعتي تدريب واختبار
X = df['cleaned_text']
y = df['sentiment_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# تمثيل المتجهات باستخدام TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [19]:
# تدريب نموذج Logistic Regression
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train_vec, y_train)

In [21]:
# التقييم
y_pred_lr = model_lr.predict(X_test_vec)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_lr, target_names=sentiment_mapping.keys()))


Classification Report:

              precision    recall  f1-score   support

    Positive       0.71      0.79      0.75      4380
    Negative       0.65      0.63      0.64      3605
     Neutral       0.68      0.74      0.71      4119
  Irrelevant       0.68      0.50      0.58      2696

    accuracy                           0.68     14800
   macro avg       0.68      0.66      0.67     14800
weighted avg       0.68      0.68      0.68     14800



Transformer (BERT)

In [51]:
!pip install torch transformers



In [49]:
# اختبار النموذج على بعض النصوص من البيانات
sample_texts = df['text'].head(5).tolist()

In [None]:
import streamlit as st
from transformers import pipeline

# عنوان التطبيق
st.title("Twitter Sentiment Analysis")
st.write("📊 Type any sentence and its sentiment will be analyzed (positive, negative, neutral)")

# تحميل الموديل من Hugging Face
@st.cache_resource
def load_model():
    model = pipeline("sentiment-analysis", model="bert-base-uncased")
    return model

model = load_model()

# إدخال النص من المستخدم
user_input = st.text_area("✏️ Type text here")

if st.button("Analysis"):
    if user_input.strip() != "":
        result = model(user_input)[0]
        st.write(f"**Classification:** {result['label']}")
        st.write(f"**Ratio:** {result['score']:.2f}")
    else:
        st.warning("⚠️ Please write a text first.ً")