# Arabic Text Preprocessing

### Install Required Libraries

In [None]:
!pip install pyarabic
!pip install qalsadi
!pip install camel-tools

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize
import qalsadi.lemmatizer as ql
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer

### Load the Dataset

In [None]:
# Load the dataset (ensure 'كأس_العالم#.csv' is in the same directory)
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('كأس_العالم#.csv')
# Preview the first rows; later cells read the 'tweet' column of this frame.
df.head()

### Data Cleaning

Remove Unnecessary Columns

In [None]:
# Drop columns that are not needed for text preprocessing.
# errors='ignore' skips any listed column that is absent from the frame.
# The result is re-bound to `df` instead of mutating the frame with inplace=True.
df = df.drop(columns=['Unnamed: 0', 'time', 'name'], errors='ignore')
df.head()

Handle Missing Values and Duplicates

In [None]:
# Remove exact duplicate rows first, then any row that contains a null value.
# The chained result is re-bound to `df` instead of mutating it in place.
df = df.drop_duplicates().dropna()

# Sanity check: the per-column null counts should all be 0 after dropna()
df.isnull().sum()

### Text Preprocessing

Remove Mentions, Links, and Retweets

In [None]:
# Function to remove mentions
def remove_mentions(text):
    """Return `text` with every '@' followed by word characters deleted."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

# Function to remove links
def remove_links(text):
    """Return `text` with URL-like substrings deleted.

    Three shapes are matched: 'http://' / 'https://' URLs, 'www.' URLs, and
    fragments of the form ':<non-space>.co/<non-space>'.
    """
    link_pattern = re.compile(r'http[s]?://\S+|www\.\S+|:\S+\.co/\S+')
    return link_pattern.sub('', text)

# Function to remove 'RT : ' from retweets
def remove_rt(text):
    """Return `text` without a leading 'RT : ' prefix; other text is unchanged."""
    retweet_prefix = re.compile(r'^RT : ')
    return retweet_prefix.sub('', text)

# Clean the raw tweets in a fixed order: mentions, then links, then the
# retweet prefix. remove_rt only matches the literal 'RT : ', so presumably
# it relies on the mention having been stripped first.
df['tweet'] = (
    df['tweet']
    .apply(remove_mentions)
    .apply(remove_links)
    .apply(remove_rt)
)
df.head()

Remove Punctuation, English Text, and Digits

In [None]:
# Define Arabic and English punctuations
arab_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arab_punctuations + english_punctuations

# Translation table that maps every punctuation character to "deleted".
# It is built once here rather than inside the function, because the function
# is applied to every row of the dataset.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)

# Function to remove punctuations
def remove_punctuations(text):
    """Return `text` with every character of `punctuations_list` deleted.

    Characters are removed, not replaced by spaces, so words separated only
    by punctuation end up joined together.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Function to remove English text and digits
def remove_english_text(text):
    """Delete ASCII letters and all digits from `text`, then tidy whitespace.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is dropped.
    """
    without_latin = re.sub(r'[a-zA-Z0-9]+', '', text)
    without_digits = re.sub(r'\d+', '', without_latin)
    remaining_words = without_digits.split()
    return ' '.join(remaining_words)

# Build the Cleaned_Text column from the tweets: punctuation is stripped
# first, then ASCII letters and digits.
df['Cleaned_Text'] = (
    df['tweet']
    .apply(remove_punctuations)
    .apply(remove_english_text)
)
df.head()

Remove Duplicated Characters and Unwanted Words

In [None]:
# Function to remove duplicated characters
def remove_duplicated_chars(text):
    """Collapse any character repeated three or more times in a row to one.

    Doubled characters are left as they are.
    """
    repeated_run = re.compile(r'(.)\1{2,}')
    return repeated_run.sub(r'\1', text)

# List of unwanted words
unwanted_words = ['مونديال', 'ليونيل', 'ميسي', 'كرستيانو', 'رونالدو', 'الدون', 'هاتريك']

# Function to remove unwanted words
def remove_unwanted_words(text, words=None):
    """Delete whole-word occurrences of unwanted words from `text`.

    Args:
        text: String to clean.
        words: Optional iterable of words to delete. Defaults to the
            module-level `unwanted_words` list.

    Returns:
        `text` with every whole-word match removed. Surrounding whitespace is
        left untouched, so a removed word can leave two adjacent spaces.
    """
    targets = unwanted_words if words is None else list(words)
    if not targets:
        return text
    # Escape each word so regex metacharacters in it are matched literally.
    alternatives = '|'.join(re.escape(word) for word in targets)
    pattern = r'\b(?:' + alternatives + r')\b'
    return re.sub(pattern, '', text)

# Collapse long character runs first, then delete the unwanted words.
df['Cleaned_Text'] = (
    df['Cleaned_Text']
    .apply(remove_duplicated_chars)
    .apply(remove_unwanted_words)
)
df.head()

Remove Emojis

In [None]:
# Character class of the code points treated as emoji / pictographic symbols.
# Compiled once here because remove_emojis is applied to every row.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (includes U+1F926-U+1F937)
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\u200d"                 # zero-width joiner
    "\u2640-\u2642"          # female / male signs
    "\u2600-\u2B55"          # miscellaneous symbols up to the heavy circle
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)


# Function to remove emojis
def remove_emojis(text):
    """Return `text` with every run of emoji / pictographic symbols deleted.

    Matched runs are removed outright (not replaced by a space); all other
    characters, including surrounding whitespace, are kept as they are.
    """
    return _EMOJI_PATTERN.sub('', text)

# Strip emoji and pictographic symbols from every row of Cleaned_Text,
# then preview the result
df['Cleaned_Text'] = df['Cleaned_Text'].apply(remove_emojis)
df.head()

Normalize Arabic Text

In [None]:
# Function to normalize Arabic text
def normalize_arabic(text):
    """Normalize Arabic spelling variants in `text`.

    Diacritics in U+064B..U+0652 are removed, the alef variants are unified
    to bare alef, alef maqsura becomes yeh, and hamza-on-waw / hamza-on-yeh
    become waw / yeh respectively.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'[\u064B-\u0652]', ''),
        (r'[إأآا]', 'ا'),
        (r'[يى]', 'ي'),
        (r'[ؤ]', 'و'),
        (r'[ئ]', 'ي'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Normalize every row of Cleaned_Text (diacritics removed, letter variants
# unified), then preview the result
df['Cleaned_Text'] = df['Cleaned_Text'].apply(normalize_arabic)
df.head()

Tokenization

In [None]:
# Download the NLTK tokenizer models needed by word_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the older
# pickled 'punkt' models, so both are requested here.
# NOTE(review): nltk.download is expected to report a failed download rather
# than raise, so the extra request should be harmless on older NLTK — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to tokenize Arabic text
def tokenize_arabic(text):
    """Split `text` into tokens with NLTK's word_tokenize and return them."""
    return word_tokenize(text)

# Tokenize every row. From this cell on, each Cleaned_Text value is the
# token sequence returned by word_tokenize rather than a single string.
df['Cleaned_Text'] = df['Cleaned_Text'].apply(tokenize_arabic)
df.head()

Remove Arabic Stopwords

In [None]:
# Download the NLTK 'stopwords' corpus; stopwords.words('arabic') below reads
# the Arabic list from it
nltk.download('stopwords')

# Cache for the Arabic stop-word set, filled on first use so the NLTK corpus
# is read once instead of once per row.
_ARABIC_STOP_WORDS = None


def _get_arabic_stop_words():
    """Return the cached Arabic stop-word set, building it on first use."""
    global _ARABIC_STOP_WORDS
    if _ARABIC_STOP_WORDS is None:
        raw_words = set(stopwords.words('arabic'))
        # The tokens filtered below were already passed through
        # normalize_arabic, so the normalized spelling of every stop word is
        # stored too; otherwise forms such as 'إلى' would never match.
        normalized_words = {normalize_arabic(word) for word in raw_words}
        _ARABIC_STOP_WORDS = raw_words | normalized_words
    return _ARABIC_STOP_WORDS


# Function to remove Arabic stopwords
def remove_arabic_stopwords(text, stop_words=None):
    """Return the tokens of `text` that are not stop words.

    Args:
        text: Iterable of tokens (the tokenized Cleaned_Text value).
        stop_words: Optional container of words to drop. When omitted, the
            NLTK Arabic stop-word list is used, in both its original and
            normalized spellings.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = _get_arabic_stop_words()
    return [word for word in text if word not in stop_words]

# Filter the stop words out of every row's token list, then preview
df['Cleaned_Text'] = df['Cleaned_Text'].apply(remove_arabic_stopwords)
df.head()

Lemmatization

In [None]:
# Shared qalsadi lemmatizer, created on first use and then reused, so a new
# Lemmatizer is not constructed for every row.
_LEMMATIZER = None


# Function to lemmatize Arabic text
def lemmatize_arabic(text, lemmer=None):
    """Return the lemma of every token in `text`.

    Args:
        text: Iterable of tokens.
        lemmer: Optional object with a `lemmatize(word)` method. When omitted,
            a single shared `ql.Lemmatizer()` is used.

    Returns:
        A list holding `lemmer.lemmatize(word)` for each token, in order.
    """
    global _LEMMATIZER
    if lemmer is None:
        if _LEMMATIZER is None:
            _LEMMATIZER = ql.Lemmatizer()
        lemmer = _LEMMATIZER
    return [lemmer.lemmatize(word) for word in text]

# Replace every token in each row with its lemma, then preview
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lemmatize_arabic)
df.head()

Stemming

In [None]:
# Initialize ISRI Stemmer
# A single module-level instance is created here and shared by stem_arabic.
stemmer = ISRIStemmer()

# Function to stem Arabic text
def stem_arabic(text):
    """Return a list with the ISRI stem of every token in `text`, in order."""
    return list(map(stemmer.stem, text))

# Stem every token in each row. This runs on the output of the lemmatization
# cell above, so the stems are derived from lemmas, not from the raw tokens.
df['Cleaned_Text'] = df['Cleaned_Text'].apply(stem_arabic)
df.head()

Save Cleaned Data

In [None]:
# Save the cleaned data to a new CSV file
# index=False leaves the DataFrame index out of the file.
# NOTE(review): Cleaned_Text holds a list of tokens per row at this point, so
# it is presumably written as the list's string form — confirm that this is
# the intended on-disk format.
df.to_csv('clean_data.csv', index=False)