In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to scrape text from a given URL
def scrape_text(url, timeout=30):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to scrape. Surrounding whitespace is ignored.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled server.

    Returns:
        The paragraph texts joined by newlines, with leading and trailing
        whitespace removed. An empty string when the page has no ``<p>``
        elements.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url.strip(), timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never stored as article text.
    response.raise_for_status()
    # The raw bytes are handed to BeautifulSoup, which does its own decoding.
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = [paragraph.get_text() for paragraph in soup.find_all('p')]
    return "\n".join(paragraphs).strip()

# Function to create the dataset
def create_dataset(urls, scores):
    """Scrape every URL and pair the resulting text with its relevance score.

    Args:
        urls: Iterable of page addresses; each one is passed to ``scrape_text``.
        scores: One score per URL, in the same order.

    Returns:
        A DataFrame with the columns ``'Text (Arabic Language)'`` and
        ``'Score'``, one row per URL.

    Raises:
        ValueError: If ``scores`` is a sized collection whose length differs
            from the number of URLs.
    """
    urls = list(urls)
    # Check the pairing before any network traffic: otherwise a mismatch is
    # only reported by pandas after every page has already been downloaded.
    is_sized = hasattr(scores, '__len__') and not isinstance(scores, str)
    if is_sized and len(scores) != len(urls):
        raise ValueError(
            f"Got {len(urls)} URLs but {len(scores)} scores; they must match."
        )
    texts = [scrape_text(url) for url in urls]
    # Create DataFrame with text and scores
    return pd.DataFrame({'Text (Arabic Language)': texts, 'Score': scores})

# URLs of Arabic news websites related to technology
urls = [
    'https://www.tech-wd.com/wd/2024/04/03/%d8%aa%d8%b3%d8%b1%d9%8a%d8%a8%d8%a7%d8%aa-%d8%aa%d8%b5%d9%85%d9%8a%d9%85-%d8%aa%d8%b7%d8%a8%d9%8a%d9%82-%d8%a7%d9%84%d9%83%d8%a7%d9%85%d9%8a%d8%b1%d8%a7-ios-18/',
    'https://www.elfann.com/news/show/1367468/%D8%A8%D8%B9%D8%AF-%D8%BA%D9%8A%D8%A7%D8%A8%D9%8D-%D8%A7%D9%84%D8%A3%D9%85%D9%8A%D8%B1%D8%A9-%D8%B4%D8%A7%D8%B1%D9%84%D9%8A%D9%86-%D9%85%D9%88%D9%86%D8%A7%D9%83%D9%88-%D8%AA%D8%AA%D8%A3%D9%84%D9%91%D9%82-%D8%A8%D8%AC%D9%85%D8%A8%D8%B3%D9%88%D8%AA-%D8%A5%D9%8A%D9%84',
    # The next two entries previously started with a stray space inside the quotes.
    'https://ar.wikipedia.org/wiki/%D8%A7%D9%84%D9%84%D8%BA%D8%A9_%D8%A7%D9%84%D8%B9%D8%B1%D8%A8%D9%8A%D8%A9',
    'https://www.alarabiya.net/',
    'https://alarab.co.uk/',
    'https://www.dw.com/ar/%D8%A7%D9%84%D8%B1%D8%A6%D9%8A%D8%B3%D9%8A%D8%A9/s-9106',
    'https://www.araby.ai/',
    'https://www.france24.com/ar/',
    'https://www.skynewsarabia.com/',
]

# Relevance scores for each text (should be between 0 to 10).
# Every page currently receives the same score of 10.
scores = [10] * len(urls)

# Create the dataset (downloads every page listed above)
dataset = create_dataset(urls, scores)

# Save the dataset to a CSV file; utf-8-sig prepends a BOM to the file.
dataset.to_csv('arabic_tech_news_dataset.csv', index=False, encoding='utf-8-sig')

print("Dataset created and saved successfully.")


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd

# Download NLTK resources (if not already downloaded).
# Recent NLTK releases load the 'punkt_tab' tables in word_tokenize, while
# older ones load 'punkt'; both are requested so the cell works on either.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def preprocess_text(text):
    """Tokenize a text, drop Arabic stop words, and reduce tokens to stems.

    Tokens containing Arabic letters are stemmed with NLTK's ISRI stemmer.
    The Porter stemmer and WordNet lemmatizer are built for English and leave
    Arabic words unchanged, so they are applied only to the remaining tokens
    (Latin words, digits, punctuation), exactly as before.

    Args:
        text: The raw text to clean.

    Returns:
        The processed tokens joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ''  # Non-string input yields an empty string.

    # Local import: the surrounding cell only imports the English tools.
    from nltk.stem.isri import ISRIStemmer

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('arabic'))
    content_tokens = [token for token in tokens if token.lower() not in stop_words]

    arabic_stemmer = ISRIStemmer()
    english_stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def reduce_token(token):
        # U+0600..U+06FF is the Unicode "Arabic" block.
        if any('\u0600' <= char <= '\u06FF' for char in token):
            return arabic_stemmer.stem(token)
        return lemmatizer.lemmatize(english_stemmer.stem(token))

    # Join tokens back into a single string
    return ' '.join(reduce_token(token) for token in content_tokens)


# Apply the preprocessing to the scraped dataset and write the file that the
# modelling cells load ('Preprocessed Text' and 'Score' columns). Without this
# step the message below is printed although nothing was processed or saved.
raw_dataset = pd.read_csv('arabic_tech_news_dataset.csv', encoding='utf-8-sig')
preprocessed_dataset = pd.DataFrame({
    'Preprocessed Text': raw_dataset['Text (Arabic Language)'].apply(preprocess_text),
    'Score': raw_dataset['Score'],
})
preprocessed_dataset.to_csv(
    'preprocessed_arabic_tech_news_dataset.csv', index=False, encoding='utf-8-sig'
)

print("Preprocessing completed and preprocessed dataset saved successfully.")



In [None]:

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import cross_val_score, LeaveOneOut

# Load preprocessed dataset ('utf-8-sig' also accepts a file that starts with a BOM)
dataset = pd.read_csv('preprocessed_arabic_tech_news_dataset.csv', encoding='utf-8-sig')

# Split dataset into features and labels.
# An empty text cell is read back from CSV as NaN (a float), which the Keras
# Tokenizer cannot process, so missing texts are mapped back to ''.
texts = dataset['Preprocessed Text'].fillna('').astype(str).values
scores = dataset['Score'].values

# Tokenization: the vocabulary is built from every text in the dataset
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Define model architecture
def create_model():
    """Build the uncompiled SimpleRNN regressor that scores one document.

    The embedding vocabulary is sized from the module-level ``tokenizer``.
    No fixed input length is declared, so the model accepts whatever padded
    length it is trained and evaluated with.

    Returns:
        A ``Sequential`` model: Embedding(100) -> SimpleRNN(64) ->
        Dense(32, relu) -> Dense(1, linear).
    """
    model = Sequential()
    # +1 because Tokenizer word indices start at 1; 0 is the padding value.
    model.add(Embedding(len(tokenizer.word_index) + 1, 100))
    # Only the final state is returned: with return_sequences=True the Dense
    # head would emit one value per timestep, while each label is one score.
    model.add(SimpleRNN(64))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    return model

# Imported here because the import list of this cell provides neither name;
# pad_sequences was previously used without being imported (NameError).
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define hyperparameters
learning_rate = 0.001
batch_size = 32
epochs = 10

# Leave-One-Out cross-validation: each document is the test set exactly once
loo = LeaveOneOut()

# Perform cross-validation
mse_scores = []
for train_index, test_index in loo.split(texts):
    # Split data into training and testing sets
    train_texts, test_texts = texts[train_index], texts[test_index]
    train_scores, test_scores = scores[train_index], scores[test_index]

    # Tokenize and pad sequences
    train_sequences = tokenizer.texts_to_sequences(train_texts)
    test_sequences = tokenizer.texts_to_sequences(test_texts)
    # Assigned at module level before create_model() runs, in case the model
    # builder sizes its Embedding from this global.
    max_seq_length = max(len(seq) for seq in train_sequences)
    train_padded = pad_sequences(train_sequences, maxlen=max_seq_length, padding='post')
    test_padded = pad_sequences(test_sequences, maxlen=max_seq_length, padding='post')

    # Create and compile a fresh model for every fold
    model = create_model()
    # Pass the optimizer object so the configured learning rate is actually used.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])

    # Train the model
    model.fit(train_padded, train_scores, epochs=epochs, batch_size=batch_size, verbose=0)

    # Evaluate the model; evaluate() returns [loss, mae] and the loss is MSE
    mse, _ = model.evaluate(test_padded, test_scores, verbose=0)
    mse_scores.append(mse)

# Compute mean MSE and standard deviation
mean_mse = np.mean(mse_scores)
std_mse = np.std(mse_scores)

print(f"Mean MSE (RNN, lr={learning_rate}, bs={batch_size}): {mean_mse}, Std MSE: {std_mse}")












In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Bidirectional, LSTM, GRU, Dense

# Load preprocessed dataset ('utf-8-sig' also accepts a file that starts with a BOM)
dataset = pd.read_csv('preprocessed_arabic_tech_news_dataset.csv', encoding='utf-8-sig')

# Split dataset into features and labels.
# An empty text cell is read back from CSV as NaN (a float), which the Keras
# Tokenizer cannot process, so missing texts are mapped back to ''.
texts = dataset['Preprocessed Text'].fillna('').astype(str).values
scores = dataset['Score'].values

# Tokenization: the vocabulary is built from every text in the dataset
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Define model architectures
def _build_sequence_regressor(recurrent_layer):
    """Assemble Embedding(100) -> ``recurrent_layer`` -> Dense(32) -> Dense(1).

    Args:
        recurrent_layer: The layer that summarises the embedded sequence. It
            must return only its final output so the model yields one score
            per document.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    model = Sequential()
    # +1 because Tokenizer word indices start at 1; 0 is the padding value.
    # No input length is declared, so no global max_seq_length is required.
    model.add(Embedding(len(tokenizer.word_index) + 1, 100))
    model.add(recurrent_layer)
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    return model


# Each builder keeps its layer at the default return_sequences=False: with
# per-timestep outputs the Dense head would not match the one-score labels.

def create_rnn_model():
    """Return an uncompiled regressor built around ``SimpleRNN(64)``."""
    return _build_sequence_regressor(SimpleRNN(64))


def create_bidirectional_rnn_model():
    """Return an uncompiled regressor built around a bidirectional ``SimpleRNN(64)``."""
    return _build_sequence_regressor(Bidirectional(SimpleRNN(64)))


def create_lstm_model():
    """Return an uncompiled regressor built around ``LSTM(64)``."""
    return _build_sequence_regressor(LSTM(64))


def create_gru_model():
    """Return an uncompiled regressor built around ``GRU(64)``."""
    return _build_sequence_regressor(GRU(64))

# Imported here because the import list of this cell provides neither name.
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define hyperparameters
learning_rates = [0.001, 0.01]
batch_sizes = [32, 64]
epochs = 10

# Define cross-validation strategy (e.g., K-Fold)
kf = KFold(n_splits=2, shuffle=True, random_state=42)

# Perform cross-validation for each architecture and hyperparameter combination
architectures = {
    'RNN': create_rnn_model,
    'Bidirectional_RNN': create_bidirectional_rnn_model,
    'LSTM': create_lstm_model,
    'GRU': create_gru_model
}

# Encode the corpus once: the models consume padded integer sequences, not
# raw strings. max_seq_length is assigned at module level before any builder
# runs, in case a builder sizes its Embedding from this global.
sequences = tokenizer.texts_to_sequences(texts)
max_seq_length = max(len(seq) for seq in sequences)
padded_texts = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
# Kept under a separate name so the labels in `scores` are never overwritten.
targets = np.asarray(scores, dtype='float32')

for architecture, create_model_func in architectures.items():
    for learning_rate in learning_rates:
        for batch_size in batch_sizes:
            print(f"Training {architecture} model with learning rate {learning_rate} and batch size {batch_size}")
            # cross_val_score expects a scikit-learn estimator, which a Keras
            # model is not, so the folds are run by hand.
            fold_mse = []
            for train_index, test_index in kf.split(padded_texts):
                # Fresh weights per fold so no training leaks between folds.
                model = create_model_func()
                model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])
                model.fit(
                    padded_texts[train_index], targets[train_index],
                    epochs=epochs, batch_size=batch_size, verbose=0,
                )
                # evaluate() returns [loss, mae]; the loss is the fold's MSE.
                mse, _ = model.evaluate(padded_texts[test_index], targets[test_index], verbose=0)
                fold_mse.append(mse)
            mse_scores = np.array(fold_mse)
            print(f"Mean MSE ({architecture}, lr={learning_rate}, bs={batch_size}): {mse_scores.mean()}, Std MSE: {mse_scores.std()}")





In [None]:
pip install pytorch-transformers


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

import zipfile
from pathlib import Path
from xml.etree import ElementTree


def _docx_to_plain_text(docx_path):
    """Return the text of a .docx file, one document paragraph per line.

    A .docx file is a ZIP archive whose body is stored in ``word/document.xml``;
    the text lives in ``w:t`` elements grouped under ``w:p`` paragraphs.
    """
    word_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    with zipfile.ZipFile(docx_path) as archive:
        root = ElementTree.fromstring(archive.read('word/document.xml'))
    lines = []
    for paragraph in root.iter(f'{word_ns}p'):
        pieces = [node.text for node in paragraph.iter(f'{word_ns}t') if node.text]
        if pieces:
            lines.append(''.join(pieces))
    return '\n'.join(lines)


# Load pre-trained GPT2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Fine-tune the model on your customized dataset.
# TextDataset reads its file as UTF-8 plain text, which a .docx archive is
# not, so the document text is first extracted into a .txt file next to it.
source_path = Path('/content/CV_Houda_kaissi (1).docx')  # Path to your customized dataset
if source_path.suffix.lower() == '.docx':
    training_file = source_path.with_suffix('.txt')
    training_file.write_text(_docx_to_plain_text(source_path), encoding='utf-8')
else:
    training_file = source_path

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=str(training_file),
    block_size=128  # Adjust block size according to your dataset
)

# mlm=False selects causal (next-token) language modelling rather than masked LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir='./output',
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust number of epochs as needed
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

trainer.train()
