In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:

import pandas as pd

# Load the raw data from disk; reading it in this cell keeps the cell re-runnable.
df = pd.read_csv(r"D:\eino\data.csv")

# Columns that are converted to a numeric dtype below.
cols_to_numeric = ['bedrooms', 'bathrooms', 'level']

df = (
    df
    # Coerce each listed column to numbers; values that cannot be parsed become NaN.
    .assign(**{name: pd.to_numeric(df[name], errors='coerce') for name in cols_to_numeric})
    # Replace missing `furnished` values with the explicit label 'unknown'.
    .assign(furnished=df['furnished'].fillna('unknown'))
    # Discard rows that are missing any of the numeric columns.
    .dropna(subset=cols_to_numeric)
)

# Sanity check: resulting shape and the dtypes of the converted columns.
print("Shape after cleaning:", df.shape)
print(df[cols_to_numeric].dtypes)




Shape after cleaning: (42514, 11)
bedrooms     float64
bathrooms    float64
level        float64
dtype: object


In [3]:
# Bucket `price` into three quantile-based bands, labelled from lowest to highest.
price_labels = ['Low', 'Medium', 'High']
df['price_category'] = pd.qcut(df['price'], q=len(price_labels), labels=price_labels)

# Show how many rows fell into each band.
category_counts = df['price_category'].value_counts()
print(category_counts)



price_category
Low       14270
Medium    14201
High      14043
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split

# Target: the price band of each row.
y = df['price_category']

# Features: every column except the ad identifier, the raw price and the target.
X = df.drop(['ad_id', 'price', 'price_category'], axis=1)

# One-hot encode the listed columns, dropping the first level of each.
categorical_cols = ['type', 'furnished', 'rent', 'city', 'region']
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Display the shapes of the training and test feature matrices.
X_train.shape, X_test.shape


((34011, 211), (8503, 211))

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline random forest: 100 trees, depth capped at 15, all CPU cores, fixed seed.
rf_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Predict the held-out test rows.
y_pred = model.predict(X_test)

# Report overall accuracy followed by per-class precision / recall / F1.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Accuracy:", baseline_accuracy)
print(baseline_report)



Accuracy: 0.8547571445372222
              precision    recall  f1-score   support

        High       0.80      0.95      0.87      2832
         Low       0.90      0.95      0.93      2858
      Medium       0.87      0.66      0.75      2813

    accuracy                           0.85      8503
   macro avg       0.86      0.85      0.85      8503
weighted avg       0.86      0.85      0.85      8503



In [6]:
# Second random forest: more trees (300) and deeper trees (max depth 25).
# `fit` returns the estimator itself, so construction and training are chained.
improved_model = RandomForestClassifier(
    n_estimators=300, max_depth=25, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# Predict the held-out test rows.
improved_y_pred = improved_model.predict(X_test)

# Report overall accuracy followed by per-class metrics.
improved_accuracy = accuracy_score(y_test, improved_y_pred)
print("Accuracy:", improved_accuracy)
print(classification_report(y_test, improved_y_pred))


Accuracy: 0.9235563918616959
              precision    recall  f1-score   support

        High       0.90      0.95      0.93      2832
         Low       0.96      0.97      0.96      2858
      Medium       0.92      0.85      0.88      2813

    accuracy                           0.92      8503
   macro avg       0.92      0.92      0.92      8503
weighted avg       0.92      0.92      0.92      8503



In [7]:
# Third random forest variant: 400 trees, max depth 30, at least 2 samples per leaf.
tuned_params = dict(
    n_estimators=400,
    max_depth=30,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
tuned_model = RandomForestClassifier(**tuned_params)
tuned_model.fit(X_train, y_train)

# Predict the held-out test rows.
tuned_y_pred = tuned_model.predict(X_test)

# Report overall accuracy followed by per-class metrics.
tuned_accuracy = accuracy_score(y_test, tuned_y_pred)
tuned_report = classification_report(y_test, tuned_y_pred)
print("Accuracy:", tuned_accuracy)
print(tuned_report)


Accuracy: 0.8976831706456545
              precision    recall  f1-score   support

        High       0.87      0.94      0.90      2832
         Low       0.93      0.96      0.95      2858
      Medium       0.89      0.79      0.84      2813

    accuracy                           0.90      8503
   macro avg       0.90      0.90      0.90      8503
weighted avg       0.90      0.90      0.90      8503



In [8]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

# Train on a small sample only: the first 5000 rows of the training split.
sample_size = 5000
X_sample = X_train.iloc[:sample_size]
y_sample = y_train.iloc[:sample_size]

# Linear SVM with a raised iteration cap and a fixed seed.
svm_model = LinearSVC(max_iter=5000, random_state=42).fit(X_sample, y_sample)

# Evaluate on the first 2000 rows of the test split (a subset, to speed up evaluation).
eval_rows = slice(None, 2000)
y_sample_pred = svm_model.predict(X_test.iloc[eval_rows])
y_eval = y_test.iloc[eval_rows]

# Report accuracy and per-class metrics for that subset.
print("Accuracy:", accuracy_score(y_eval, y_sample_pred))
print(classification_report(y_eval, y_sample_pred))


Accuracy: 0.758
              precision    recall  f1-score   support

        High       0.70      0.95      0.81       635
         Low       0.81      0.95      0.87       696
      Medium       0.79      0.38      0.51       669

    accuracy                           0.76      2000
   macro avg       0.77      0.76      0.73      2000
weighted avg       0.77      0.76      0.73      2000



In [10]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression trained on the full training split.
# `multi_class` is deliberately not passed: the parameter is deprecated in
# scikit-learn 1.5+ and scheduled for removal, and with the 'lbfgs' solver on
# a target with more than two classes the default is already multinomial,
# so the fitted model is the same without the deprecation warning.
log_reg = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    n_jobs=-1,
    random_state=42
)
log_reg.fit(X_train, y_train)

# Predict the held-out test rows.
y_pred_log = log_reg.predict(X_test)

# Report overall accuracy followed by per-class metrics.
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))




Logistic Regression Accuracy: 0.7891332470892626
              precision    recall  f1-score   support

        High       0.77      0.88      0.83      2832
         Low       0.86      0.89      0.87      2858
      Medium       0.72      0.59      0.65      2813

    accuracy                           0.79      8503
   macro avg       0.78      0.79      0.78      8503
weighted avg       0.78      0.79      0.78      8503



In [11]:
import lightgbm as lgb

# Gradient-boosted trees: 100 boosting rounds, depth capped at 10, learning rate 0.1.
lgb_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.1,
    'random_state': 42,
    'n_jobs': -1,
}
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train)

# Predict the held-out test rows.
y_pred_lgb = lgb_model.predict(X_test)

# Report overall accuracy followed by per-class metrics.
lgb_accuracy = accuracy_score(y_test, y_pred_lgb)
print("LightGBM Accuracy:", lgb_accuracy)
print(classification_report(y_test, y_pred_lgb))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 460
[LightGBM] [Info] Number of data points in the train set: 34011, number of used features: 101
[LightGBM] [Info] Start training from score -1.109789
[LightGBM] [Info] Start training from score -1.092019
[LightGBM] [Info] Start training from score -1.094124
LightGBM Accuracy: 0.8596965776784664
              precision    recall  f1-score   support

        High       0.82      0.91      0.86      2832
         Low       0.92      0.95      0.94      2858
      Medium       0.84      0.71      0.77      2813

    accuracy                           0.86      8503
   macro avg       0.86      0.86      0.86      8503
weighted avg       0.86      0.86      0.86      8503



In [12]:
import joblib

# Persist the best-scoring random forest. `improved_model` reached ~0.92 test
# accuracy above, whereas the baseline `model` only reached ~0.85, so saving
# the baseline would ship the weakest of the three forests.
joblib.dump(improved_model, 'random_forest_model.pkl')

# Persist the one-hot encoded column names alongside the model, so the same
# feature layout can be rebuilt when the model is loaded for prediction.
joblib.dump(X_encoded.columns.tolist(), 'model_features.pkl')


['model_features.pkl']