In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

In [2]:
# Load the dataset
#تحميل البيانات:

titanic_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(titanic_url)

In [3]:
# Preprocess the Dataset
#معالجة البيانات:

# Handle Missing Values
#هون عم اتعامل مع القيم يلي ناقصة
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

#عم اتعامل و عالج الكلاسات او الكاتيجوريز
categorical_features = ['Embarked', 'Sex', 'Pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [17]:
# Remove Duplicate Data
#ما بدا شرح أكتر من هيك :( 
data = data.drop_duplicates()

# Split the dataset
#عم قسم البيانات 80 ب 20
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Create a preprocessing and modeling pipeline
#بايب لاين مشان النمذجة
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier())])

In [6]:
# Define the parameter grid for hyperparameter tuning
# بعد عدّة تجارب توصلت لهل الحل
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__max_depth': [4, 6, 8],
    'classifier__criterion': ['gini', 'entropy']
}

In [7]:
# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='precision')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['Age',
                                                                          'Fare']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                              

In [8]:
# Best model from grid search
# عم شوف تقييم افضل مودل
best_model = grid_search.best_estimator_

In [9]:
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='precision')
print(f"Cross-Validation Precision Scores: {cv_scores}")
print(f"Mean Cross-Validation Precision: {cv_scores.mean()}")

Cross-Validation Precision Scores: [0.90625    0.84444444 0.80434783 0.82857143 0.91666667]
Mean Cross-Validation Precision: 0.8600560731538993


In [10]:
# Final evaluation on the test set
y_pred = best_model.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("RandomForestClassifier - Best Model from GridSearchCV")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

RandomForestClassifier - Best Model from GridSearchCV
Precision: 0.8275862068965517
Recall: 0.6486486486486487


In [11]:
# Experiment with a second model: Logistic Regression
#تجربة المعاملات باستخدام RandomizedSearchCV
model_pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', LogisticRegression())])


In [12]:
# Define the parameter grid for Logistic Regression
param_grid_lr = {
    'classifier__penalty': ['l1', 'l2', 'none'],
    'classifier__C': np.logspace(-4, 4, 20),
    'classifier__solver': ['lbfgs', 'liblinear']
}

In [13]:
# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(model_pipeline_lr, param_grid_lr, n_iter=100, cv=5, scoring='recall', random_state=42)
random_search.fit(X_train, y_train)


Traceback (most recent call last):
  File "C:\Users\Taiser\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Taiser\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Taiser\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Taiser\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 454, in _check_solver
    raise ValueError(
ValueError: penalty='none' is not supported for the liblinear solver

Traceback (most recent call last):
  File "C:\Users\Taiser\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Taiser\anaconda3\lib\site-packages\sklear

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['Age',
                                                                                'Fare']),
                                                                              ('cat',
                                                                               Pipeline(steps=[('impute

In [14]:
# Best model from random search
#برجع بعمل تقييم لأفضل نموذج
best_model_lr = random_search.best_estimator_

In [15]:
# Evaluate the best model with cross-validation
cv_scores_lr = cross_val_score(best_model_lr, X_train, y_train, cv=5, scoring='recall')
print(f"Cross-Validation Recall Scores: {cv_scores_lr}")
print(f"Mean Cross-Validation Recall: {cv_scores_lr.mean()}")

Cross-Validation Recall Scores: [0.62962963 0.7037037  0.69811321 0.56603774 0.74074074]
Mean Cross-Validation Recall: 0.6676450034940601


In [16]:
# Final evaluation on the test set
y_pred_lr = best_model_lr.predict(X_test)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)

print("LogisticRegression - Best Model from RandomizedSearchCV")
print(f"Precision: {precision_lr}")
print(f"Recall: {recall_lr}")

LogisticRegression - Best Model from RandomizedSearchCV
Precision: 0.7611940298507462
Recall: 0.6891891891891891


In [18]:
#و شكراً و عفواً

In [19]:
#محمد تيسير قدلوا