<a href="https://colab.research.google.com/github/Rezvanpm/Phi-FineTune/blob/main/Bachelor_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phi-Finetuning

مراحل پروژه:


1.   لود داده ها از دیتابیس
2.   لود مدل زبانی
3. لود ابزار های لازم
4. نصب و لود فریمورک های مورد نیاز


## Preview of project:

## Install and load frameworks, dependencies, etc.


In [None]:
!pip install ipywidgets # install widgets for using in colab

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


## 1. Load a dataset from a folder or upload one

In [None]:
!pip install pandas
!pip install ipywidgets



## 2. Select a dataset from the dropdown menu

In [None]:
import pandas as pd
import os
import time
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import files

# ======= Data and settings =======
selected_dataset = None  # URL or file path of the dataset chosen via the dropdown or the upload widget

# Built-in datasets: display name -> CSV URL
default_datasets = {
    "Titanic Dataset": "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
    "Iris Dataset": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
}

# ======= UI widgets =======

# Dropdown for picking one of the built-in datasets; the first entry is a placeholder
dataset_dropdown = widgets.Dropdown(
    options=["Select a Dataset"] + list(default_datasets.keys()),
    value="Select a Dataset",
    description="Dataset:"
)

# Button that applies the current selection
apply_button = widgets.Button(description="Apply", button_style="success")

# File-upload widget (single CSV file)
upload_button = widgets.FileUpload(
    accept=".csv",
    multiple=False
)

# Progress bar (hidden at first)
progress_bar = widgets.FloatProgress(
    value=0.0,
    min=0,
    max=1.0,
    description="Processing:",
    bar_style="info",
    layout=widgets.Layout(visibility='hidden')  # starts hidden
)

# Lay the two input options out side by side
dataset_box = widgets.VBox([widgets.Label("Select a Dataset:"), dataset_dropdown])
upload_box = widgets.VBox([widgets.Label("Upload your Dataset:"), upload_button])
ui = widgets.HBox([dataset_box, upload_box])

# Output area for showing the dataset (scrollable, fixed height)
output = widgets.Output()
scrollable_output = widgets.VBox([output], layout=widgets.Layout(height="300px", overflow="auto", border="1px solid #ccc"))

# Render the UI components
display(ui, apply_button, progress_bar, scrollable_output)

# ======= Selection and upload handlers =======

def load_dataset(dataset_path):
    """Read a CSV dataset from a URL or from a local file.

    Args:
        dataset_path: A string starting with ``http`` (treated as a URL) or a
            path to a file on disk.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If the path is not a URL and does not exist locally.
    """
    is_remote = dataset_path.startswith("http")
    if not is_remote and not os.path.exists(dataset_path):
        raise FileNotFoundError(f"Dataset {dataset_path} not found.")
    # URLs and existing local files are read the same way.
    return pd.read_csv(dataset_path)

def select_dataset(change):
    """Record the dataset picked in the dropdown and disable the upload widget.

    Args:
        change: Change notification passed by the widget observer (unused).
    """
    global selected_dataset
    choice = dataset_dropdown.value
    if choice == "Select a Dataset":
        return  # placeholder entry: nothing to record
    selected_dataset = default_datasets[choice]
    upload_button.disabled = True  # a built-in dataset was chosen, so block uploads

# Run the handler whenever the dropdown value changes.
dataset_dropdown.observe(select_dataset, names="value")

def _first_uploaded_file(upload_value):
    """Return ``(filename, content)`` for the first uploaded file, or ``None``.

    Handles both layouts of ``FileUpload.value``: a dict keyed by filename
    (ipywidgets 7) and a tuple of per-file records (ipywidgets 8).
    """
    if not upload_value:
        return None
    if isinstance(upload_value, dict):
        filename, record = next(iter(upload_value.items()))
        return filename, record["content"]
    record = upload_value[0]
    return record["name"], record["content"]


def handle_upload(change):
    """Save the uploaded CSV under /content and select it as the dataset.

    Does nothing when the widget value is empty (for example after it has been
    cleared), instead of raising ``StopIteration``.

    Args:
        change: Change notification passed by the widget observer (unused).
    """
    global selected_dataset

    uploaded = _first_uploaded_file(upload_button.value)
    if uploaded is None:
        return
    filename, content = uploaded

    # basename() keeps a crafted name such as "../x.csv" inside /content.
    dataset_path = f"/content/{os.path.basename(filename)}"

    # Persist the file so it can be read back by path later.
    with open(dataset_path, "wb") as f:
        f.write(content)

    selected_dataset = dataset_path
    dataset_dropdown.disabled = True  # a file was uploaded, so block the dropdown

upload_button.observe(handle_upload, names="value")

def apply_selection(event):
    """Load the selected dataset, show it, and keep it for later cells.

    The loaded frame is stored in the module-level ``loaded_data`` variable so
    that later cells can pick it up; previously the frame was only a local
    variable although the message claimed it had been saved.

    Args:
        event: The button instance passed by ``on_click`` (unused).
    """
    global loaded_data

    if not selected_dataset:
        with output:
            clear_output()
            print("⚠️ Please select or upload a dataset before applying.")
        return

    progress_bar.layout.visibility = 'visible'
    progress_bar.value = 0

    # Simulated processing: the bar only animates, no work happens here.
    total_steps = 10
    for step in range(1, total_steps + 1):
        time.sleep(0.3)
        progress_bar.value = step / total_steps  # avoids float drift from repeated += 0.1

    progress_bar.layout.visibility = 'hidden'

    with output:
        clear_output()
        print(f"📂 Using dataset: {selected_dataset}")
        data = load_dataset(selected_dataset)
        loaded_data = data  # published only after a successful load
        display(data)
        print("✅ Dataset has been saved for the next steps.")

apply_button.on_click(apply_selection)

HBox(children=(VBox(children=(Label(value='Select a Dataset:'), Dropdown(description='Dataset:', options=('Sel…

Button(button_style='success', description='Apply', style=ButtonStyle())

FloatProgress(value=0.0, bar_style='info', description='Processing:', layout=Layout(visibility='hidden'), max=…

VBox(children=(Output(),), layout=Layout(border='1px solid #ccc', height='300px', overflow='auto'))

## 4. Define preprocessing steps

In [None]:
# Define preprocessing techniques
def remove_irrelevant_columns(data, columns):
    """Return a copy of ``data`` without ``columns``; unknown names are skipped."""
    trimmed = data.drop(labels=columns, axis=1, errors='ignore')
    return trimmed

def remove_noise(data):
    """Drop rows whose numeric values lie outside the 1.5 * IQR fences.

    Columns are processed one after another, so each column's quartiles are
    computed on the rows that survived the previous columns. Rows holding NaN
    in a column are kept, because both fence comparisons evaluate to False.
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    for name in numeric_columns:
        lower_quartile = data[name].quantile(0.25)
        upper_quartile = data[name].quantile(0.75)
        spread = upper_quartile - lower_quartile
        below_fence = data[name] < (lower_quartile - 1.5 * spread)
        above_fence = data[name] > (upper_quartile + 1.5 * spread)
        data = data[~(below_fence | above_fence)]
    return data

def remove_missing_values(data):
    """Return ``data`` without any row that contains a missing value."""
    complete_rows = data.dropna(axis=0, how='any')
    return complete_rows

def convert_categorical_to_numeric(data):
    """Label-encode every object-dtype column and return the encoded frame.

    Works on a copy so the caller's DataFrame is not modified (the earlier
    version overwrote the columns of the frame that was passed in).

    Args:
        data: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame in which each object column holds ``LabelEncoder``
        integer codes of the stringified values.
    """
    encoded = data.copy()
    for column in encoded.select_dtypes(include=['object']).columns:
        # astype(str) also turns missing values into a category of their own.
        encoded[column] = LabelEncoder().fit_transform(encoded[column].astype(str))
    return encoded

def convert_to_datetime(data, columns):
    """Parse the given columns as datetimes and return the converted frame.

    Works on a copy so the caller's DataFrame is not modified.

    Args:
        data: Input ``pandas.DataFrame``.
        columns: Iterable of column names to convert.

    Returns:
        A new DataFrame; values that cannot be parsed become ``NaT``.

    Raises:
        KeyError: If a listed column does not exist.
    """
    converted = data.copy()
    for column in columns:
        converted[column] = pd.to_datetime(converted[column], errors='coerce')
    return converted

def normalize_columns(data):
    """Rescale every numeric column with ``MinMaxScaler``.

    Works on a copy so the caller's DataFrame is not modified.

    Args:
        data: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame whose numeric columns were each fitted and transformed
        by their own ``MinMaxScaler``; other columns are unchanged.
    """
    normalized = data.copy()
    for column in normalized.select_dtypes(include=[np.number]).columns:
        # Double brackets: the scaler expects a 2-D input.
        normalized[column] = MinMaxScaler().fit_transform(normalized[[column]])
    return normalized

def standardize_columns(data):
    """Standardize every numeric column with ``StandardScaler``.

    Works on a copy so the caller's DataFrame is not modified.

    Args:
        data: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame whose numeric columns were each fitted and transformed
        by their own ``StandardScaler``; other columns are unchanged.
    """
    standardized = data.copy()
    for column in standardized.select_dtypes(include=[np.number]).columns:
        # Double brackets: the scaler expects a 2-D input.
        standardized[column] = StandardScaler().fit_transform(standardized[[column]])
    return standardized

def tokenize_text(data, tokenizer=None):
    """Tokenize every object-dtype column and return the tokenized frame.

    Works on a copy so the caller's DataFrame is not modified.

    Args:
        data: Input ``pandas.DataFrame``.
        tokenizer: Object with an ``encode(text, truncation=..., padding=...)``
            method. When omitted, a module-level ``tokenizer`` is used, which
            keeps existing one-argument calls working.

    Returns:
        A new DataFrame in which each object column holds the encoded value
        for non-blank strings and ``[]`` for everything else.

    Raises:
        NameError: If there are text columns but no tokenizer was passed and
            no module-level ``tokenizer`` exists.
    """
    tokenized = data.copy()
    text_columns = tokenized.select_dtypes(include=['object']).columns
    if len(text_columns) == 0:
        return tokenized

    if tokenizer is None:
        # Fall back to the notebook-level tokenizer the one-argument form relied on.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined; pass tokenizer=... to tokenize_text")

    def encode_cell(value):
        if isinstance(value, str) and value.strip():
            return tokenizer.encode(str(value), truncation=True, padding=True)
        return []

    for column in text_columns:
        tokenized[column] = tokenized[column].apply(encode_cell)
    return tokenized

## Split into test and train

*   
*   



## 3. Define algorithms for preprocessing


*   AutoML
*   Search-based Methods
*   Feature-Based Ordering



### AutoML


1.   TPOT
2.   H2O.ai


In [None]:
!pip install tpot

Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xgboost>=1.1.0 (from tpot)
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost>=1.1.0->tpot)
  Downloading nvidia_nccl_cu12-2.25.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading deap-1.4.2-cp311-cp311-manylinu

In [None]:
# AutoML - TPOT

# !pip install tpot

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def automl_algorithm(data, target_column='target'):
    """AutoML: let TPOT pick a model and its parameters.

    Args:
        data: ``pandas.DataFrame`` holding the features and the target.
        target_column: Name of the target column. Defaults to ``'target'``,
            which was previously hard-coded.

    Returns:
        The fitted ``TPOTClassifier``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.

    Side effects:
        Prints the test accuracy and writes the best pipeline to
        ``best_model_pipeline.py`` in the working directory.
    """
    # Separate the features from the target.
    features = data.drop(columns=[target_column])
    labels = data[target_column]

    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # Build and fit the TPOT model.
    tpot = TPOTClassifier(generations=5, population_size=20, random_state=42, verbosity=2)
    tpot.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    accuracy = accuracy_score(y_test, tpot.predict(X_test))
    print(f"AutoML Model Accuracy: {accuracy*100:.2f}%")

    # Export the optimized pipeline as Python code.
    tpot.export('best_model_pipeline.py')
    print("Best model pipeline exported to 'best_model_pipeline.py'.")

    return tpot

In [None]:
!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.6.tar.gz (265.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.8/265.8 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tabulate (from h2o)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.46.0.6-py2.py3-none-any.whl size=265859786 sha256=4a4059b3e49757be98e8220352ebb66a4d418fce96d0cd524d0533f7be054cf7
  Stored in directory: /root/.cache/pip/wheels/62/f9/aa/687bd54342d2981bc78e22ee9b9bc39f92006e344e7aa1e0ac
Successfully built h2o
Installing collected packages: tabulate, h2o
Successfully installed h2o-3.46.0.6 tabulate-0.9.0


In [None]:
# AutoML - H2o.ai

# !pip install h2o

import h2o
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def automl_algorithm_h2o(data, target_column='target'):
    """AutoML: let H2O.ai pick a model and its parameters.

    Accuracy is now computed from the predictions on the held-out split. The
    earlier version read ``aml.leaderboard.loc[0, 'accuracy']``, but the
    leaderboard is an H2OFrame and has neither ``.loc`` nor an ``accuracy``
    column. The target is also converted to a categorical column so that
    AutoML trains classifiers, which is what an accuracy score requires.

    Args:
        data: ``pandas.DataFrame`` holding the features and the target.
        target_column: Name of the target column. Defaults to ``'target'``,
            which was previously hard-coded.

    Returns:
        The trained ``H2OAutoML`` object.

    Side effects:
        Starts (or connects to) a local H2O cluster and prints the accuracy.
    """
    # Start H2O and convert the data to an H2OFrame.
    h2o.init()
    data_h2o = h2o.H2OFrame(data)

    # A categorical response makes AutoML treat the task as classification.
    data_h2o[target_column] = data_h2o[target_column].asfactor()

    # Split into train and test sets.
    train, test = data_h2o.split_frame(ratios=[0.8], seed=42)

    # Configure and train H2O AutoML; all non-target columns are predictors.
    aml = H2OAutoML(max_models=20, seed=42)
    aml.train(y=target_column, training_frame=train)

    # Predict with the leader model on the held-out split.
    predictions = aml.predict(test)

    # Compare as strings so label dtype differences cannot cause false mismatches.
    y_true = test[target_column].as_data_frame()[target_column].astype(str)
    y_pred = predictions["predict"].as_data_frame()["predict"].astype(str)
    accuracy = accuracy_score(y_true, y_pred)
    print(f"AutoML Model Accuracy: {accuracy * 100:.2f}%")

    return aml

### Search-based Methods


*   Randomized Search CV
*   Sequential Model-based Optimization (SMBO)



In [None]:
# Search-based methods - RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def search_based_algorithm(data, target_column='target'):
    """Search-based method: randomized search over models and hyperparameters.

    Runs ``RandomizedSearchCV`` for a random forest, an SVM and a logistic
    regression, and keeps the tuned estimator with the highest accuracy on a
    20% hold-out split.

    Args:
        data: ``pandas.DataFrame`` holding the features and the target.
        target_column: Name of the target column. Defaults to ``'target'``,
            which was previously hard-coded.

    Returns:
        The best fitted estimator.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    features = data.drop(columns=[target_column])
    labels = data[target_column]

    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # Candidate models.
    models = {
        'RandomForest': RandomForestClassifier(),
        'SVM': SVC(),
        'LogisticRegression': LogisticRegression()
    }

    # Hyperparameter space per model.
    param_distributions = {
        'RandomForest': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        },
        'SVM': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        },
        'LogisticRegression': {
            'C': [0.1, 1, 10],
            'solver': ['lbfgs', 'liblinear']
        }
    }

    best_model = None
    # Start below any possible accuracy so a model is returned even when every
    # candidate scores 0.0 (with 0.0 and a strict ">" the result was None).
    best_score = float('-inf')

    for model_name, model in models.items():
        print(f"Running RandomizedSearchCV for {model_name}...")

        random_search = RandomizedSearchCV(model, param_distributions[model_name], n_iter=10, cv=5, random_state=42, n_jobs=-1)
        random_search.fit(X_train, y_train)

        # Score the refitted best estimator on the hold-out split.
        accuracy = accuracy_score(y_test, random_search.predict(X_test))
        print(f"Best parameters for {model_name}: {random_search.best_params_}")
        print(f"Accuracy: {accuracy * 100:.2f}%")

        if accuracy > best_score:
            best_score = accuracy
            best_model = random_search.best_estimator_

    print(f"\nBest Model: {best_model}")
    print(f"Best Accuracy: {best_score * 100:.2f}%")

    return best_model

In [None]:
# Search-based Methods - GridsearchCV

from sklearn.model_selection import GridSearchCV

def search_based_algorithm_grid_search(data, target_column='target'):
    """Search-based method: grid search over models and hyperparameters.

    Runs ``GridSearchCV`` for a random forest, an SVM and a logistic
    regression, and keeps the tuned estimator with the highest accuracy on a
    20% hold-out split.

    Args:
        data: ``pandas.DataFrame`` holding the features and the target.
        target_column: Name of the target column. Defaults to ``'target'``,
            which was previously hard-coded.

    Returns:
        The best fitted estimator.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    features = data.drop(columns=[target_column])
    labels = data[target_column]

    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    # Candidate models.
    models = {
        'RandomForest': RandomForestClassifier(),
        'SVM': SVC(),
        'LogisticRegression': LogisticRegression()
    }

    # Hyperparameter grid per model.
    param_grid = {
        'RandomForest': {
            'n_estimators': [50, 100],
            'max_depth': [None, 10],
            'min_samples_split': [2, 5]
        },
        'SVM': {
            'C': [0.1, 1],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale']
        },
        'LogisticRegression': {
            'C': [0.1, 1],
            'solver': ['lbfgs']
        }
    }

    best_model = None
    # Start below any possible accuracy so a model is returned even when every
    # candidate scores 0.0 (with 0.0 and a strict ">" the result was None).
    best_score = float('-inf')

    for model_name, model in models.items():
        print(f"Running GridSearchCV for {model_name}...")

        grid_search = GridSearchCV(model, param_grid[model_name], cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Score the refitted best estimator on the hold-out split.
        accuracy = accuracy_score(y_test, grid_search.predict(X_test))
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy * 100:.2f}%")

        if accuracy > best_score:
            best_score = accuracy
            best_model = grid_search.best_estimator_

    print(f"\nBest Model: {best_model}")
    print(f"Best Accuracy: {best_score * 100:.2f}%")

    return best_model

### Feature-based Ordering

*   Random Forest Classifier (ex: Decision tree)
*   Gradient Boosting (ex: XGBoost, LightGBM)



In [None]:
# Feature-Based Ordering
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

def feature_based_ordering(data, target_column='target'):
    """Feature-based ordering: rank features by random-forest importance.

    Args:
        data: ``pandas.DataFrame`` holding the features and the target.
        target_column: Name of the target column. Defaults to ``'target'``,
            which was previously hard-coded.

    Returns:
        A tuple ``(ordered_data, feature_importance_df)``: ``data`` with its
        feature columns sorted from most to least important and the target
        last, and a frame with ``Feature`` / ``Importance`` columns in the
        same order.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    features = data.drop(columns=[target_column])
    labels = data[target_column]

    # Importances are fitted on the 80% training split only.
    X_train, _, y_train, _ = train_test_split(features, labels, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Pair each feature with its importance, most important first.
    feature_importance_df = pd.DataFrame({
        'Feature': features.columns,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    print("Feature Importance Ranking:")
    print(feature_importance_df)

    # Reorder the columns to follow the ranking; the target stays last.
    ordered_features = feature_importance_df['Feature'].tolist()
    ordered_data = data[ordered_features + [target_column]]

    return ordered_data, feature_importance_df

In [None]:
# Feature-Based Ordering
# Gradient Boosting

import xgboost as xgb

def feature_based_ordering_xgb(data, target_column='target'):
    """Feature-based ordering: rank features by XGBoost importance.

    Args:
        data: ``pandas.DataFrame`` holding the features and the target.
        target_column: Name of the target column. Defaults to ``'target'``,
            which was previously hard-coded.

    Returns:
        A tuple ``(ordered_data, feature_importance_df)``: ``data`` with its
        feature columns sorted from most to least important and the target
        last, and a frame with ``Feature`` / ``Importance`` columns in the
        same order.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    features = data.drop(columns=[target_column])
    labels = data[target_column]

    # Importances are fitted on the 80% training split only.
    X_train, _, y_train, _ = train_test_split(features, labels, test_size=0.2, random_state=42)

    # NOTE(review): the target is passed to XGBClassifier as-is; presumably it
    # must already be encoded as numeric class ids - confirm against callers.
    model = xgb.XGBClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Pair each feature with its importance, most important first.
    feature_importance_df = pd.DataFrame({
        'Feature': features.columns,
        'Importance': model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    print("Feature Importance Ranking:")
    print(feature_importance_df)

    # Reorder the columns to follow the ranking; the target stays last.
    ordered_features = feature_importance_df['Feature'].tolist()
    ordered_data = data[ordered_features + [target_column]]

    return ordered_data, feature_importance_df

## 5. Merge steps 3 and 4

In [None]:
!pip install ipywidgets scikit-learn pandas



In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.model_selection import train_test_split
import time
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
import numpy as np
import io  # Import the 'io' module
from google.colab import files

# ------------------------------
# 1. Preprocessing Functions (Defined Here for Clarity)
# ------------------------------
def remove_irrelevant_columns(data, columns):
    """Return a copy of ``data`` without ``columns``; unknown names are skipped."""
    trimmed = data.drop(labels=columns, axis=1, errors='ignore')
    return trimmed

def remove_noise(data):
    """Drop rows whose numeric values lie outside the 1.5 * IQR fences.

    Columns are processed one after another, so each column's quartiles are
    computed on the rows that survived the previous columns. Rows holding NaN
    in a column are kept, because both fence comparisons evaluate to False.
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    for name in numeric_columns:
        lower_quartile = data[name].quantile(0.25)
        upper_quartile = data[name].quantile(0.75)
        spread = upper_quartile - lower_quartile
        below_fence = data[name] < (lower_quartile - 1.5 * spread)
        above_fence = data[name] > (upper_quartile + 1.5 * spread)
        data = data[~(below_fence | above_fence)]
    return data

def remove_missing_values(data):
    """Return ``data`` without any row that contains a missing value."""
    complete_rows = data.dropna(axis=0, how='any')
    return complete_rows

def convert_categorical_to_numeric(data):
    """Label-encode every object-dtype column and return the encoded frame.

    Works on a copy so the caller's DataFrame is not modified (the earlier
    version overwrote the columns of the frame that was passed in).

    Args:
        data: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame in which each object column holds ``LabelEncoder``
        integer codes of the stringified values.
    """
    encoded = data.copy()
    for column in encoded.select_dtypes(include=['object']).columns:
        # astype(str) also turns missing values into a category of their own.
        encoded[column] = LabelEncoder().fit_transform(encoded[column].astype(str))
    return encoded

def convert_to_datetime(data, columns):
    """Parse the given columns as datetimes and return the converted frame.

    Works on a copy so the caller's DataFrame is not modified.

    Args:
        data: Input ``pandas.DataFrame``.
        columns: Iterable of column names to convert.

    Returns:
        A new DataFrame; values that cannot be parsed become ``NaT``.

    Raises:
        KeyError: If a listed column does not exist.
    """
    converted = data.copy()
    for column in columns:
        converted[column] = pd.to_datetime(converted[column], errors='coerce')
    return converted

def normalize_columns(data):
    """Rescale every numeric column with ``MinMaxScaler``.

    Works on a copy so the caller's DataFrame is not modified.

    Args:
        data: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame whose numeric columns were each fitted and transformed
        by their own ``MinMaxScaler``; other columns are unchanged.
    """
    normalized = data.copy()
    for column in normalized.select_dtypes(include=[np.number]).columns:
        # Double brackets: the scaler expects a 2-D input.
        normalized[column] = MinMaxScaler().fit_transform(normalized[[column]])
    return normalized

def standardize_columns(data):
    """Standardize every numeric column with ``StandardScaler``.

    Works on a copy so the caller's DataFrame is not modified.

    Args:
        data: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame whose numeric columns were each fitted and transformed
        by their own ``StandardScaler``; other columns are unchanged.
    """
    standardized = data.copy()
    for column in standardized.select_dtypes(include=[np.number]).columns:
        # Double brackets: the scaler expects a 2-D input.
        standardized[column] = StandardScaler().fit_transform(standardized[[column]])
    return standardized

def tokenize_text(data, tokenizer):
    """Tokenize every object-dtype column and return the tokenized frame.

    Missing values now map to ``[]``. The earlier version converted the column
    with ``astype(str)`` before testing ``pd.notna``, so missing values became
    the literal text ``"nan"`` and were tokenized. Works on a copy so the
    caller's DataFrame is not modified.

    Args:
        data: Input ``pandas.DataFrame``.
        tokenizer: Object with an ``encode(text, truncation=..., padding=...)``
            method.

    Returns:
        A new DataFrame in which each object column holds the encoded value of
        ``str(cell)``, or ``[]`` where the cell was missing.
    """
    def encode_cell(value):
        # is_scalar guards pd.isna against list/array cells, where it is elementwise.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return []
        return tokenizer.encode(str(value), truncation=True, padding=True)

    tokenized = data.copy()
    for column in tokenized.select_dtypes(include=['object']).columns:
        tokenized[column] = tokenized[column].apply(encode_cell)
    return tokenized

# ------------------------------
# 2. Algorithm Selection Widget
# ------------------------------

# Top-level algorithm family; nothing is selected until the user picks one.
algorithm_dropdown = widgets.Dropdown(
    options=['autoML', 'search-based', 'feature-ordering'],
    value=None,  # No initial selection
    description='Algorithm:',
    disabled=False,
    style={'description_width': 'initial'}
)

# Concrete method within the chosen family; starts empty and disabled.
sub_choice_dropdown = widgets.Dropdown(
    options=[],  # Empty initial options
    value=None,
    description='Sub-Algorithm:',
    disabled=True,
    style={'description_width': 'initial'}
)

# Drop any None entry from the options (the list above contains none).
algorithm_dropdown.options = [opt for opt in algorithm_dropdown.options if opt is not None]

def update_sub_options(*args):
    """Refresh the sub-algorithm dropdown to match the selected algorithm.

    Known algorithms get their list of sub-algorithms and the first one is
    selected; anything else empties and disables the dropdown.

    Args:
        *args: Change notification passed by the widget observer (unused).
    """
    sub_options_by_algorithm = {
        'autoML': ['TPOT', 'H2O.ai'],
        'search-based': ['randomized-searchCV', 'SMBO'],
        'feature-ordering': ['random forest classifier', 'Gradient boosting'],
    }

    sub_choice_dropdown.disabled = False
    choices = sub_options_by_algorithm.get(algorithm_dropdown.value)
    if choices is None:
        sub_choice_dropdown.options = []
        sub_choice_dropdown.disabled = True  # Disable if no algorithm selected
    else:
        sub_choice_dropdown.options = choices

    current_options = sub_choice_dropdown.options
    sub_choice_dropdown.value = current_options[0] if current_options else None

algorithm_dropdown.observe(update_sub_options, 'value')

# ------------------------------
# 3. Algorithm Functions (Placeholders)
# ------------------------------

def run_automl(data, sub_algorithm):
    """Announce the run and return ``data`` preprocessed for the AutoML method.

    Args:
        data: Input DataFrame.
        sub_algorithm: AutoML method name, forwarded to ``preprocess_with_automl``.
    """
    print(f"Running AutoML with {sub_algorithm}...")  # Placeholder
    # The caller is responsible for displaying the result.
    return preprocess_with_automl(data, sub_algorithm)

def run_search_based(data, sub_algorithm):
    """Announce the run and return ``data`` preprocessed for the search method.

    Args:
        data: Input DataFrame.
        sub_algorithm: Search method name, forwarded to
            ``preprocess_with_search_methods``.
    """
    print(f"Running Search-Based method with {sub_algorithm}...")  # Placeholder
    # The caller is responsible for displaying the result.
    return preprocess_with_search_methods(data, sub_algorithm)

def run_feature_ordering(data, sub_algorithm):
    """Announce the run and return ``data`` preprocessed for feature ordering.

    Args:
        data: Input DataFrame.
        sub_algorithm: Technique name, forwarded to
            ``preprocess_with_feature_ordering``.
    """
    print(f"Running Feature Ordering with {sub_algorithm}...")  # Placeholder
    # The caller is responsible for displaying the result.
    return preprocess_with_feature_ordering(data, sub_algorithm)

# ------------------------------
# plus. تعریف توابع جدا جدا و map کردن انها
# ------------------------------

def preprocess_with_automl(data, sub_algorithm):
    """Preprocess ``data`` for the selected AutoML sub-algorithm.

    ``'TPOT'`` drops rows with missing values, label-encodes text columns and
    standardizes the numeric columns; ``'H2O.ai'`` only drops rows with
    missing values.

    Args:
        data: Input DataFrame; it is copied, not modified.
        sub_algorithm: ``'TPOT'`` or ``'H2O.ai'``.

    Returns:
        The preprocessed DataFrame.

    Raises:
        ValueError: If ``sub_algorithm`` is not one of the supported names.
    """
    processed_data = data.copy()

    if sub_algorithm == 'TPOT':
        processed_data = remove_missing_values(processed_data)
        processed_data = convert_categorical_to_numeric(processed_data)

        # Check how many numeric columns exist. The earlier `columns.any()`
        # tested the truthiness of the column labels instead, so a lone
        # column labelled 0 or "" was skipped.
        # NOTE(review): every numeric column is scaled, which after label
        # encoding includes the former text columns - confirm that a target
        # column should be scaled too.
        numeric_columns = processed_data.select_dtypes(include=np.number).columns
        if len(numeric_columns) > 0:
            processed_data[numeric_columns] = StandardScaler().fit_transform(processed_data[numeric_columns])
        return processed_data
    elif sub_algorithm == 'H2O.ai':
        processed_data = remove_missing_values(processed_data)
        return processed_data
    else:
        raise ValueError("Invalid AutoML sub-algorithm.")

def preprocess_with_search_methods(data, method):
    """Preprocess ``data`` for the selected search-based method.

    Both supported methods currently apply the same step: rows with missing
    values are dropped from a copy of ``data``.

    Args:
        data: Input DataFrame; it is copied, not modified.
        method: ``'randomized-searchCV'`` or ``'SMBO'``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    working_copy = data.copy()
    if method not in ('randomized-searchCV', 'SMBO'):
        raise ValueError("Invalid search-based method.")
    return remove_missing_values(working_copy)

def preprocess_with_feature_ordering(data, technique):
    """Preprocess ``data`` for the selected feature-ordering technique.

    Both supported techniques currently apply the same step: rows with missing
    values are dropped from a copy of ``data``.

    Args:
        data: Input DataFrame; it is copied, not modified.
        technique: ``'random forest classifier'`` or ``'Gradient boosting'``.

    Raises:
        ValueError: If ``technique`` is not one of the supported names.
    """
    working_copy = data.copy()
    if technique not in ('random forest classifier', 'Gradient boosting'):
        raise ValueError("Invalid feature-ordering technique.")
    return remove_missing_values(working_copy)

# ------------------------------
# 4. Main Run Function
# ------------------------------

run_button = widgets.Button(description="Apply Process", button_style="success")

def on_run_clicked(b):
    """Run the selected algorithm on ``loaded_data`` and show the result.

    Messages and the head of the result are written to the shared ``output``
    widget. On success the result is stored in the module-level
    ``processed_data_saved``; if the run raises, that variable is set to
    ``None``.

    Args:
        b: The button instance passed by ``on_click`` (unused).
    """
    global processed_data_saved

    progress_bar_process = widgets.FloatProgress(
        value=0.0,
        min=0,
        max=1.0,
        description="Processing:",
        bar_style="info",
        layout=widgets.Layout(visibility='visible')
    )
    display(progress_bar_process)

    def hide_progress():
        progress_bar_process.layout.visibility = 'hidden'

    with output:
        clear_output()

        # Guard: a dataset must have been loaded into the notebook namespace.
        if 'loaded_data' not in globals() or loaded_data is None:
            print("Please load a dataset first.")
            hide_progress()
            return

        algorithm = algorithm_dropdown.value
        sub_algorithm = sub_choice_dropdown.value

        # Guard: both dropdowns need a selection.
        if not (algorithm and sub_algorithm):
            print("Please select both an algorithm and a sub-algorithm.")
            hide_progress()
            return

        print(f"Running with Algorithm: {algorithm}, Sub-Algorithm: {sub_algorithm}")

        try:
            # 1. Pick the runner for the selected algorithm family.
            runners = {
                'autoML': run_automl,
                'search-based': run_search_based,
                'feature-ordering': run_feature_ordering,
            }
            runner = runners.get(algorithm)
            if runner is None:
                print("Invalid Algorithm Selection")
                return

            processed_data = runner(loaded_data, sub_algorithm)
            if processed_data is None:
                print("No result process to run.")
                return

            # Animate the bar after the work is done; it does not track real progress.
            for _ in range(5):
                time.sleep(0.2)
                progress_bar_process.value += 0.2

            # 2. Show the first rows of the result.
            display(processed_data.head())

            # Keep the result for later cells.
            processed_data_saved = processed_data
            print("Process data saved for future use.")

        except Exception as e:
            print(f"An Error occurred: {e}")
            processed_data_saved = None

        finally:
            hide_progress()


run_button.on_click(on_run_clicked)

# ------------------------------
# 5. Display Widgets
# ------------------------------

display(algorithm_dropdown, sub_choice_dropdown, run_button, output)

Dropdown(description='Algorithm:', options=('autoML', 'search-based', 'feature-ordering'), style=DescriptionSt…

Dropdown(description='Sub-Algorithm:', disabled=True, options=(), style=DescriptionStyle(description_width='in…

Button(button_style='success', description='Apply Process', style=ButtonStyle())

Output()

In [None]:
# ... (Rest of your code: preprocessing functions, algorithm definitions, widgets setup, etc. remains the same) ...

# ------------------------------
# 3. Algorithm Runner Functions - MODIFIED for Dynamic Preprocessing Calls
# ------------------------------

def run_automl(data, sub_algorithm):
    """Preprocess ``data`` for AutoML and return the preprocessed frame.

    No AutoML algorithm is invoked yet; the function is a placeholder that
    only applies the base preprocessing to a copy of ``data``.

    Args:
        data: Input DataFrame; it is copied before preprocessing.
        sub_algorithm: Selected AutoML method (only used in the log message).
    """
    print(f"Running AutoML with {sub_algorithm}...")
    print("Applying preprocessing for AutoML...") # DEBUG

    # Base preprocessing shared by all AutoML methods, applied to a copy.
    without_missing = remove_missing_values(data.copy())
    prepared = convert_categorical_to_numeric(without_missing)

    # Placeholder: a real implementation would call TPOT, auto-sklearn or H2O.ai here.
    print("Calling AutoML algorithm implementation...") # DEBUG
    return prepared


def run_search_based(data, sub_algorithm):
    """Preprocess ``data`` and run the selected search-based algorithm.

    ``'randomized-searchCV'``, the name offered by the sub-algorithm dropdown,
    is now accepted and mapped to ``search_based_algorithm``; previously only
    ``'GridSearchCV'`` and ``'SMBO'`` were recognized, so the dropdown's first
    entry always raised ``ValueError``. ``'GridSearchCV'`` stays supported.

    Args:
        data: Input DataFrame; it is copied before preprocessing.
        sub_algorithm: ``'randomized-searchCV'``, ``'GridSearchCV'`` or
            ``'SMBO'``.

    Returns:
        Whatever the selected algorithm function returns.
        NOTE(review): ``search_based_algorithm_grid_search`` returns a fitted
        estimator, while ``on_run_clicked`` calls ``.head()`` on this result -
        confirm which return type is intended.

    Raises:
        ValueError: If ``sub_algorithm`` is unknown or its function is not
            defined in the notebook namespace.
    """
    # Sub-algorithm name -> name of the notebook-level function implementing it.
    algorithm_functions = {
        'randomized-searchCV': "search_based_algorithm",
        'GridSearchCV': "search_based_algorithm_grid_search",
        'SMBO': "run_smbo_algorithm",  # Placeholder
    }

    print(f"Running Search-Based method with {sub_algorithm}...")

    # Validate before doing any work.
    if sub_algorithm not in algorithm_functions:
        raise ValueError(f"Invalid Sub-Algorithm for Search-based: {sub_algorithm}")
    algorithm_function_name = algorithm_functions[sub_algorithm]
    if algorithm_function_name not in globals():
        raise ValueError(f"Algorithm function '{algorithm_function_name}' not found.")

    # Base preprocessing shared by all search-based methods, applied to a copy.
    print("Applying preprocessing for Search-based...") # DEBUG
    processed_data = data.copy()
    processed_data = remove_missing_values(processed_data)
    processed_data = convert_categorical_to_numeric(processed_data)

    # Looked up by name so a function defined in a later cell is picked up at call time.
    print(f"Dynamically calling algorithm function: {algorithm_function_name}...") # DEBUG
    processed_data = globals()[algorithm_function_name](processed_data)
    return processed_data


def run_feature_ordering(data, sub_algorithm):
    """Preprocess ``data`` for a feature-ordering method (placeholder).

    Only the base preprocessing is applied: the frame is copied and rows with
    missing values are removed. No feature-ordering algorithm is executed
    yet; the cleaned frame is returned as a stand-in for the result.

    Args:
        data: DataFrame to process; it is copied, not modified.
        sub_algorithm: label of the requested technique (only used for logging).

    Returns:
        The copy of ``data`` with incomplete rows removed.
    """
    print(f"Running Feature Ordering with {sub_algorithm}...")
    print("Applying preprocessing for Feature Ordering...") # DEBUG

    # Categorical encoding is intentionally not applied for this family.
    cleaned = remove_missing_values(data.copy())

    # Placeholder: a real implementation would rank features here.
    print("Calling Feature Ordering algorithm implementation...") # DEBUG
    return cleaned

# ... (Rest of your code: preprocess_with_..., search_based_algorithm_grid_search, on_run_clicked, widgets display - remain the same) ...

In [None]:
!pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
Successfully installed scikit-learn-1.6.1


In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import time
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
import numpy as np
import io
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ------------------------------
# 1. Define Preprocessing Functions
# ------------------------------
def remove_irrelevant_columns(data, columns):
    """Return a copy of ``data`` without the listed columns.

    Labels in ``columns`` that do not exist in the frame are silently
    skipped; the input frame is left untouched.
    """
    trimmed = data.drop(labels=columns, axis="columns", errors='ignore')
    return trimmed

def remove_noise(data):
    """Drop outlier rows from numeric columns using the 1.5 * IQR rule.

    Columns are processed one after another, so the quartiles of a column
    are computed on the rows that survived the previous columns' filters.
    Rows are removed only when a value lies strictly outside the fences.
    """
    numeric_names = data.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        lower_quartile = data[name].quantile(0.25)
        upper_quartile = data[name].quantile(0.75)
        spread = upper_quartile - lower_quartile
        below_fence = data[name] < (lower_quartile - 1.5 * spread)
        above_fence = data[name] > (upper_quartile + 1.5 * spread)
        data = data[~(below_fence | above_fence)]
    return data

def remove_missing_values(data):
    """Return ``data`` without any row that contains a missing value."""
    complete_rows = data.dropna(axis=0, how='any')
    return complete_rows

def convert_categorical_to_numeric(data):
    """Label-encode every object-dtype column of ``data`` in place.

    Each column is cast to ``str`` and encoded with its own ``LabelEncoder``.
    The frame passed in is modified and also returned.
    """
    object_columns = data.select_dtypes(include=['object']).columns
    for name in object_columns:
        as_text = data[name].astype(str)
        data[name] = LabelEncoder().fit_transform(as_text)
    return data

def convert_to_datetime(data, columns):
    """Parse the listed columns of ``data`` as datetimes, in place.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``).
    The frame passed in is modified and also returned.
    """
    for name in columns:
        parsed = pd.to_datetime(data[name], errors='coerce')
        data[name] = parsed
    return data

def normalize_columns(data):
    """Rescale each numeric column of ``data`` with ``MinMaxScaler``, in place.

    Every column gets its own scaler instance. The frame passed in is
    modified and also returned.
    """
    numeric_names = data.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        scaler = MinMaxScaler()
        data[name] = scaler.fit_transform(data[[name]])
    return data

def standardize_columns(data):
    """Standardize each numeric column of ``data`` with ``StandardScaler``, in place.

    Every column gets its own scaler instance. The frame passed in is
    modified and also returned.
    """
    numeric_names = data.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        scaler = StandardScaler()
        data[name] = scaler.fit_transform(data[[name]])
    return data

def tokenize_text(data, tokenizer):
    """Replace every object-dtype column of ``data`` with token lists, in place.

    Each value is cast to ``str`` and passed to ``tokenizer.encode`` with
    ``truncation=True`` and ``padding=True``; blank or whitespace-only
    strings map to an empty list. The frame is modified and also returned.
    """
    def _encode(value):
        # Only non-blank strings are sent to the tokenizer.
        if isinstance(value, str) and value.strip():
            return tokenizer.encode(str(value), truncation=True, padding=True)
        return []

    text_columns = data.select_dtypes(include=['object']).columns
    for name in text_columns:
        data[name] = data[name].astype(str).apply(_encode)
    return data

# ------------------------------
# 2. Algorithm Selection Widgets
# ------------------------------
# Top-level choice of algorithm family; created with no pre-selected value.
algorithm_dropdown = widgets.Dropdown(
    options=['autoML', 'search-based', 'feature-ordering'],
    value=None,
    description='Algorithm:',
    disabled=False,
    style={'description_width': 'initial'}
)

# Second-level choice; starts empty and disabled until update_sub_options
# fills it in for the selected algorithm family.
sub_choice_dropdown = widgets.Dropdown(
    options=[],
    value=None,
    description='Sub-Algorithm:',
    disabled=True,
    style={'description_width': 'initial'}
)

# Re-assigns the options with any None entries filtered out. None of the
# literal options above is None, so the list contents do not change.
# NOTE(review): this runs before the 'value' observer is registered, so any
# selection change it triggers would not refresh the sub-dropdown — confirm
# how the installed ipywidgets version reacts to an options re-assignment.
algorithm_dropdown.options = [opt for opt in algorithm_dropdown.options if opt is not None]

def update_sub_options(*args):
    """Populate the sub-algorithm dropdown for the selected algorithm family.

    Used as an observer callback; the positional arguments supplied by the
    observer are ignored. For a recognised family the sub-dropdown is enabled,
    filled and its first entry selected. Otherwise it is emptied and disabled.
    """
    sub_options_by_algorithm = {
        'autoML': ['TPOT', 'H2O.ai'],
        'search-based': ['GridSearchCV', 'SMBO'],
        'feature-ordering': ['random forest classifier', 'Gradient boosting'],
    }

    selected_algorithm = algorithm_dropdown.value
    sub_choice_dropdown.disabled = False

    matching_options = sub_options_by_algorithm.get(selected_algorithm)
    if matching_options is None:
        # Nothing (or something unknown) is selected: lock the sub-dropdown.
        sub_choice_dropdown.options = []
        sub_choice_dropdown.disabled = True
    else:
        sub_choice_dropdown.options = matching_options

    # Default to the first available sub-algorithm, if there is one.
    available = sub_choice_dropdown.options
    sub_choice_dropdown.value = available[0] if available else None

# Refresh the sub-algorithm choices whenever the algorithm dropdown's value changes.
algorithm_dropdown.observe(update_sub_options, 'value')

# ------------------------------
# 3. Algorithm Runner Functions
# ------------------------------

def run_automl(data, sub_algorithm):
    """Log the AutoML request and return the AutoML-preprocessed data.

    No AutoML library is invoked here; the result is whatever
    ``preprocess_with_automl`` returns for ``data`` and ``sub_algorithm``.
    """
    print(f"Running AutoML with {sub_algorithm}...")
    return preprocess_with_automl(data, sub_algorithm)

def run_search_based(data, sub_algorithm):
    """Dispatch ``data`` to the requested search-based method.

    ``'GridSearchCV'`` is delegated to ``search_based_algorithm_grid_search``
    with the raw input; ``'SMBO'`` is a placeholder that only applies the
    search-method preprocessing.

    Raises:
        ValueError: if ``sub_algorithm`` is neither of the two labels.
    """
    print(f"Running Search-Based method with {sub_algorithm}...")

    if sub_algorithm == 'GridSearchCV':
        return search_based_algorithm_grid_search(data) # Calling external function

    if sub_algorithm == 'SMBO':
        print("SMBO is a placeholder algorithm.")
        return preprocess_with_search_methods(data, sub_algorithm)

    raise ValueError(f"Invalid Sub-Algorithm for Search-based: {sub_algorithm}")

def run_feature_ordering(data, sub_algorithm):
    """Log the feature-ordering request and return the preprocessed data.

    No ranking is performed here; the result is whatever
    ``preprocess_with_feature_ordering`` returns.
    """
    print(f"Running Feature Ordering with {sub_algorithm}...")
    return preprocess_with_feature_ordering(data, sub_algorithm)

# ------------------------------
# plus. Preprocessing Implementations (Separated Blocks)
# ------------------------------

def preprocess_with_automl(data, sub_algorithm):
    """Prepare a copy of ``data`` for an AutoML run.

    Steps, applied to a copy so the caller's frame is not modified:
    drop rows with missing values, label-encode object columns, then
    standardize all numeric columns together with one ``StandardScaler``.

    Args:
        data: input DataFrame.
        sub_algorithm: selected AutoML backend label; currently unused, kept
            so all ``preprocess_with_*`` helpers share one signature.

    Returns:
        The preprocessed DataFrame.
    """
    processed_data = data.copy()
    processed_data = remove_missing_values(processed_data)
    processed_data = convert_categorical_to_numeric(processed_data)

    # Select the numeric columns once and test for emptiness explicitly.
    # ``Index.any()`` evaluates the truthiness of the column *labels*, so a
    # frame whose numeric columns are named 0 or "" would skip scaling.
    numeric_columns = processed_data.select_dtypes(include=np.number).columns
    if len(numeric_columns) > 0:
        processed_data[numeric_columns] = StandardScaler().fit_transform(processed_data[numeric_columns])
    return processed_data

def preprocess_with_search_methods(data, method):
    """Return a copy of ``data`` with incomplete rows removed.

    ``method`` is accepted for signature parity but is not used.
    """
    return remove_missing_values(data.copy())

def preprocess_with_feature_ordering(data, technique):
    """Return a copy of ``data`` with incomplete rows removed.

    ``technique`` is accepted for signature parity but is not used.
    """
    return remove_missing_values(data.copy())

# ------------------------------
# 6. Main Run Function
# ------------------------------

# Button that starts the selected preprocessing/algorithm run.
run_button = widgets.Button(description="Apply Process", button_style="success")
# Progress indicator in the range 0..1; created hidden and made visible by the click handler.
progress_bar_process = widgets.FloatProgress(
    value=0.0, min=0, max=1.0, description="Processing:", bar_style="info", layout=widgets.Layout(visibility='hidden'))
# Output area that receives the preview (head) of the processed data.
output_table = widgets.Output()
# Output area for status messages printed by the click handler.
output = widgets.Output()

# Upload widget shared by all clicks; created lazily on the first click.
# It must outlive a single handler call so the user has time to pick a file.
_file_selection = None


def _get_file_selection():
    """Return the persistent CSV upload widget, creating it on first use."""
    global _file_selection
    if _file_selection is None:
        _file_selection = widgets.FileUpload(accept=".csv", multiple=False)
    return _file_selection


def _first_uploaded_content(upload_value):
    """Return the bytes of the first uploaded file, or None if nothing was uploaded.

    Accepts both layouts of ``FileUpload.value``: a mapping of
    filename -> file info (ipywidgets 7) and a tuple of file-info dicts
    (ipywidgets 8).
    """
    if not upload_value:
        return None
    if hasattr(upload_value, "values"):
        entries = list(upload_value.values())
    else:
        entries = list(upload_value)
    # ``content`` may be ``bytes`` or a ``memoryview``; normalise to bytes.
    return bytes(entries[0]['content'])


def on_run_clicked(b):
    """Handle a click on the run button.

    Shows the upload widget, validates the dropdown selections, loads the
    uploaded CSV, runs the runner function for the selected algorithm family
    and displays the head of the result in ``output_table``. On success the
    result is stored in the global ``processed_data_saved``; if the run
    fails, the error is printed and that global is reset to ``None``.

    Usage: the first click shows the uploader; after a file has been
    uploaded, clicking again processes it.

    Args:
        b: the button instance passed by ipywidgets (unused).
    """
    global processed_data_saved

    # Reuse the same uploader on every click. A widget created inside the
    # handler would always be empty at the moment its value is checked.
    file_selection = _get_file_selection()
    file_selection.layout.visibility = 'visible'

    with output:
        clear_output()
        display(file_selection, progress_bar_process, output_table)

        algorithm = algorithm_dropdown.value
        sub_algorithm = sub_choice_dropdown.value

        if not algorithm or not sub_algorithm:
            print("Please select both an algorithm and a sub-algorithm.")
            progress_bar_process.layout.visibility = 'hidden'
            return

        file_content = _first_uploaded_content(file_selection.value)
        if file_content is None:
            # Keep the uploader visible so the user can actually pick a file.
            print("Please select a dataset from file selection area.")
            progress_bar_process.layout.visibility = 'hidden'
            return

        # Explicit dispatch table: deriving the function name from the label
        # produced 'run_autoML', which does not match ``run_automl``.
        runners = {
            'autoML': run_automl,
            'search-based': run_search_based,
            'feature-ordering': run_feature_ordering,
        }

        try:
            progress_bar_process.layout.visibility = 'visible'
            progress_bar_process.value = 0.0

            loaded_data = pd.read_csv(io.BytesIO(file_content))
            print(f"Dataset loaded successfully. Data shape: {loaded_data.shape}")

            print(f"Running with Algorithm: {algorithm}, Sub-Algorithm: {sub_algorithm}")

            # Cosmetic progress animation; it does not track real work.
            num_steps = 20
            for i in range(num_steps):
                time.sleep(0.05)
                progress_bar_process.value = (i + 1) / num_steps

            runner = runners.get(algorithm)
            if runner is None:
                raise ValueError(f"Algorithm runner function for '{algorithm}' not found.")
            processed_data = runner(loaded_data, sub_algorithm)

            if processed_data is None:
                raise ValueError("No result process to run.")

            with output_table:
                clear_output()
                display(processed_data.head())

            processed_data_saved = processed_data
            print("Process data saved for future use.")

        except ValueError as ve:
            print(f"ValueError: {ve}")
            processed_data_saved = None
        except Exception as e:
            # UI boundary: report the failure instead of losing it in the widget callback.
            print(f"An Error occurred: {e}")
            processed_data_saved = None

        finally:
            progress_bar_process.layout.visibility = 'hidden'

# Run the handler every time the "Apply Process" button is clicked.
run_button.on_click(on_run_clicked)

# ------------------------------
# 7. Display Widgets
# ------------------------------

# Vertical layout: the two dropdowns side by side, then the run button,
# the progress bar, the result preview and the status output.
widgets_box = widgets.VBox([
    widgets.HBox([algorithm_dropdown, sub_choice_dropdown]),
    run_button,
    progress_bar_process,
    output_table,
    output
])

display(widgets_box)

VBox(children=(HBox(children=(Dropdown(description='Algorithm:', options=('autoML', 'search-based', 'feature-o…

In [None]:
# ... (Import statements and preprocessing functions remain the same) ...

# ------------------------------
# 3. Algorithm and Sub-Algorithm Mapping
# ------------------------------

# Two-level lookup table: algorithm family -> sub-algorithm label -> function
# that implements it.
# NOTE(review): none of the functions referenced below are defined in the
# visible part of this notebook; this cell presumably raises NameError until
# they exist — confirm they are defined elsewhere.
# NOTE(review): 'auto-sklearn' is not one of the options that
# update_sub_options offers for 'autoML' ('TPOT', 'H2O.ai') — confirm intent.
algorithm_map = {
    'autoML': {
        'TPOT': run_automl_tpot,  # Replace with your actual TPOT function
        'auto-sklearn': run_automl_auto_sklearn,  # Replace with your actual auto-sklearn function
        'H2O.ai': run_automl_h2o  # Replace with your actual H2O.ai function
    },
    'search-based': {
        'GridSearchCV': search_based_algorithm_grid_search,
        'SMBO': run_smbo  # Replace with your actual SMBO function
    },
    'feature-ordering': {
        'random forest classifier': feature_based_ordering_random_forest,  # Replace with your actual Random Forest function
        'Gradient boosting': feature_based_ordering_gradient_boosting  # Replace with your actual Gradient Boosting function
    }
}

# ... (Algorithm runner functions remain the same) ...

# ------------------------------
# 6. Main Run Function (Modified)
# ------------------------------

def on_run_clicked(b):
    # NOTE(review): this is a sketch, not a runnable definition. The `try`
    # below has no `except`/`finally` clause, and `loaded_data` is never
    # assigned in this fragment; the elided parts ("...") must be filled in
    # before this cell can replace the earlier on_run_clicked.
    # ... (File loading and initial checks remain the same) ...

    try:
        # ... (Progress bar setup remains the same) ...

        # Get the selected algorithm and sub-algorithm
        algorithm = algorithm_dropdown.value
        sub_algorithm = sub_choice_dropdown.value

        # Get the corresponding function from the algorithm map
        # (an unknown algorithm yields an empty dict, so the result is None).
        selected_function = algorithm_map.get(algorithm, {}).get(sub_algorithm)

        # Check if the function is found
        if selected_function:
            # Call the selected function with the loaded data
            # (only the data is passed here, not the sub-algorithm label).
            processed_data = selected_function(loaded_data)
        else:
            raise ValueError(f"No function found for algorithm '{algorithm}' and sub-algorithm '{sub_algorithm}'")

        # ... (Rest of the on_run_clicked function remains the same) ...

## 6.Download and load "Phi-3.5-mini-instruct" Language Model

## 7. Define fine-tuning for the language model

## 8.Apply fine-tuning on two types of datasets


*   Raw dataset
*   Preprocessed dataset



## 9.Define Metrics

## 10.Define Evaluation

## 11. Visualize the results



In [None]:
# 1. Upload and load dataset
import pandas as pd
import io

from google.colab import files

print("Please upload your dataset")
# The result is used below as a mapping of filename -> raw file bytes.
uploaded = files.upload()

for filename in uploaded.keys():
    # The `try` needs a handler and the success messages must sit at the
    # loop's indentation level; otherwise the cell does not even parse.
    try:
        df = pd.read_csv(io.StringIO(uploaded[filename].decode('utf-8')))
    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Report the unreadable file and move on to the next upload.
        print(f"Could not load '{filename}': {err}")
        continue
    print("Dataset loaded sucessfully!")
    print(df.head())



Please upload your dataset


Saving ScimagoJR Journals - 1999.csv to ScimagoJR Journals - 1999 (1).csv
Dataset isn't contain numerical and text values
