In [2]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the prepared dataset; low_memory=False parses the file in one pass so
# column dtypes are inferred from the whole column rather than chunk by chunk.
df = pd.read_csv('data_step3.csv', low_memory=False)

# Models

## Random Forest model

In [None]:
# Per-category Random Forest price models, evaluated with 5-fold cross-validation.
# For every value of the 'دسته بندی' column a separate regressor is trained on each
# fold; the fold with the lowest MAPE supplies the stored model, predictions and actuals.

# A single encoder instance is re-fitted for every column, so fitted mappings are not retained
label_encoder = LabelEncoder()

# Per-category results, keyed by category value
models = {}                  # fitted regressor from the best fold
predictions = {}             # best-fold test predictions
best_folds = {}              # index of the best fold
y_test_categories_list = {}  # best-fold test actuals
mape_per_category = {}       # mean MAPE over all folds, in percent

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over unique categories
for category in df['دسته بندی'].unique():

    # Work on a copy so the encoding below never touches the shared frame
    df_category = df[df['دسته بندی'] == category].copy()

    # KFold raises when there are fewer rows than splits; this also covers a NaN
    # category, for which the equality filter above selects no rows at all.
    if len(df_category) < kf.get_n_splits():
        print(f"Skipping category {category}: {len(df_category)} rows is too few for {kf.get_n_splits()}-fold cross-validation")
        continue

    # Drop columns that are completely null for this category
    df_category = df_category.dropna(axis=1, how='all')

    # NOTE(review): every non-price column, numeric ones included, is cast to str and
    # label-encoded, so numeric order is replaced by the order of the string labels
    # (and NaN becomes its own 'nan' label) — confirm this is intended.
    non_price_columns = df_category.columns[df_category.columns != 'price']
    df_category[non_price_columns] = df_category[non_price_columns].astype(str)
    for column in non_price_columns:
        df_category[column] = label_encoder.fit_transform(df_category[column])

    # Select features and target variable
    X_category = df_category.drop(['price', 'دسته بندی'], axis=1)
    y_category = df_category['price']

    # Per-fold bookkeeping
    mape_per_fold = []
    model_per_fold = []
    prediction_per_fold = []
    y_test_per_fold = []

    # Perform k-fold cross-validation
    for train_index, test_index in kf.split(X_category):
        X_train_category, X_test_category = X_category.iloc[train_index], X_category.iloc[test_index]
        y_train_category, y_test_category = y_category.iloc[train_index], y_category.iloc[test_index]

        # Train the Random Forest Regressor for this fold
        model_category = RandomForestRegressor(n_estimators=100, random_state=42)
        model_category.fit(X_train_category, y_train_category)

        # Predict on the held-out part of the fold
        predictions_category_fold = model_category.predict(X_test_category)

        # sklearn returns MAPE as a fraction; scale to percent so it matches the '%' in the report
        mape_fold = mean_absolute_percentage_error(y_test_category, predictions_category_fold) * 100

        mape_per_fold.append(mape_fold)
        model_per_fold.append(model_category)
        prediction_per_fold.append(predictions_category_fold)
        y_test_per_fold.append(y_test_category)

    # Find the fold with the lowest MAPE for the category
    best_fold_idx = int(np.argmin(mape_per_fold))

    # Store the best fold's model, predictions and actuals for the category
    models[category] = model_per_fold[best_fold_idx]
    predictions[category] = prediction_per_fold[best_fold_idx]
    y_test_categories_list[category] = y_test_per_fold[best_fold_idx]
    best_folds[category] = best_fold_idx
    mape_per_category[category] = np.mean(mape_per_fold)
    # Variance is taken over the percent-valued fold MAPEs
    print(f"MAPE Category {category}: Mean= {np.mean(mape_per_fold):.2f}%, Variance={np.var(mape_per_fold):.2f}%")

# Combine predictions from the best-performing folds of the selected categories into a single array
selected_categories = list(models.keys())
if not selected_categories:
    raise ValueError("No category had enough rows for cross-validation")
selected_predictions = np.concatenate([predictions[category] for category in selected_categories])
selected_actuals = np.concatenate([y_test_categories_list[category] for category in selected_categories])

# Overall MAPE (percent) with the same metric as the per-fold scores, which also guards
# against division by a zero price.
# NOTE(review): only each category's best fold is pooled here, so this figure is
# optimistic; the per-category means in mape_per_category average over all folds.
mape_selected = mean_absolute_percentage_error(selected_actuals, selected_predictions) * 100
print(f'\n\nFinal Mean Absolute Percentage Error: {mape_selected:.2f}%')


## SVM model

In [None]:
# Per-category linear SVR price models, evaluated with 5-fold cross-validation.
# For every value of the 'دسته بندی' column a separate regressor is trained on each
# fold; the fold with the lowest MAPE supplies the stored model, predictions and actuals.
# Features and target are standardised inside each fold, using training rows only.

# A single encoder instance is re-fitted for every column, so fitted mappings are not retained
label_encoder = LabelEncoder()

# Per-category results, keyed by category value
models = {}                  # fitted regressor from the best fold
scalers = {}                 # (feature_scaler, target_scaler) fitted alongside that regressor
predictions = {}             # best-fold test predictions, in original price units
best_folds = {}              # index of the best fold
y_test_categories_list = {}  # best-fold test actuals
mape_per_category = {}       # mean MAPE over all folds, in percent

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over unique categories
for category in df['دسته بندی'].unique():

    # Work on a copy so the encoding below never touches the shared frame
    df_category = df[df['دسته بندی'] == category].copy()

    # KFold raises when there are fewer rows than splits; this also covers a NaN
    # category, for which the equality filter above selects no rows at all.
    if len(df_category) < kf.get_n_splits():
        print(f"Skipping category {category}: {len(df_category)} rows is too few for {kf.get_n_splits()}-fold cross-validation")
        continue

    # Drop columns that are completely null for this category
    df_category = df_category.dropna(axis=1, how='all')

    # NOTE(review): every non-price column, numeric ones included, is cast to str and
    # label-encoded, so numeric order is replaced by the order of the string labels
    # (and NaN becomes its own 'nan' label) — confirm this is intended.
    non_price_columns = df_category.columns[df_category.columns != 'price']
    df_category[non_price_columns] = df_category[non_price_columns].astype(str)
    for column in non_price_columns:
        df_category[column] = label_encoder.fit_transform(df_category[column])

    # Select features and target variable
    X_category = df_category.drop(['price', 'دسته بندی'], axis=1)
    y_category = df_category['price']

    # Per-fold bookkeeping
    mape_per_fold = []
    model_per_fold = []
    scaler_per_fold = []
    prediction_per_fold = []
    y_test_per_fold = []

    # Perform k-fold cross-validation
    for train_index, test_index in kf.split(X_category):
        X_train_raw, X_test_raw = X_category.iloc[train_index], X_category.iloc[test_index]
        y_train_raw, y_test_category = y_category.iloc[train_index], y_category.iloc[test_index]

        # Fit the scalers on the training rows only: fitting them on the whole
        # category would leak test-set statistics into training.
        feature_scaler = StandardScaler()
        target_scaler = StandardScaler()
        X_train_category = feature_scaler.fit_transform(X_train_raw)
        X_test_category = feature_scaler.transform(X_test_raw)
        y_train_category = target_scaler.fit_transform(y_train_raw.values.reshape(-1, 1)).flatten()

        # Train the Support Vector Regressor (SVR) for this fold
        model_category = SVR(kernel='linear')  # other kernels such as 'rbf' can be used here
        model_category.fit(X_train_category, y_train_category)

        # Predict on the held-out part of the fold (standardised target space)
        predictions_category_fold = model_category.predict(X_test_category)

        # Map the predictions back to the original price scale
        predictions_category_fold = target_scaler.inverse_transform(predictions_category_fold.reshape(-1, 1)).flatten()

        # sklearn returns MAPE as a fraction; scale to percent so it matches the '%' in the report
        mape_fold = mean_absolute_percentage_error(y_test_category, predictions_category_fold) * 100

        mape_per_fold.append(mape_fold)
        model_per_fold.append(model_category)
        scaler_per_fold.append((feature_scaler, target_scaler))
        prediction_per_fold.append(predictions_category_fold)
        y_test_per_fold.append(y_test_category)

    # Find the fold with the lowest MAPE for the category
    best_fold_idx = int(np.argmin(mape_per_fold))

    # Store the best fold's model (with its scalers), predictions and actuals for the category
    models[category] = model_per_fold[best_fold_idx]
    scalers[category] = scaler_per_fold[best_fold_idx]
    predictions[category] = prediction_per_fold[best_fold_idx]
    y_test_categories_list[category] = y_test_per_fold[best_fold_idx]
    best_folds[category] = best_fold_idx
    mape_per_category[category] = np.mean(mape_per_fold)
    # Variance is taken over the percent-valued fold MAPEs
    print(f"MAPE Category {category}: Mean= {np.mean(mape_per_fold):.2f}%, Variance={np.var(mape_per_fold):.2f}%")

# Combine predictions from the best-performing folds of the selected categories into a single array
selected_categories = list(models.keys())
if not selected_categories:
    raise ValueError("No category had enough rows for cross-validation")
selected_predictions = np.concatenate([predictions[category] for category in selected_categories])
selected_actuals = np.concatenate([y_test_categories_list[category] for category in selected_categories])

# Overall MAPE (percent) with the same metric as the per-fold scores, which also guards
# against division by a zero price.
# NOTE(review): only each category's best fold is pooled here, so this figure is
# optimistic; the per-category means in mape_per_category average over all folds.
mape_selected = mean_absolute_percentage_error(selected_actuals, selected_predictions) * 100
print(f'\n\nFinal Mean Absolute Percentage Error: {mape_selected:.2f}%')


## XGBoost

In [None]:
# Per-category XGBoost price models, evaluated with 5-fold cross-validation.
# For every value of the 'دسته بندی' column a separate regressor is trained on each
# fold; the fold with the lowest MAPE supplies the stored model, predictions and actuals.

# A single encoder instance is re-fitted for every column, so fitted mappings are not retained
label_encoder = LabelEncoder()

# Per-category results, keyed by category value
models = {}                  # fitted regressor from the best fold
predictions = {}             # best-fold test predictions
best_folds = {}              # index of the best fold
y_test_categories_list = {}  # best-fold test actuals
mape_per_category = {}       # mean MAPE over all folds, in percent

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over unique categories
for category in df['دسته بندی'].unique():

    # Work on a copy so the encoding below never touches the shared frame
    df_category = df[df['دسته بندی'] == category].copy()

    # KFold raises when there are fewer rows than splits; this also covers a NaN
    # category, for which the equality filter above selects no rows at all.
    if len(df_category) < kf.get_n_splits():
        print(f"Skipping category {category}: {len(df_category)} rows is too few for {kf.get_n_splits()}-fold cross-validation")
        continue

    # Drop columns that are completely null for this category
    df_category = df_category.dropna(axis=1, how='all')

    # NOTE(review): every non-price column, numeric ones included, is cast to str and
    # label-encoded, so numeric order is replaced by the order of the string labels
    # (and NaN becomes its own 'nan' label) — confirm this is intended.
    non_price_columns = df_category.columns[df_category.columns != 'price']
    df_category[non_price_columns] = df_category[non_price_columns].astype(str)
    for column in non_price_columns:
        df_category[column] = label_encoder.fit_transform(df_category[column])

    # Select features and target variable
    X_category = df_category.drop(['price', 'دسته بندی'], axis=1)
    y_category = df_category['price']

    # Per-fold bookkeeping
    mape_per_fold = []
    model_per_fold = []
    prediction_per_fold = []
    y_test_per_fold = []

    # Perform k-fold cross-validation
    for train_index, test_index in kf.split(X_category):
        X_train_category, X_test_category = X_category.iloc[train_index], X_category.iloc[test_index]
        y_train_category, y_test_category = y_category.iloc[train_index], y_category.iloc[test_index]

        # Train the XGBoost regressor for this fold
        model_category = XGBRegressor(n_estimators=100, random_state=42)
        model_category.fit(X_train_category, y_train_category)

        # Predict on the held-out part of the fold
        predictions_category_fold = model_category.predict(X_test_category)

        # sklearn returns MAPE as a fraction; scale to percent so it matches the '%' in the report
        mape_fold = mean_absolute_percentage_error(y_test_category, predictions_category_fold) * 100

        mape_per_fold.append(mape_fold)
        model_per_fold.append(model_category)
        prediction_per_fold.append(predictions_category_fold)
        y_test_per_fold.append(y_test_category)

    # Find the fold with the lowest MAPE for the category
    best_fold_idx = int(np.argmin(mape_per_fold))

    # Store the best fold's model, predictions and actuals for the category
    models[category] = model_per_fold[best_fold_idx]
    predictions[category] = prediction_per_fold[best_fold_idx]
    y_test_categories_list[category] = y_test_per_fold[best_fold_idx]
    best_folds[category] = best_fold_idx
    mape_per_category[category] = np.mean(mape_per_fold)
    # Variance is taken over the percent-valued fold MAPEs
    print(f"MAPE Category {category}: Mean= {np.mean(mape_per_fold):.2f}%, Variance={np.var(mape_per_fold):.2f}%")

# Combine predictions from the best-performing folds of the selected categories into a single array
selected_categories = list(models.keys())
if not selected_categories:
    raise ValueError("No category had enough rows for cross-validation")
selected_predictions = np.concatenate([predictions[category] for category in selected_categories])
selected_actuals = np.concatenate([y_test_categories_list[category] for category in selected_categories])

# Overall MAPE (percent) with the same metric as the per-fold scores, which also guards
# against division by a zero price.
# NOTE(review): only each category's best fold is pooled here, so this figure is
# optimistic; the per-category means in mape_per_category average over all folds.
mape_selected = mean_absolute_percentage_error(selected_actuals, selected_predictions) * 100
print(f'\n\nFinal Mean Absolute Percentage Error: {mape_selected:.2f}%')
