In [350]:
#רוני חכם, ת.ז- 318402054
#קישור לגיט- https://github.com/RonyHacham/ML_Regression 

In [351]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from datetime import datetime, timedelta

In [352]:
# קריאת הנתונים
data = pd.read_csv('C:/Users/ronyh/Downloads/dataset.csv')

In [370]:
# פונקציה להכנת הנתונים
def prepare_data(data):
    
    #טיפול בערכים חריגים 
    data= data[data['Price'] > 0]
    
    # החלפת ערך 'None' לערך None בכל העמודות
    data = data.applymap(lambda x: None if x == 'None' else x)
    
    # בדיקת סוג הנתונים והמרה רק אם הם מסוג string
    data['capacity_Engine'] = data['capacity_Engine'].apply(lambda x: x.replace(',', '') if isinstance(x, str) else x).astype(float)
    
    data = data[data['Km'] != 'None']
    data['Km'] = data['Km'].str.replace(',', '').astype(float)
    
    # בדיקת מספר הערכים הריקים והמלאים בעמודה 'Test'
    num_missing = data['Test'].isnull().sum()
    num_non_missing = data['Test'].notnull().sum()
    print(f"Number of missing values in 'Test': {num_missing}")
    print(f"Number of non-missing values in 'Test': {num_non_missing}")
    if (0.5*num_missing>num_non_missing):
        print("Due to a lack of data, we will delete the column")
    #Due to a lack of data, we will delete the column
    data.drop('Test', axis=1, inplace=True)
    
    # בדיקת מספר הערכים הריקים והמלאים בעמודה 'Supply_score'
    num_missing = data['Supply_score'].isnull().sum()
    num_non_missing = data['Supply_score'].notnull().sum()
    print(f"Number of missing values in 'Supply_score': {num_missing}")
    print(f"Number of non-missing values in 'Supply_score': {num_non_missing}")
    if (0.5*num_missing>num_non_missing):
        print("Due to a lack of data, we will delete the column")
    #Due to a lack of data, we will delete the column
    data.drop('Supply_score', axis=1, inplace=True)
    
    #לא צריך 2 עמודות של תאריכים, נמחק את תאריך ההקפצה
    data.drop('Repub_date', axis=1, inplace=True)
    
    # המרת תאריך לתאריך-זמן עם טיפול בתאריכים לא תקינים
    def safe_parse(date_str):
        try:
            return pd.to_datetime(date_str)
        except ValueError:
            return None
    
    # החלת הפונקציה על עמודות התאריך
    data['Cre_date'] = data['Cre_date'].apply(safe_parse)

    # פונקציה לבדיקת השנים מאז תאריך מסוים
    def calculate_years_since(date_str):
        try:
            cre_date = pd.to_datetime(date_str)
            today = datetime.today()
            delta = today - cre_date
            years_since = delta.days / 365.25  # חלוקה ל-365.25 לצורך שנה לקפיצה
            return round(years_since)
        except ValueError:
            return None

    # שימוש בפונקציה ליצירת עמודה חדשה עבור Cre_date
    data['Cre_date-years'] = data['Cre_date'].apply(calculate_years_since)

    
    # פונקציה לבדיקת השנים מאז תאריך מסוים
    def calculate_years_since(date_str):
        try:
            cre_date = pd.to_datetime(date_str)
            today = datetime.today()
            delta = today - cre_date
            years_since = delta.days / 365.25  # חלוקה ל365.25 לצורך שנה לקפיצה
            return round(years_since)
        except ValueError:
            return None

    # שימוש בפונקציה ליצירת עמודה חדשה עבור Cre_date
    data['Cre_date-years'] = data['Cre_date'].apply(calculate_years_since)

    # הדפסת הנתונים לבדיקה
    print(data[['Cre_date', 'Cre_date-years']].head())

    # הפרדת משתנה המטרה (מחיר) מהתכונות
    X = data.drop('Price', axis=1)  
    y = data['Price']
    # זיהוי תכונות מספריות וקטגוריות
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns
    
    # בניית Pipeline עבור תכונות מספריות
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # בניית Pipeline עבור תכונות קטגוריות
    categorical_transformers = []
    for cat_feature in categorical_features:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='error', sparse=False, drop='first'))  # שימוש ב-drop='first' ו-handle_unknown='error'
        ])
        categorical_transformers.append((cat_feature, categorical_transformer, [cat_feature]))
    
    # שילוב הטרנספורמרים השונים באמצעות ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            *categorical_transformers
        ],
        remainder='drop'
    )
    
    # החלת הטרנספורמציות על הנתונים
    X_transformed = preprocessor.fit_transform(X)
    
    # יצירת DataFrame חדש עם הנתונים המעובדים
    numeric_attribs = list(numeric_features)
    cat_one_hot_attribs = []
    
    # הוספת שמות העמודות של OneHotEncoder
    for name, encoder in [(name, preprocessor.named_transformers_[name][1]) for name, _, _ in categorical_transformers]:
        cat_one_hot_attribs.extend([f"{name}_{category}" for category in encoder.categories_[0][1:]])
    
    # שילוב שמות העמודות המספריות והקטגוריאליות
    attributes = numeric_attribs + cat_one_hot_attribs
    
    # בניית DataFrame עם מספר העמודות הנכון
    X_processed = pd.DataFrame(data=X_transformed, columns=attributes)
    
    return X_processed,y  
# הכנת הנתונים
X_prepared,y = prepare_data(data)

Number of missing values in 'Test': 1368
Number of non-missing values in 'Test': 132
Due to a lack of data, we will delete the column
Number of missing values in 'Supply_score': 1061
Number of non-missing values in 'Supply_score': 439
Due to a lack of data, we will delete the column
    Cre_date  Cre_date-years
0 2023-11-07             1.0
1 2022-06-04             2.0
2 2022-10-29             2.0
3 2024-05-16             0.0
4 2022-06-13             2.0


In [356]:
X_prepared

Unnamed: 0,Year,Hand,capacity_Engine,Pic_num,Km,Cre_date-years,manufactor_אאודי,manufactor_אופל,manufactor_אלפא רומיאו,manufactor_ב.מ.וו,...,Color_לבן מטאלי,Color_לבן פנינה,Color_לבן שנהב,Color_סגול,Color_סגול חציל,Color_שחור,Color_שמפניה,Color_תכלת,Color_תכלת מטאלי,Color_None
0,0.171467,-0.284287,-0.076612,-0.232148,0.260218,-0.871844,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.958016,-1.098084,-0.567486,-0.807245,-0.643816,0.543487,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.139446,-1.098084,-0.260689,-0.519696,0.272271,0.543487,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.433650,-1.098084,-0.076612,0.630498,-1.146459,-2.287175,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.615081,-1.098084,-0.508581,-0.519696,-0.631763,0.543487,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0.433650,-0.284287,-0.322049,-0.519696,-1.473358,0.543487,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1496,-0.352898,1.343309,-0.322049,-0.807245,0.453078,0.543487,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1497,-0.090715,-1.098084,-0.322049,-0.519696,0.007088,0.543487,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1498,-0.877263,0.529511,-0.076612,-0.807245,-0.053181,-0.871844,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [357]:
y

0       51000.0
1       49000.0
2       22500.0
3       63000.0
4       37000.0
         ...   
1495    60000.0
1496    45000.0
1497    30000.0
1498    28000.0
1499    41500.0
Name: Price, Length: 1500, dtype: float64

In [359]:
#נבדוק האם הוואן הוט עבר טוב או שיש עמודה מיותרת
# מספר משתנים שונים בעמודה 'manufactor' לפני OneHot Encoding
unique_manufactors = data['manufactor'].nunique()
print(f"Number of unique manufactors before OneHot Encoding: {unique_manufactors}")

# מספר עמודות של 'manufactor' אחרי OneHot Encoding
manufactor_columns = [col for col in X_prepared.columns if col.startswith('manufactor_')]
print(f"Number of columns for manufactor after OneHot Encoding: {len(manufactor_columns)}")

Number of unique manufactors before OneHot Encoding: 27
Number of columns for manufactor after OneHot Encoding: 26


In [360]:
#נבדוק האם הוואן הוט עבר טוב או שיש עמודה מיותרת
# מספר משתנים שונים בעמודה 'model' לפני OneHot Encoding
unique_manufactors = data['model'].nunique()
print(f"Number of unique manufactors before OneHot Encoding: {unique_manufactors}")

# מספר עמודות של 'manufactor' אחרי OneHot Encoding
manufactor_columns = [col for col in X_prepared.columns if col.startswith('model')]
print(f"Number of columns for manufactor after OneHot Encoding: {len(manufactor_columns)}")

Number of unique manufactors before OneHot Encoding: 289
Number of columns for manufactor after OneHot Encoding: 288


In [361]:
#אם עבד כראוי עבור 2 עמודות קטגוריאליות(כלומר החזיר מספר עמודות שהוא מספר הקטגוריות פחות 1) נניח שעבד טוב עבור כלל העמודות הקטגורליות במעבר לוואן הוט

In [362]:
# חלוקת הנתונים לנתוני אימון ובדיקה
X_train, X_test, y_train, y_test = train_test_split(X_prepared, y, test_size=0.2, random_state=42)

In [363]:
# בניית מודל Elastic Net
#alpha =0.1,l1=0.5 because it has the smallest RMSE
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)  

In [364]:
# קרוס ולידציה עם 10 חלוקות
scores = cross_val_score(elastic_net, X_train, y_train, cv=10, scoring='neg_mean_squared_error')

# חישוב השגיאה הריבועית הממוצעת על כל קרוס-ולידציה
rmse_cv = np.sqrt(-scores.mean())

print("Cross-validated Root Mean Squared Error:", rmse_cv)

Cross-validated Root Mean Squared Error: 16425.19714908536


In [365]:
# אימון המודל על כל נתוני האימון
elastic_net.fit(X_train, y_train)

# חיזוי על נתוני הבדיקה
y_pred = elastic_net.predict(X_test)

# חישוב השגיאה הריבועית הממוצעת על נתוני הבדיקה
mse_test = mean_squared_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)

print("Test Root Mean Squared Error:", rmse_test)

Test Root Mean Squared Error: 13967.196635211965


In [366]:
# איתור חמשת המאפיינים עם ההשפעה הגדולה ביותר על החיזוי
coef = elastic_net.coef_
feature_importance = pd.Series(coef, index=X_train.columns).sort_values(key=abs, ascending=False)
top_5_features = feature_importance.head(5)
print("Top 5 features with the greatest impact on prediction:")
print(top_5_features)

# ציון האם ההשפעה חיובית או שלילית
for feature, importance in top_5_features.items():
    impact = "Positive" if importance > 0 else "Negative"
    print(f"{feature}: {impact}")


Top 5 features with the greatest impact on prediction:
Year                 10627.283646
manufactor_מרצדס      5654.719389
manufactor_ב.מ.וו     4917.236754
manufactor_רנו       -4716.444451
manufactor_אופל      -4599.812079
dtype: float64
Year: Positive
manufactor_מרצדס: Positive
manufactor_ב.מ.וו: Positive
manufactor_רנו: Negative
manufactor_אופל: Negative


In [367]:
# חישוב מדד R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)

# חישוב הקורלציה הפירסונית
correlation = np.corrcoef(y_test, y_pred)[0, 1]
print("Pearson Correlation:", correlation)

# חישוב Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# חישוב השגיאה היחסית הממוצעת
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print("Mean Absolute Percentage Error:", mape)

R-squared: 0.6007274859082532
Pearson Correlation: 0.8146593021495095
Mean Absolute Error: 11407.276990637454
Mean Absolute Percentage Error: 28.17862172179733
