In [16]:
import numpy as np
import re
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RepeatedKFold
import seaborn as sns
import matplotlib.pyplot as plt
import joblib


In [17]:
# !pip install nbimporter
import nbimporter
from madlan_data_prep import prepare_data

In [19]:
import madlan_data_prep

# Location of the raw training CSV handed to the data-preparation module.
# NOTE(review): absolute, machine-specific path -- consider a path relative to the notebook.
dataset_path = 'C:\\Users\\rotes\\Desktop\\output_all_students_Train_v10.csv'

# Keep the prepared DataFrame. Previously the return value was only displayed and
# `dataset` stayed bound to the path string, so every later cell that uses
# `dataset.columns`, `dataset.corr()` or `dataset.iloc` failed on a fresh kernel.
dataset = madlan_data_prep.prepare_data(dataset_path)
dataset

  },
  },


Unnamed: 0,City,type,city_area,room_number,Area,hasElevator,hasParking,hasBars,entrance_date,hasStorage,condition,hasAirCondition,hasBalcony,hasMamad,handicapFriendly,furniture,floor,total_floor,publishedDays,price
0,פתח תקווה,דירה,אם המושבות החדשה פתח תקווה,5.5,137,1,0,1,flexible,0,שמור,1,0,1,1,לא צויין,11,19,0,3600000
1,פתח תקווה,דירה,נווה גן פתח תקווה,3.0,84,1,0,1,flexible,1,שמור,1,0,1,0,לא צויין,6,9,0,2550000
2,פתח תקווה,דירה,קרית הרב סלומון פתח תקווה,4.0,120,1,1,1,flexible,1,חדש,1,1,1,0,חלקי,2,7,0,2650000
3,פתח תקווה,דירה,המרכז השקט פתח תקווה,3.5,110,1,0,1,flexible,0,משופץ,1,0,1,1,חלקי,2,6,0,2450000
4,פתח תקווה,דירה,כפר גנים ב פתח תקווה,4.5,120,1,1,0,flexible,0,משופץ,1,1,1,1,לא צויין,3,6,0,2720000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,רעננה,בית פרטי,2005 רעננה,9.5,350,0,1,0,flexible,1,שמור,1,1,1,0,אין,4,0,0,8200000
672,רעננה,דירה,מרכז דרום רעננה,4.0,110,1,1,0,flexible,1,חדש,1,1,1,1,לא צויין,4,6,0,3350000
673,רעננה,קוטג',קרית גנים רעננה,7.0,376,0,1,0,less_than_6 months,0,חדש,0,0,0,0,לא צויין,0,0,0,8500000
674,רעננה,דירה,לסטר רעננה,5.0,126,1,1,0,flexible,0,חדש,1,1,1,1,לא צויין,5,7,0,3850000


In [20]:
#pip install nbimporter

In [None]:

# Collect the distinct values of every column and print them one column at a time,
# to spot inconsistent or unexpected entries.
unique_values = []
for column in dataset.columns:
    values = dataset[column].unique()
    unique_values.append(values)
    print(f"{column}: {values}")


In [None]:

# Correlation analysis on the numeric columns only.
# The prepared frame also holds text columns (City, type, condition, ...); Pearson
# correlation is undefined for them and recent pandas raises instead of silently
# dropping them, so the numeric subset is selected explicitly. The matrix is
# computed once and reused for both plots.
corr_matrix = dataset.select_dtypes(include='number').corr()

# Correlation of every numeric feature with the target.
correlation_matrix = corr_matrix['price'].to_frame()

# Heatmap of feature-vs-price correlations.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

print(correlation_matrix)

# Heatmap of the full feature-vs-feature correlation matrix.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# No pair of features is highly correlated, so no features are dropped.

In [None]:

def score_model(y_test, y_pred, model_name, verbose=True):
    """Compute RMSE and R-squared of predictions against the true targets.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values.
    model_name : str
        Label used in the printed report.
    verbose : bool, default True
        When True, print the metrics rounded to two decimals.

    Returns
    -------
    tuple of (float, float)
        ``(RMSE, R_squared)``, unrounded.
    """
    MSE = mse(y_test, y_pred)
    RMSE = np.sqrt(MSE)
    R_squared = r2_score(y_test, y_pred)
    if verbose:
        print(f"Model: {model_name}, RMSE: {np.round(RMSE, 2)}, R-Squared: {np.round(R_squared, 2)}")
    return RMSE, R_squared


def _make_one_hot_encoder():
    """Return a dense-output one-hot encoder that ignores unseen categories.

    scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and later removed the
    old name, so the new keyword is tried first and the old one is the fallback.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def build_model_pipeline(num_cols, cat_cols, alpha, l1_ratio):
    """Build a fresh, unfitted preprocessing + ElasticNet pipeline.

    Numeric columns are standardised; categorical columns get missing values
    replaced by the constant ``'missing'`` and are then one-hot encoded.
    Columns in neither list are dropped.
    """
    numerical_pipeline = Pipeline([('scaling', StandardScaler())])
    categorical_pipeline = Pipeline([
        ('categorical_imputation', SimpleImputer(strategy='constant', add_indicator=False, fill_value='missing')),
        ('one_hot_encoding', _make_one_hot_encoder())])

    column_transformer = ColumnTransformer([
        ('numerical_preprocessing', numerical_pipeline, num_cols),
        ('categorical_preprocessing', categorical_pipeline, cat_cols)], remainder='drop')

    return Pipeline([
        ('preprocessing_step', column_transformer),
        ('model', ElasticNet(alpha=alpha, l1_ratio=l1_ratio))])


# Select the target by name on both sides so the split does not depend on
# 'price' happening to be the last column.
X = dataset.drop(columns='price')
y = dataset['price'].values

alpha = 0.0001
l1_ratio = 0.9

# Column roles come from the full feature frame, not from a leaked loop variable.
num_cols = X.select_dtypes(exclude='object').columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()

# Repeated 10-fold cross-validation: fit and score a fresh pipeline on EVERY split.
# (Previously the loop only overwrote the split variables, so the model was
# trained and evaluated on the last split alone.)
rkf = RepeatedKFold(n_splits=10, random_state=42)
fold_scores = []
for train_index, test_index in rkf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    fold_model = build_model_pipeline(num_cols, cat_cols, alpha, l1_ratio)
    fold_model.fit(X_train, y_train)
    y_pred = fold_model.predict(X_test)
    fold_scores.append(score_model(y_test, y_pred, "ElasticNet", verbose=False))

# Summarise the out-of-fold performance across all splits.
cv_rmse, cv_r2 = np.array(fold_scores).T
print(f"Model: ElasticNet, CV splits: {len(fold_scores)}, "
      f"RMSE: {np.round(cv_rmse.mean(), 2)} (+/- {np.round(cv_rmse.std(), 2)}), "
      f"R-Squared: {np.round(cv_r2.mean(), 2)} (+/- {np.round(cv_r2.std(), 2)})")

# The saved model is refitted on all available data; the cross-validation
# summary above is its performance estimate.
pipe_preprocessing_model = build_model_pipeline(num_cols, cat_cols, alpha, l1_ratio)
pipe_preprocessing_model.fit(X, y)

joblib.dump(pipe_preprocessing_model, 'trained_model.pkl')