In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import pickle
import json
matplotlib.rcParams["figure.figsize"] = (20, 10)

In [2]:
# Charger les données et les traiter (votre code de traitement des données)
file_path = "ira.csv"
data = pd.read_csv(file_path)

In [3]:
data2 = data.drop(['Title', 'Latitude', 'Longitude', 'Listing Views', 'Area', 'Features', 'Date of Construction'], axis=1)

In [4]:
data3 = data2[(data2["BER Rating"] != "SI_666") & (data2["Price"] != "Price on Appplication")]
data3.loc[:, "Price"] = data3["Price"].astype(str).apply(lambda x: x.replace("AMV:", "") if "AMV:" in x else x)
data3.loc[:, "Price"] = data3["Price"].astype(str).apply(lambda x: x.replace("From ", "") if "From " in x else x)
data3.loc[:, "Price"] = data3["Price"].astype(str).apply(lambda x: x.replace("Price on Application", "") if "Price on Application" in x else x)
data3 = data3[data3["Price"] != "AMV: Price on Application"]
data3["Price"] = data3["Price"].astype(str).apply(lambda x: ''.join([char for char in x if char.isdigit()]) if isinstance(x, str) else x)
data3["Price"] = pd.to_numeric(data3["Price"], errors='coerce')
data3["Number of Bedrooms"] = pd.to_numeric(data3["Number of Bedrooms"], errors='coerce')
data3 = data3.dropna()
data3 = data3[data3["BER Rating"] != "BER_PENDING"]

In [5]:
data4 = pd.get_dummies(data3, columns=["Property Type"])

def remove_rows(data4):
    data4 = data4[~((data4["Number of Bedrooms"] == 4) & (data4["Number of Bathrooms"] < 2))]
    return data4


In [6]:
data5 = remove_rows(data4)
def remove_rows_bath(data5):
    data5 = data5[data5["Number of Bathrooms"] != 0]
    return data5

In [7]:
data6 = remove_rows_bath(data5)
data6 = pd.get_dummies(data6, columns=["County"])

In [8]:
def filter_5(data6, floor_area=130):
    filtered_data = data6[~((data6["Floor Area (m2)"] < floor_area) & (data6["Number of Bedrooms"] == 5))]
    return filtered_data

data7 = filter_5(data6, 130)
def filter_4(data7, floor_area=100):
    filtered_data2 = data7[~((data7["Floor Area (m2)"] < floor_area) & (data7["Number of Bedrooms"] == 4))]
    return filtered_data2

In [9]:
data8 = filter_4(data7, 100)
data9 = pd.get_dummies(data8, columns=["BER Rating"])

In [10]:
new_columns = []
columns = data9.columns.tolist()
for col in columns:
    if col.startswith('BER Rating_'):
        new_columns.append(col.replace('BER Rating_', ''))
    elif col.startswith('County_'):
        new_columns.append(col.replace('County_', ''))
    elif col.startswith('Property Type_'):
        new_columns.append(col.replace('Property Type_', ''))
    else:
        new_columns.append(col)

data9.columns = new_columns

In [11]:
X = data9.drop(['Price'], axis=1)
y = data9.Price

In [12]:
# Convertir les noms de colonnes en minuscules
X.columns = [col.lower() for col in X.columns]
print(X.columns) # Vérifier les noms de colonnes

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

Index(['number of bedrooms', 'number of bathrooms', 'floor area (m2)',
       'apartment', 'bungalow', 'detached', 'duplex', 'end of terrace',
       'house', 'semi-d', 'studio', 'terrace', 'townhouse', 'carlow', 'cavan',
       'clare', 'cork', 'donegal', 'dublin', 'fermanagh', 'galway', 'kerry',
       'kildare', 'kilkenny', 'laois', 'leitrim', 'limerick', 'longford',
       'louth', 'mayo', 'meath', 'monaghan', 'offaly', 'roscommon', 'sligo',
       'tipperary', 'waterford', 'westmeath', 'wexford', 'wicklow', 'a1', 'a2',
       'a3', 'b1', 'b2', 'b3', 'c1', 'c2', 'c3', 'd1', 'd2', 'e1', 'e2', 'f',
       'fg', 'g'],
      dtype='object')


In [13]:
gb_model = GradientBoostingRegressor(random_state=10)
gb_model.fit(X_train, y_train)


In [18]:
gb_model.score(X_train, y_train)

0.7545124259797379

In [14]:
def pridict_price(location, floor_area, number_of_bathrooms, number_of_bedrooms, property_type, ber_rating):
    loc_index = np.where(X.columns == location)[0][0] if location in X.columns else -1
    input_data = pd.DataFrame(np.zeros((1, len(X.columns))), columns=X.columns)
    input_data['number of bedrooms'] = number_of_bedrooms
    input_data['number of bathrooms'] = number_of_bathrooms
    input_data['floor area (m2)'] = floor_area
    if loc_index >= 0:
        input_data.iloc[0, loc_index] = 1
    property_type_col = f'{property_type.lower()}'
    if property_type_col in X.columns:
        input_data[property_type_col] = 1
    ber_rating_col = f'{ber_rating.lower()}'
    if ber_rating_col in X.columns:
        input_data[ber_rating_col] = 1
    return gb_model.predict(input_data)[0]

In [16]:
prediction = pridict_price('dublin', 150, 2, 3, 'apartment', 'b2')
print(f"Prédiction du prix : {prediction}")

Prédiction du prix : 735516.5078481463


In [19]:
with open("irish_model2.pickle", "wb") as f:
    pickle.dump(gb_model, f)

columns = {
    'data_columns': [col for col in X.columns]
}

In [20]:
with open("irish_model2.json", "w") as f:
    f.write(json.dumps(columns))