## The Dataset: Property prices in Tunisia
##Task: Price Prediction

In [None]:
import os
import urllib.request

DATA_PATH = "dataset"
DOWNLOAD_URL = "https://raw.githubusercontent.com/AhmedCheikhRouhou/Datasets/main/proptun.csv"

def fetch_data(url=DOWNLOAD_URL, path=DATA_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    urllib.request.urlretrieve(url , os.path.join(path, "data.csv"))

fetch_data()

**You can find your data in** `dataset/data.csv`

## Part 0: Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np



## Part 1 (1 Points): Load and split the data with pandas

In [None]:
# Load data
def load_dataset_data(dataset=DATA_PATH):
   db=os.path.join(dataset,"data.csv")
   return pd.read_csv(db)

dataset = load_dataset_data()


In [None]:
# Split Data
train_set, test_set =train_test_split(dataset, test_size=0.2, random_state=42)


# Part 2 (4 Points): Data Analysis

Use the train set to perform the data analysis (you can add any other analysis) 

1. Visualize the 5 first rows of the data

In [None]:
dataset.head()


Unnamed: 0,category,room_count,bathroom_count,size,type,price,city,region
0,Terrains et Fermes,,,,À Vendre,100000.0,Ariana,Raoued
1,Terrains et Fermes,,,,À Vendre,316000.0,Ariana,Autres villes
2,Appartements,2.0,1.0,80.0,À Louer,380.0,Ariana,Autres villes
3,Locations de vacances,1.0,1.0,90.0,À Louer,70.0,Ariana,Autres villes
4,Appartements,2.0,2.0,113.0,À Vendre,170000.0,Ariana,Ariana Ville


2. Show dataset information to see if there is any null values

In [None]:
dataset['category'].value_counts(ascending=True)

Colocations                                    65
Locations de vacances                         298
Bureaux et Plateaux                           461
Magasins, Commerces et Locaux industriels     651
Maisons et Villas                            3158
Terrains et Fermes                           3415
Appartements                                 4700
Name: category, dtype: int64

In [None]:
# Count the values in category column
dataset['city'].value_counts(ascending=True)

Kébili           11
Tataouine        17
Siliana          31
Tozeur           46
Sidi bouzid      53
Kasserine        59
Béja             63
Le kef           63
Gafsa            88
Jendouba         92
Kairouan        116
Zaghouan        181
Gabès           191
Médenine        230
Mahdia          268
Bizerte         455
Monastir        479
Sfax            646
La manouba      663
Nabeul          821
Ben arous      1123
Sousse         2270
Ariana         2374
Tunis          2408
Name: city, dtype: int64

In [None]:
# Count the values in city column
dataset['city'].value_counts(ascending=True)

Kébili           11
Tataouine        17
Siliana          31
Tozeur           46
Sidi bouzid      53
Kasserine        59
Béja             63
Le kef           63
Gafsa            88
Jendouba         92
Kairouan        116
Zaghouan        181
Gabès           191
Médenine        230
Mahdia          268
Bizerte         455
Monastir        479
Sfax            646
La manouba      663
Nabeul          821
Ben arous      1123
Sousse         2270
Ariana         2374
Tunis          2408
Name: city, dtype: int64

3. Show correlation between columns

In [None]:
print(dataset.corr())

                room_count  bathroom_count      size     price
room_count        1.000000        0.595932  0.451061 -0.000842
bathroom_count    0.595932        1.000000  0.474159 -0.004061
size              0.451061        0.474159  1.000000  0.021117
price            -0.000842       -0.004061  0.021117  1.000000


4. Add a new column named `log_price` where you calculate the log10 (using numpy) of the price, then show correlations



In [None]:
import numpy as np
dataset['log_price'] = np.log10(dataset['price'])

In [None]:
print(dataset.corr())


## Part 3: Data Cleaning (4 points)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
data_set_labels = dataset['log_price']
data_set_num= dataset.drop(["category","type","city","region","log_price"], axis=1)
# NumPipeline
num_pipeline = Pipeline([ 
 ('imputer', SimpleImputer(strategy="median")),
 ('std_scaler', StandardScaler() )])
num_attribs = list(data_set_num)
cat_attribs = ["category","type","city","region","log_price"]
ordinal=OneHotEncoder(sparse=False)
full_pipeline = ColumnTransformer([("num",num_pipeline,num_attribs),("cat",OneHotEncoder(),cat_attribs)])
dataset_prepared = full_pipeline.fit_transform(dataset)
print(dataset_prepared)

  (0, 0)	0.12123929423999193
  (0, 1)	-0.4188192610361682
  (0, 2)	-0.2579010774500918
  (0, 3)	-0.015655800681757467
  (0, 10)	1.0
  (0, 12)	1.0
  (0, 13)	1.0
  (0, 224)	1.0
  (0, 669)	1.0
  (1, 0)	0.12123929423999193
  (1, 1)	-0.4188192610361682
  (1, 2)	-0.2579010774500918
  (1, 3)	-0.01544332858859899
  (1, 10)	1.0
  (1, 12)	1.0
  (1, 13)	1.0
  (1, 44)	1.0
  (1, 916)	1.0
  (2, 0)	-0.5969570454307412
  (2, 1)	-0.4188192610361682
  (2, 2)	-0.49609008964553986
  (2, 3)	-0.015753793598055836
  (2, 4)	1.0
  (2, 11)	1.0
  (2, 13)	1.0
  :	:
  (12745, 2)	3.5531231176770777
  (12745, 3)	-0.013836016550539017
  (12745, 9)	1.0
  (12745, 12)	1.0
  (12745, 35)	1.0
  (12745, 173)	1.0
  (12745, 1194)	1.0
  (12746, 0)	0.12123929423999193
  (12746, 1)	-0.4188192610361682
  (12746, 2)	0.15892969389194245
  (12746, 3)	-0.01551808728804364
  (12746, 9)	1.0
  (12746, 12)	1.0
  (12746, 35)	1.0
  (12746, 173)	1.0
  (12746, 847)	1.0
  (12747, 0)	-0.5969570454307412
  (12747, 1)	-0.4188192610361682
  (1274

## Part 4: Model Training (4 points)

Use at least 2 models for regression and predict the log_price instead of the price:
- use cross validation for one of the models (1 pt)
- perform hyperparameter tuning in one of the models (1 pt)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
reg=DecisionTreeRegressor().fit(dataset_prepared, data_set_labels)
#cross validation 
from sklearn.model_selection import cross_val_score
scores=cross_val_score(reg,dataset_prepared,data_set_labels,scoring="neg_root_mean_squared_error",cv=10)
print(scores)

[-0.05025779 -0.04587816 -0.03765851 -0.08447515 -0.04840673 -0.04263805
 -0.07809731 -0.08929981 -0.06607886 -0.05695947]


## Part 5: Ensembling of the models (2 points)

Apply at least two model ensembling methods

In [None]:
#linear regression
display_scores(scores)
lin = LinearRegression().fit(dataset_prepared, data_set_labels)
scores_labels=cross_val_score(lin,dataset_prepared,data_set_labels,scoring="neg_root_mean_squared_error",cv=10)
display_scores(scores_labels)


# hyperparameter tuning
from sklearn.metrics import mean_squared_error
lin_reg = LinearRegression()
lin_reg.fit(dataset_prepared, data_set_labels)
dataset_predictions = lin_reg.predict(dataset_prepared)
lin_mse = mean_squared_error(data_set_labels, dataset_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


NameError: ignored

## Part 6: XGBoost (2 Points)

Use XGBoost with Cross Validation