Verilerin Lineer Regresyon ile Tahmini

In [62]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score


In [119]:
# Load the dataset from CSV. The path is relative, so it is resolved against
# the working directory the kernel was started in.
DATA_PATH = "hepsiemlak/data.csv"
df = pd.read_csv(DATA_PATH)

In [120]:
# Normalise column dtypes for modelling.
#
# `city` is filled from the raw `location` column, which is then removed.
# The guard keeps this cell re-runnable: once `location` has been dropped, a
# second execution skips the copy instead of raising KeyError. `assign` behaves
# like `df["city"] = ...` (overwrites `city` in place if it already exists,
# otherwise appends it), and no DataFrame is mutated in place.
if "location" in df.columns:
    df = df.assign(city=df["location"]).drop(columns=["location"])

# Location-like columns become categoricals; the numeric columns used by the
# model (and the target) are cast to integers.
df = df.astype({
    "city": "category",
    "district": "category",
    "neighborhood": "category",
    "rooms": int,
    "area": int,
    "age": int,
    "floor": int,
    "price": int,
})

In [121]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          432 non-null    category
 1   district      432 non-null    category
 2   neighborhood  432 non-null    category
 3   rooms         432 non-null    int64   
 4   living_room   432 non-null    int64   
 5   area          432 non-null    int64   
 6   age           432 non-null    int64   
 7   floor         432 non-null    int64   
 8   price         432 non-null    int64   
dtypes: category(3), int64(6)
memory usage: 48.5 KB
None


In [157]:
# Columns routed to the one-hot encoder.
categorical_features = [
    "city",
    "district",
    "neighborhood",
]

# Columns routed to the scaler.
# NOTE(review): `living_room` appears in the frame but is not listed here, so
# it presumably never reaches the model — confirm this omission is intended.
numerical_features = [
    "rooms",
    "area",
    "age",
    "floor",
]

In [158]:
# Column-wise preprocessing: scale the numeric columns and one-hot encode the
# categorical ones. handle_unknown="ignore" lets the encoder accept category
# values at transform time that were not present during fit.
numeric_step = ("num", StandardScaler(), numerical_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)
full_pipeline = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [159]:
# Separate the regression target (price) from the predictor columns.
y = df["price"]
X = df.drop(columns=["price"])


In [160]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [161]:
# End-to-end estimator: the column preprocessing followed by a Ridge regressor.
ridge_regressor = Ridge(alpha=1.0, random_state=42)
model = Pipeline(
    steps=[
        ("preparation", full_pipeline),
        ("model", ridge_regressor),
    ]
)

In [162]:
# Fit preprocessing and regressor on the training split only. Kept as the last
# expression so the notebook still displays the fitted pipeline.
model.fit(X=X_train, y=y_train)

In [163]:
# Score the model on the held-out split.
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike the squared error

In [164]:
# Report each metric on its own line as "<label>: <value>".
for metric_label, metric_value in (("MSE", mse), ("RMSE", rmse), ("R^2", r2)):
    print(f"{metric_label}: {metric_value}")

MSE: 2518039606.284644
RMSE: 50180.071804299405
R^2: 0.2913272752166651


In [165]:
# Pull the fitted coefficient vector out of the pipeline's final step and show
# its length followed by its values. These are linear-model coefficients on the
# transformed features, kept under the name `feature_importances`.
ridge_step = model.named_steps['model']
feature_importances = ridge_step.coef_
print(len(feature_importances), feature_importances, sep="\n")

590
[-9.17942021e+03  1.74167751e+04 -1.13973089e+04  2.33307435e+04
 -3.99234936e+04  1.01288537e+05 -1.40851842e+04 -3.38294369e+03
 -4.15868633e+04 -2.60226746e+03 -1.39248432e+04 -1.40959482e+04
 -5.31663078e+03  3.19829692e+03 -1.07675439e+04  1.41351156e+03
  1.48018503e+03  1.34172671e+04  5.52838022e+03 -2.70530666e+04
 -1.56156108e+04 -6.65487864e+03 -1.88304239e+04  9.20978663e+03
  1.08253234e+04  6.94721166e+03  5.22658733e+02 -1.28618877e+03
  5.46547479e+02  1.09097646e+04  1.28701765e+04 -3.74429067e+03
  4.70667259e+03 -1.14725827e+02 -2.72305087e+02  6.55158765e+03
 -1.91490974e+02 -8.05641209e+03 -4.13746909e+02 -1.99571886e+02
  1.94956885e+03  4.04013143e+03  7.45102792e+03 -5.00900825e+03
  1.09764228e+03 -1.19023991e+04 -1.15695170e+04 -8.19266712e+02
  4.30915846e+03  4.92079572e+03  9.40683643e+03 -1.42624061e+03
  1.45173758e+03 -4.16542567e+03 -1.53139009e+03 -2.42290650e+03
 -2.75493890e+02  1.62960446e+03 -5.28338291e+03 -1.24371667e+04
 -2.47346765e+03  5.6

In [167]:
# The numeric transformer comes first in the ColumnTransformer, so the leading
# coefficients are read off in the order of `numerical_features`.
print("Numerical Features:")
for position, feature_name in enumerate(numerical_features):
    print(f"{feature_name}: {feature_importances[position]}")

Numerical Features:
rooms: -9179.42020915656
area: 17416.77507687507
age: -11397.30887959742
floor: 23330.743462874794


In [171]:
# Print every one-hot category next to its own coefficient.
#
# The coefficient vector is laid out as: the numeric features first, then one
# contiguous block of columns per categorical feature, in the order the encoder
# stores them in `categories_`. A running offset is therefore advanced past
# each block; indexing every block from len(numerical_features) alone would
# pair the categories of the later features with the coefficients belonging to
# the first categorical feature.
print("Feature importances:")
encoder = model.named_steps['preparation'].named_transformers_['cat']
block_start = len(numerical_features)
for feature_categories in encoder.categories_:
    for j, category in enumerate(feature_categories):
        print(category, feature_importances[block_start + j])
    # Move to the first column of the next categorical feature's block.
    block_start += len(feature_categories)

Feature importances:
adana-cukurova-belediye-evleri-kiralik -39923.49364214614
adana-cukurova-guzelyali-kiralik 101288.53700149996
adana-cukurova-kabasakal-kiralik -14085.18415935337
adana-cukurova-karslilar-kiralik -3382.943694309514
adana-cukurova-yurt-kiralik -41586.86333461689
adana-saricam-buruk-cumhuriyet-kiralik -2602.2674569243322
adana-saricam-elif-su-uludag-kiralik -13924.843190062984
adana-saricam-gultepe-kiralik -14095.948205777748
adana-seyhan-resatbey-kiralik -5316.630779609575
ankara-altindag-aydinlikevler-kiralik 3198.2969180504188
ankara-cankaya-100-yil-kiralik -10767.543880971081
ankara-cankaya-asagi-ovecler-kiralik 1413.5115550148184
ankara-cankaya-ata-kiralik 1480.1850281891373
ankara-cankaya-aydinlar-kiralik 13417.267053936977
ankara-cankaya-bahcelievler-kiralik 5528.380215316069
ankara-cankaya-beytepe-kiralik -27053.06661882997
ankara-cankaya-buyukesat-kiralik -15615.610825893866
ankara-cankaya-camlitepe-kiralik -6654.878636237417
ankara-cankaya-cigdem-kiralik -18

In [173]:
# Single hand-written listing used to try out the fitted pipeline.
# NOTE(review): the categories printed from the encoder look like lowercase
# slugs, so "Istanbul" / "Kadikoy" / "Moda" are presumably unseen values that
# the encoder ignores — verify against the training data before trusting this
# prediction.
sample_listing = {
    "city": "Istanbul",
    "district": "Kadikoy",
    "neighborhood": "Moda",
    "rooms": 3,
    "area": 120,
    "age": 5,
    "floor": 2,
}
new_data = pd.DataFrame([sample_listing])

predicted_price = model.predict(new_data)
print(predicted_price)

[39378.12055106]
