# ماشین لرنینگ

## فصل رگرسیون خطی

### قسمت اول



In [240]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels
from sklearn.preprocessing import LabelEncoder , MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [241]:
df = pd.read_csv('/content/sample_data/HomeTehran.csv')
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [242]:
df.duplicated().sum()

np.int64(208)

In [243]:
df.drop_duplicates(inplace=True)

In [244]:
df.describe()

Unnamed: 0,Room,Price,Price(USD)
count,3271.0,3271.0,3271.0
mean,2.086518,5455162000.0,181838.7
std,0.763165,8244387000.0,274812.9
min,0.0,3600000.0,120.0
25%,2.0,1419500000.0,47316.67
50%,2.0,2960000000.0,98666.67
75%,2.0,6132000000.0,204400.0
max,5.0,92400000000.0,3080000.0


In [245]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3271 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3271 non-null   object 
 1   Room        3271 non-null   int64  
 2   Parking     3271 non-null   bool   
 3   Warehouse   3271 non-null   bool   
 4   Elevator    3271 non-null   bool   
 5   Address     3248 non-null   object 
 6   Price       3271 non-null   float64
 7   Price(USD)  3271 non-null   float64
dtypes: bool(3), float64(2), int64(1), object(2)
memory usage: 162.9+ KB


In [246]:
df['Area'] = df['Area'].astype(str).str.replace(",", "").str.strip().astype(np.int64)
for i in df.columns:
    if i == "Address":
        continue
    df[i] = df[i].astype(np.int64)

In [247]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3271 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Area        3271 non-null   int64 
 1   Room        3271 non-null   int64 
 2   Parking     3271 non-null   int64 
 3   Warehouse   3271 non-null   int64 
 4   Elevator    3271 non-null   int64 
 5   Address     3248 non-null   object
 6   Price       3271 non-null   int64 
 7   Price(USD)  3271 non-null   int64 
dtypes: int64(7), object(1)
memory usage: 230.0+ KB


In [248]:
address_mean_price = df.groupby("Address")["Price"].mean().reset_index()
address_mean_price['Price_zone'] = pd.qcut(address_mean_price['Price'], q=5, labels=['very low' , 'low', 'medium', 'high' , 'very high'])
address_mean_price.head()

Unnamed: 0,Address,Price,Price_zone
0,Abazar,8594130000.0,very high
1,Abbasabad,3230000000.0,medium
2,Absard,4233333000.0,high
3,Abuzar,1528333000.0,low
4,Afsarieh,1866667000.0,low


In [249]:
df = df.merge(address_mean_price[['Address', 'Price_zone']], on='Address', how='left')
df

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD),Price_zone
0,63,1,1,1,1,Shahran,1850000000,61666,medium
1,60,1,1,1,1,Shahran,1850000000,61666,medium
2,79,2,1,1,1,Pardis,550000000,18333,very low
3,95,2,1,1,1,Shahrake Qods,902500000,30083,very low
4,123,2,1,1,1,Shahrake Gharb,7000000000,233333,very high
...,...,...,...,...,...,...,...,...,...
3266,63,1,1,1,0,Feiz Garden,1890000000,63000,medium
3267,86,2,1,1,1,Southern Janatabad,3500000000,116666,medium
3268,83,2,1,1,1,Niavaran,6800000000,226666,very high
3269,105,2,1,1,1,Dorous,5600000000,186666,very high


In [250]:
encoder = LabelEncoder()
df['Price_zone_encoding'] = encoder.fit_transform(df['Price_zone'])
df

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD),Price_zone,Price_zone_encoding
0,63,1,1,1,1,Shahran,1850000000,61666,medium,2
1,60,1,1,1,1,Shahran,1850000000,61666,medium,2
2,79,2,1,1,1,Pardis,550000000,18333,very low,4
3,95,2,1,1,1,Shahrake Qods,902500000,30083,very low,4
4,123,2,1,1,1,Shahrake Gharb,7000000000,233333,very high,3
...,...,...,...,...,...,...,...,...,...,...
3266,63,1,1,1,0,Feiz Garden,1890000000,63000,medium,2
3267,86,2,1,1,1,Southern Janatabad,3500000000,116666,medium,2
3268,83,2,1,1,1,Niavaran,6800000000,226666,very high,3
3269,105,2,1,1,1,Dorous,5600000000,186666,very high,3


In [251]:
df['Address'] = encoder.fit_transform(df['Address'])
df

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD),Price_zone,Price_zone_encoding
0,63,1,1,1,1,156,1850000000,61666,medium,2
1,60,1,1,1,1,156,1850000000,61666,medium,2
2,79,2,1,1,1,117,550000000,18333,very low,4
3,95,2,1,1,1,152,902500000,30083,very low,4
4,123,2,1,1,1,150,7000000000,233333,very high,3
...,...,...,...,...,...,...,...,...,...,...
3266,63,1,1,1,0,52,1890000000,63000,medium,2
3267,86,2,1,1,1,163,3500000000,116666,medium,2
3268,83,2,1,1,1,105,6800000000,226666,very high,3
3269,105,2,1,1,1,39,5600000000,186666,very high,3


In [252]:
df.drop(['Price_zone' , 'Price(USD)'], axis=1, inplace=True)

In [253]:
df

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price_zone_encoding
0,63,1,1,1,1,156,1850000000,2
1,60,1,1,1,1,156,1850000000,2
2,79,2,1,1,1,117,550000000,4
3,95,2,1,1,1,152,902500000,4
4,123,2,1,1,1,150,7000000000,3
...,...,...,...,...,...,...,...,...
3266,63,1,1,1,0,52,1890000000,2
3267,86,2,1,1,1,163,3500000000,2
3268,83,2,1,1,1,105,6800000000,3
3269,105,2,1,1,1,39,5600000000,3


In [254]:
df.duplicated().sum()

np.int64(0)

In [255]:
for col in df.columns:
    if col == "Price":
        continue
    Q1 , Q3 = np.quantile(df[col] , [0.25 , 0.75])
    IQR = Q3 - Q1
    max_IQR = Q3 + 1.5 * IQR
    min_IQR = Q1 - 1.5 * IQR

    df.loc[df[col] > max_IQR , col] = np.nan
    df.loc[df[col] < min_IQR , col] = np.nan

    df.fillna(df[col].mean() , inplace=True)
df

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price_zone_encoding
0,63.0,2.0,1.0,1.0,1.0,156.0,1850000000,2.0
1,60.0,2.0,1.0,1.0,1.0,156.0,1850000000,2.0
2,79.0,2.0,1.0,1.0,1.0,117.0,550000000,4.0
3,95.0,2.0,1.0,1.0,1.0,152.0,902500000,4.0
4,123.0,2.0,1.0,1.0,1.0,150.0,7000000000,3.0
...,...,...,...,...,...,...,...,...
3266,63.0,2.0,1.0,1.0,1.0,52.0,1890000000,2.0
3267,86.0,2.0,1.0,1.0,1.0,163.0,3500000000,2.0
3268,83.0,2.0,1.0,1.0,1.0,105.0,6800000000,3.0
3269,105.0,2.0,1.0,1.0,1.0,39.0,5600000000,3.0


In [256]:
df['Price'] = np.log2(df['Price'])

In [257]:
Y = df['Price']
X = df.drop('Price' , axis=1)
X

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price_zone_encoding
0,63.0,2.0,1.0,1.0,1.0,156.0,2.0
1,60.0,2.0,1.0,1.0,1.0,156.0,2.0
2,79.0,2.0,1.0,1.0,1.0,117.0,4.0
3,95.0,2.0,1.0,1.0,1.0,152.0,4.0
4,123.0,2.0,1.0,1.0,1.0,150.0,3.0
...,...,...,...,...,...,...,...
3266,63.0,2.0,1.0,1.0,1.0,52.0,2.0
3267,86.0,2.0,1.0,1.0,1.0,163.0,2.0
3268,83.0,2.0,1.0,1.0,1.0,105.0,3.0
3269,105.0,2.0,1.0,1.0,1.0,39.0,3.0


In [258]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [259]:
model = LinearRegression()
model.fit(X_train, Y_train)

In [260]:
Y_pred_train = model.predict(X_train)
Y_pred_test = model.predict(X_test)

In [261]:
print(f"This Is a Mean Absolute ERROR Train: {mean_absolute_error(Y_train, Y_pred_train)}")
print(f"This Is a Mean Absolute ERROR Test: {mean_absolute_error(Y_test, Y_pred_test)}")

This Is a Mean Absolute ERROR Train: 0.9315238488202542
This Is a Mean Absolute ERROR Test: 0.8967491872011415


In [262]:
print(f"This Is a Mean Squared ERROR Train: {mean_squared_error(Y_train, Y_pred_train)}")
print(f"This Is a Mean Squared ERROR Test: {mean_squared_error(Y_test, Y_pred_test)}")

This Is a Mean Squared ERROR Train: 1.7201556167628214
This Is a Mean Squared ERROR Test: 1.5649570541271023


In [263]:
print(f"This Is a R2 Score Train: {r2_score(Y_train, Y_pred_train)}")
print(f"This Is a R2 Score Test: {r2_score(Y_test, Y_pred_test)}")

This Is a R2 Score Train: 0.3237336301436191
This Is a R2 Score Test: 0.34748726075482794
