In [None]:
import pandas as pd
from scipy import stats
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
import gdown
import warnings
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [None]:
!gdown --folder https://drive.google.com/drive/folders/1RZJdNpCfbMt2SB_kfWsxQgOk5LeuV7lw

Due to the memory limitations, the data was loaded in chunks of 200000 rows which are concatenated at the end, resulting in the "divar" dataframe with 1000000 records.

In [None]:
chunks = pd.read_csv("Divar Dataset/Divar.csv", chunksize=200000)
divar = pd.concat(chunks, ignore_index=True)
city_df = pd.read_csv("Divar Dataset/iran_city_classification.csv")

## 1. Comparison of Residential Property Sizes: Metropolises vs. Small Cities

As you can see, to access the type of cities, we need to merge two dataframes on "city_slug" column.

In [3]:
city_df.rename(columns={'نام شهر': 'city_slug', 'دسته‌بندی': 'city_type'}, inplace=True)
city_df["city_type"].value_counts()

city_type
شهر کوچک    231
کلان‌شهر      9
Name: count, dtype: int64

In [4]:
merged_df = pd.merge(divar, city_df, on='city_slug', how='left')
# Now we know the type of cities for each ads
merged_df[["cat2_slug", "title", "city_slug", "city_type"]].tail(5)

Unnamed: 0,cat2_slug,title,city_slug,city_type
999995,residential-sell,آپارتمان ۱۸۰ متری وحدت غربی,kermanshah,کلان‌شهر
999996,residential-rent,آپارتمان ۱۱۰ متری سعادت آباد دریا,tehran,کلان‌شهر
999997,residential-sell,منزل فروشی. خیابان انقلاب نرسیده به کارخانه...,yazd,شهر کوچک
999998,temporary-rent,مجتمع ویلایی کنار ساحل پاسداران,bandar-anzali,شهر کوچک
999999,residential-rent,۴۰ متر/ یک خواب /فول بازسازی,tehran,کلان‌شهر


Unfortunatelty, there are 36723 cities in our advertisements that their name are not in the second given dataframe. these records cannot contribute to our hypothesis test since we are not able to divide "building_size" data base on "city_type". </br>
**!!Note!!**: the "iran_city_classification.csv" is not a complete list of all small cities in Iran.

In [5]:
null_city_cat = merged_df[merged_df["city_type"].isnull() == True]
null_city_cat[["cat2_slug", "title", "city_slug", "city_type"]]

Unnamed: 0,cat2_slug,title,city_slug,city_type
1803,commercial-sell,فروش تجاری ۶۱.۵ متر دو بر,mehran,
1817,commercial-rent,اول خیابان کجو کف و دیوارها سرامیک نیم طبق بزرگ,fasa-city,
1858,residential-rent,اجاره واحد همکف مسکن مهر,nurabad,
1882,residential-sell,ویلا باغچه مدرن 211 متری شهرکی سند تک‌برگ,raheem-abad,
1899,residential-rent,اجاره ویلایی ۶۵ متری بازسازی شده,langarud,
...,...,...,...,...
999919,residential-sell,خانه سراب ایوان,eyvan,
999952,commercial-rent,رهن و اجاره ی مغازه هلال احمر(ورودی خیابان هیمن),kamyaran,
999964,residential-sell,فروش 13هزار متر زمین,bukan,
999978,residential-sell,۵۰۰متر زمین ۲۵۰ متر بنا,takestan,


### The alternative and null hypothesis</br>
---

Null hypothesis (H0): The average "building_size" in metropolitan cities is not smaller than in small cities.</br>

Alternative hypothesis (H1): The average "building_size in metropolitan cities is smaller than in non-metropolitan cities.</br>

$$
H_0:\ \mu_{\text{big city}} \ge \mu_{\text{small city}}
$$

$$
H_1:\ \mu_{\text{big city}} < \mu_{\text{small city}}
$$

Step 1: Data prepration based on our hypothesis

In [6]:
grouped = merged_df.groupby('city_type')

df_metro = grouped.get_group('کلان‌شهر')
df_small = grouped.get_group('شهر کوچک')

building_size_metro = df_metro[["city_type", "building_size"]].copy()
building_size_small = df_small[["city_type", "building_size"]].copy()

print("For now we will test our hypothesis with:")
print(f"Number of ads in metropolitan cities: {df_metro.shape[0]}")
print(f"Number of ads in small cities: {df_small.shape[0]}")
print(f"Total valid (before removong null values in building size) in this test: {df_metro.shape[0] + df_small.shape[0]}")

For now we will test our hypothesis with:
Number of ads in metropolitan cities: 464799
Number of ads in small cities: 498478
Total valid (before removong null values in building size) in this test: 963277


In [7]:
building_size_metro.head(2)

Unnamed: 0,city_type,building_size
0,کلان‌شهر,500.0
1,کلان‌شهر,60.0


In [8]:
building_size_small.head(2)

Unnamed: 0,city_type,building_size
8,شهر کوچک,78.0
10,شهر کوچک,87.0


Step 2: Checking the assumption of normality to decide which statistical test to use

In [9]:
# Before shapiro test for checking normality, the column "building_size" should not have null/nan values
cleaned_small = building_size_small["building_size"].dropna()
cleaned_metro = building_size_metro["building_size"].dropna()

# shapiro test for both datasets
stat_metro, pvalue_metro = stats.shapiro(cleaned_metro)
stat_small, pvalue_small = stats.shapiro(cleaned_small)

alpha = 0.05

print(f"P-value Metro: {pvalue_metro:.4f}")
print(f"P-value Small: {pvalue_small:.4f}")
print("=" * 50)

if pvalue_metro > alpha:
  print("Fail to Reject H0: The building_size dataset for metropolitan city IS likely NORMAL (P > alpha).")
else:
  print("Reject H0: The building_size dataset for metropolitan city IS NOT Normal (P <= alpha).")

if pvalue_small > alpha:
  print("Fail to Reject H0: The building_size dataset for small city IS likely NORMAL (P > alpha).")
else:
  print("Reject H0: The building_size dataset for small city IS NOT Normal (P <= alpha).")

P-value Metro: 0.0000
P-value Small: 0.0000
Reject H0: The building_size dataset for metropolitan city IS NOT Normal (P <= alpha).
Reject H0: The building_size dataset for small city IS NOT Normal (P <= alpha).


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


Step3: Both dataframes don't have normal distribution (expectedly). So,we should use **non-parametric** test like **Mann-Whitney** test instead of z_test or t_test.

In [10]:
stat, pvalue = stats.mannwhitneyu(cleaned_metro, cleaned_small, alternative='less')

print(f"P-value: {pvalue:.4f}")
print("=" * 50)

if pvalue < alpha:
  print("Reject H0. There is strong evidence that average building size in metropolitan cities is smaller.")
else:
  print("Fail to Reject H0. There is not enough evidence to support the hypothesis.")

P-value: 0.0000
Reject H0. There is strong evidence that average building size in metropolitan cities is smaller.


## 2. Comparison of Building Sizes Between Old and New Houses

We clean the `construction_year` feature by replacing non-numeric categories, converting Persian digits to English, removing missing values, and casting the column to integers. This allows us to perform numerical operations such as filtering old and new houses based on the construction year.

In [11]:
divar_1 = divar.copy()
divar_1.loc[divar_1['construction_year'] == 'قبل از ۱۳۷۰', 'construction_year'] = '۱۳۷۰'
def persian_to_english_number(s):
    if pd.isna(s):
        return None
    persian_digits = "۰۱۲۳۴۵۶۷۸۹"
    english_digits = "0123456789"
    return s.translate(str.maketrans(persian_digits, english_digits))
divar_1['construction_year'] = divar_1['construction_year'].apply(persian_to_english_number).astype(float)
divar_1 = divar_1[divar_1['construction_year'].notna()]
divar_1['construction_year'] = divar_1['construction_year'].astype(int)
divar_1[['construction_year']].value_counts()

construction_year
1403                 116260
1390                  59139
1402                  58424
1400                  53674
1395                  53029
1398                  38207
1397                  36326
1396                  35487
1401                  35328
1385                  34065
1399                  29594
1393                  29094
1392                  26130
1394                  26110
1388                  24268
1380                  23480
1370                  20637
1389                  16755
1391                  16316
1387                  14136
1386                  13468
1383                   9894
1384                   8494
1375                   7247
1382                   6965
1371                   5531
1381                   3590
1378                   3025
1379                   2415
1377                   2117
1372                   1914
1373                   1827
1376                   1593
1374                   1289
Name: count, dtype: int64

In [12]:
old_houses = divar_1[divar_1['construction_year'] < 1396]['building_size'].dropna()
new_houses = divar_1[divar_1['construction_year'] >= 1396]['building_size'].dropna()

We want to test the claim: "Houses built in the past were more spacious."
- Old houses: constructed before 1396  
- New houses: constructed from 1396 onward
- Variable of interest: building size (area of the house in square meters)

### The alternative and null hypothesis</br>
---

- **Null Hypothesis ($H_0$):** The average size of old houses ≤ the average size of new houses  
$$H_0: \mu_\text{old} \le \mu_\text{new}$$

- **Alternative Hypothesis ($H_a$):** The average size of old houses > the average size of new houses  
$$H_a: \mu_\text{old} > \mu_\text{new}$$

- This is a one-tailed test.

In [13]:
t_stat, p_value = ttest_ind(old_houses, new_houses, equal_var=False)
p_one_tailed = p_value / 2
t_stat, p_one_tailed

(np.float64(-2.021856179663078), np.float64(0.02159577035347379))

### Interpretation

`t_stat < 0` and `p_one_tailed < 0.05`: Reject $H_0$ → Old houses are on average larger  
The data suggest that new houses are on average slightly larger than old houses.

## 3. Impact of Business Deed on Commercial Property Prices

In this section, we investigate whether owning a business deed significantly affects the sale price of commercial properties. 
We focus on commercial properties listed for sale, including shops, offices, and industrial/agricultural businesses. 
Prices are compared between properties with and without a business deed using the Mann-Whitney U test, a non-parametric test suitable for comparing two independent groups.


In [14]:
df = divar.copy()
df_commercial_sale = df[
    (df['cat2_slug'] == 'commercial-sell') &
    (df['cat3_slug'].isin([
        'shop-sell', 'office-sell', 'industry-agriculture-business-sell'
    ]))
].copy()
df_valid = df_commercial_sale[
    df_commercial_sale['has_business_deed'].isin([True, False])
].copy()
df_valid = df_valid[df_valid['price_value'].notna()]
group_true = df_valid[df_valid['has_business_deed'] == True]['price_value']
group_false = df_valid[df_valid['has_business_deed'] == False]['price_value']
u_stat, p_value = mannwhitneyu(group_true, group_false, alternative='two-sided')
print("U-statistic:", u_stat)
print("p-value:", p_value)

if p_value < 0.05:
    print("Result: Reject H0 → Owning a business deed significantly affects the average price.")
else:
    print("Result: Fail to reject H0 → Owning a business deed does not significantly affect the average price.")

U-statistic: 113571227.0
p-value: 3.0447309449526764e-60
Result: Reject H0 → Owning a business deed significantly affects the average price.


## 4. Impact of Luxury and Non-Luxury Amenities on Property Prices

In this section, we examine whether the presence of certain amenities influences the property price. 
Amenities are divided into two categories: **luxury features** (pool, barbecue, sauna, jacuzzi) and **non-luxury features** (balcony, elevator, parking, warehouse). 
We use an independent two-sample t-test to compare the prices of properties with and without each feature to determine if the differences are statistically significant. 
Additionally, we check whether the distribution of property prices is approximately normal using the Shapiro-Wilk test.

In [15]:
df = divar.copy()

df_sales = df[df['price_value'] > 0].dropna(subset=['price_value'])

luxury_features = ['has_pool', 'has_barbecue', 'has_sauna', 'has_jacuzzi']
non_luxury_features = ['has_balcony', 'has_elevator', 'has_parking', 'has_warehouse']  

def test_feature_impact(df, feature):
    with_feature = df[df[feature] == True]['price_value']
    without_feature = df[df[feature] == False]['price_value']
    if len(with_feature) > 1 and len(without_feature) > 1:
        t_stat, p_value = stats.ttest_ind(with_feature, without_feature)
        return p_value < 0.05  
    return False

luxury_impact = [test_feature_impact(df_sales, f) for f in luxury_features]
print("Significant impact of luxury features:", all(luxury_impact))  

non_luxury_impact = [test_feature_impact(df_sales, f) for f in non_luxury_features]
print("Significant impact of non-luxury features:", any(non_luxury_impact))  

stat, p = stats.shapiro(df_sales['price_value'].sample(500))  
print("Price distribution normality:", p > 0.05)

Significant impact of luxury features: False
Significant impact of non-luxury features: True
Price distribution normality: False
