## statistics for decision making- Anova and p-value test

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
property_df = pd.read_csv("property.csv")

In [3]:
property_df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
property_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [5]:
property_df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


###  1. For the suburb Altona, it is postulated that a typical property sells for 800,000 dollars. Use the data at hand to test this assumption. Is the typical property price really 800,000 dollars or has it increased? Use a significance level of 5%.

H0: Mean of property sale price is $800,000.

H1: Mean of property sale price is greater than $800,000.

In [6]:
altona_df = property_df[property_df["Suburb"] =="Altona"]["Price"]

In [7]:
altona_mean = altona_df.mean()
altona_std = altona_df.std()
altona_n = len(altona_df)
expected_price = 800000
significance_level = 0.05

In [8]:
stats.norm.cdf(expected_price,altona_mean,altona_std) 

0.45245231921952184

In [9]:
altona_zscore = (expected_price - altona_mean ) / (altona_std) 

In [10]:
stats.norm.cdf(altona_zscore)

0.45245231921952184

In [11]:
altona_p_value = 1-stats.norm.cdf(altona_zscore) 

In [12]:
if altona_p_value < significance_level:
    print("Null Hypothesis is rejected")
else:
    print("Failed to reject Null Hypothesis")

Failed to reject Null Hypothesis


Since we failed to reject Null Hypothesis, we can say that there is not sufficient evidence that property prices in Altona
have increased

###  2  For the year 2016, is there any difference in prices of properties sold in the summer months vs winter months? Consider months from October till March as winter months and rest as summer months. Use a significance level of 5%. 

In [14]:
property_df['Date'] = pd.to_datetime(property_df['Date'], format='%d/%m/%Y') 

In [15]:
property_df['Month'] = property_df['Date'].dt.month 

In [16]:
def categoriser(month):
    if 4 <= month <= 9:
        return "Summer"
    else:
        return "Winter"               

In [17]:
property_df["Season"] = property_df['Month'].apply(categoriser)

In [18]:
df_2016 = property_df[property_df["Date"].dt.year == 2016]
df_2016.head()           

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Month,Season
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,2016-12-03,2.5,3067.0,...,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0,12,Winter
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,2016-02-04,2.5,3067.0,...,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0,2,Winter
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,2016-06-04,2.5,3067.0,...,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0,6,Summer
5,Abbotsford,129 Charles St,2,h,941000.0,S,Jellis,2016-05-07,2.5,3067.0,...,181.0,,,Yarra,-37.8041,144.9953,Northern Metropolitan,4019.0,5,Summer
6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,2016-05-07,2.5,3067.0,...,245.0,210.0,1910.0,Yarra,-37.8024,144.9993,Northern Metropolitan,4019.0,5,Summer


In [19]:
Summer_Sample = df_2016[df_2016["Season"] == "Summer"]["Price"]
Winter_Sample = df_2016[df_2016["Season"] == "Winter"]["Price"] 

H0: There is no difference in price of properties being sold in Summer and Winter months.

H1: There is a difference in price of properties being sold in Summer and Winter months.

In [20]:
statistic_lav, lavene_p_value = stats.levene(Summer_Sample, Winter_Sample)
lavene_p_value         

0.0026217307900145955

Since lavene p value is less than significance value of 0.05, we can say, the variances have a significant difference

In [21]:
t_stat_2016,p_val_2016 = stats.ttest_ind(Summer_Sample, Winter_Sample, equal_var=False) 

In [22]:
if p_val_2016 < significance_level:
    print("Null Hypothesis is rejected")
else:
    print("Fail to reject Null Hypothesis")

Null Hypothesis is rejected


Since Null Hypothesis is rejected we can say there is a significant difference in properties being sold in Summer and Winter months

### 3  For the suburb Abbotsford, what is the probability that out of 10 properties sold, 3 will not have a car parking? Use the column car in the dataset. Round off your answer to 3 decimal places.

In [23]:
Abbotsford_data = property_df[property_df["Suburb"] == "Abbotsford"] 

In [24]:
Q3_data = Abbotsford_data[Abbotsford_data["Car"] == 0] 

In [25]:
Q3_p_success = len(Q3_data)/len(Abbotsford_data)
Q3_p_success                                      

0.26785714285714285

In [26]:
Q3_trials = 10
Q3_Fav_outcomes = 3    

In [27]:
Q3_Probability = stats.binom.pmf(Q3_Fav_outcomes,Q3_trials,Q3_p_success)
Q3_Probability           

0.2600529293316224

In [28]:
round(Q3_Probability,3)    

0.26

We can say for the suburb Abbotsford, the probability that out of 10 properties sold, 3 will not have a car parking will be 26%

### 4. In the suburb Abbotsford, what are the chances of finding a property with 3 rooms? Round your answer to 3 decimal places.

We are going to use poisson pmf to get the probability in this question. For that we need mean of rooms in Abbotsford.

In [30]:
Abb_room_mean = Abbotsford_data["Rooms"].mean()
Abb_room_mean

2.4107142857142856

In [31]:
Exp_room_abb = 3

In [32]:
Q4_probability = stats.poisson.pmf(Exp_room_abb,Abb_room_mean)    
Q4_probability                 

0.20956852771677797

In [35]:
round(Q4_probability,3)           

0.21

We can say in the suburb Abbotsford the probability of finding a property with 3 rooms is 20.9% ~ 21%

### 5 In the suburb Abbotsford, what are the chances of finding a property with 2 bathrooms? Round your answer to 3 decimal places. 

We are going to use poisson pmf to get the probability in this question. For that we need mean of bathrooms in Abbotsford.

In [36]:
Abb_bathroom_mean = Abbotsford_data["Bathroom"].mean()
Abb_bathroom_mean

1.4107142857142858

In [37]:
Exp_bath_abb = 2

In [38]:
Q5_probability = stats.poisson.pmf(Exp_bath_abb,Abb_bathroom_mean)           
Q5_probability

0.24276311589411867

In [39]:
round(Q5_probability,3)                                       

0.243

We can say that, in the suburb Abbotsford, the chances of finding a property with 2 bathrooms is 24.3%

-------End------------