In [14]:
import pandas as pd
import re

In [15]:
def to_snake_case(string):
    """Convert a label to snake_case.

    Every run of whitespace is replaced by a single underscore and the result
    is lower-cased. Leading/trailing whitespace is not stripped; it becomes an
    underscore like any other whitespace run.

    Parameters
    ----------
    string : str
        The label to convert (used in this notebook on CSV column names).

    Returns
    -------
    str
        The converted label.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    string = re.sub(r'\s+', '_', string)
    return string.lower()

In [16]:
# Load the raw dataset, normalise the column names to snake_case and add a
# 'sampling' column initialised to 0 for every row.
data = (
    pd.read_csv("../../Data/Dataset_v1.csv", sep=";")
    .rename(columns=to_snake_case)
    .assign(sampling=0)
)


In [17]:
# Share of rows per property type and per province, each count divided by the
# total number of rows in `data`; displayed together as a tuple of two Series.
(data['type_of_property'].value_counts() / len(data), data['province'].value_counts() / len(data))

(House                 0.524733
 Apartment / Flat      0.334260
 Townhouse             0.140934
 Vacant Land / Plot    0.000073
 Name: type_of_property, dtype: float64,
 gauteng          0.449949
 western-cape     0.161496
 kwazulu-natal    0.139982
 north-west       0.076248
 mpumalanga       0.056856
 eastern-cape     0.039734
 free-state       0.038636
 limpopo          0.029197
 northern-cape    0.007903
 Name: province, dtype: float64)

In [18]:
def _percent_of(counts, denominator):
    """Return each count as a percentage of `denominator` (NaN when it is 0)."""
    if denominator == 0:
        return [float('nan')] * len(counts)
    return [count * 100 / denominator for count in counts]


def get_type_property_per_province_proportion(data: pd.DataFrame) -> pd.DataFrame:
    """Summarise how three property types are distributed across provinces.

    For each of "House", "Apartment / Flat" and "Townhouse" the result holds
    three rows:

    - ``<type>``: number of rows of that type in each province;
    - ``<type> %``: that number as a percentage of all rows of that type;
    - ``<type> % over all properties``: that number as a percentage of all
      rows in `data` (whatever their property type).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'type_of_property' and 'province'.

    Returns
    -------
    pandas.DataFrame
        Nine rows (three per property type, in the order above) and one column
        per distinct province, sorted ascending. When `data` holds no row of a
        given type, its ``<type> %`` row is NaN instead of raising
        ZeroDivisionError.
    """
    property_types = ("House", "Apartment / Flat", "Townhouse")
    provinces = data['province'].sort_values(ascending=True).unique().tolist()
    total_rows = len(data)

    labels = []
    rows = []
    for property_type in property_types:
        provinces_of_type = data.loc[data['type_of_property'] == property_type, 'province']
        # Count per province, aligned on the full province list so that
        # provinces without this property type get an explicit 0.
        counts = provinces_of_type.value_counts().reindex(provinces, fill_value=0).tolist()

        labels += [property_type, property_type + " %", property_type + " % over all properties"]
        rows += [
            counts,
            _percent_of(counts, len(provinces_of_type)),
            _percent_of(counts, total_rows),
        ]

    return pd.DataFrame(rows, index=labels, columns=provinces)


La fonction `stratified_sample` ci-dessous est adaptée de https://www.kaggle.com/flaviobossolan/stratified-sampling-python et modifiée selon les besoins de ce notebook : les lignes tirées sont marquées dans la colonne `sampling` du DataFrame.

In [19]:
def stratified_sample(df, strata, size=None, seed=None, keep_index= True, sampling_code=0):
    '''
    Mark a proportionate stratified sample of the not-yet-sampled rows of `df`.

    Only rows whose 'sampling' column equals 0 are candidates. For every
    combination of values of the `strata` columns, the number of rows drawn is
        n1 = round((N1/N) * n)
    where:
        - n1 is the sample size of the stratum
        - N1 is the number of unsampled rows in the stratum
        - N is the total number of unsampled rows
        - n is the requested sampling size
    and n1 is capped at N1. Because each stratum is rounded independently, the
    total number of rows drawn can differ slightly from n.

    The drawn rows are not returned separately: their 'sampling' value is set
    to `sampling_code` directly in `df` (the frame is modified in place).
    Rows are marked through their index labels, so `df` is expected to have a
    unique index.

    Parameters
    ----------
    :df: pandas dataframe to sample from. Must contain a 'sampling' column in
        which 0 means "not sampled yet".
    :strata: list of column names (or a single column name) used as strata.
        Rows with a missing value in a stratum column are never drawn.
    :size: sampling size.
        - an absolute number of rows when >= 1;
        - a proportion of the unsampled population when 0 < size < 1;
        - if None, it is computed with Cochran's adjusted formula:
            cochran_n = (Z**2 * p * q) / e**2
            adjusted  = cochran_n / (1 + (cochran_n - 1) / N)
          with Z = 1.96 (95% confidence), p = q = 0.5, a margin of error
          e = 0.02 and N the unsampled population size.
    :seed: random state passed to `DataFrame.sample` for every stratum.
    :keep_index: kept for backward compatibility; it has no effect because the
        function marks rows in `df` instead of building a new dataframe.
    :sampling_code: value written into 'sampling' for the drawn rows. It should
        be non-zero, otherwise the drawn rows stay flagged as unsampled.

    Returns
    -------
    The same `df` object, with 'sampling' updated. It is returned unchanged
    when no unsampled row is left.

    Raises
    ------
    ValueError
        If `size` is negative.

    Examples
    --------
    >> df.head()
    	id  sex age city sampling
    0	123 M   20  XYZ  0
    1	456 M   25  XYZ  0
    2	789 M   21  YZX  0
    3	987 F   40  ZXY  0
    4	654 M   45  ZXY  0
    ...
    # Flag about 30% of the rows, stratified by sex and city, with code 1
    >> stratified_sample(df=df, strata=['sex', 'city'], size=0.3, sampling_code=1)
    Requirements
    ------------
    - pandas
    '''
    strata = [strata] if isinstance(strata, str) else list(strata)

    unsampled = df['sampling'] == 0
    population = int(unsampled.sum())
    if population == 0:
        # Nothing left to draw from; avoids a division by zero below.
        return df

    if size is None:
        cochran_n = round((1.96 ** 2 * 0.5 * 0.5) / 0.02 ** 2)
        size = round(cochran_n / (1 + (cochran_n - 1) / population))
    elif size < 0:
        raise ValueError('Parameter "size" must be a row count or a proportion between 0 and 1.')
    elif size < 1:
        size = round(population * size)

    # Unsampled rows per stratum (groups with a missing stratum value are dropped).
    stratum_sizes = df.loc[unsampled, strata].groupby(strata).size()
    sample_sizes = (size / population * stratum_sizes).round().astype(int)
    # Never ask for more rows than a stratum holds: `sample` would raise.
    sample_sizes = sample_sizes.clip(upper=stratum_sizes)

    for key, n in sample_sizes.items():
        if n <= 0:
            continue
        values = key if len(strata) > 1 else (key,)

        # Boolean mask instead of a string-built query: works for any column
        # name and any value (spaces, quotes, ...).
        in_stratum = df['sampling'] == 0  # only the rows that were not yet sampled
        for column, value in zip(strata, values):
            in_stratum &= df[column] == value

        chosen = df[in_stratum].sample(n=int(n), random_state=seed)
        df.loc[chosen.index, 'sampling'] = sampling_code

    return df

In [20]:
# Reload the raw dataset so this cell always starts from a frame in which no row
# has been assigned yet: stratified_sample writes into the 'sampling' column of
# the frame it receives.
data= pd.read_csv("../../Data/Dataset_v1.csv",sep=";")

data= data.rename(to_snake_case, axis='columns')

data['sampling']=0  # 0 = not assigned to any split yet

# Split definitions: 'code' is the value written into 'sampling',
# 'size' the requested number of rows (70% / 15% / 15% of the dataset).
sampling={"Train": {"code": 1, "size": round(len(data)*70 /100)}, 
            "Test": {"code": 2, "size": round(len(data)*15 /100)+1}, 
            "Validation": {"code": 3, "size": round(len(data)*15 /100)+1 }
        }

strata_columns = ['province','type_of_property']

#generate sampling
for elt in sampling:
    sample= sampling[elt]
    remaining = len(data[data['sampling'] == 0])
    if remaining == 0:
        print(elt, ": no unsampled rows left")
        continue
    # Per-stratum rounding in earlier passes can leave fewer rows than requested;
    # never ask for more than what is still available.
    requested = min(sample['size'], remaining)
    print("size:",requested, "population: ", remaining )
    sample_df = stratified_sample(data, strata_columns, size=requested, seed=123, keep_index= True,sampling_code=sample['code'])  
    print("sample size:", len(sample_df[sample_df['sampling']==sample['code']]))

#verify if there are data that are not sampled; leftovers go to the Train split
remaining = len(data[data['sampling'] == 0])
if remaining > 0:
    sample_df = stratified_sample(data, strata_columns, size=remaining, seed=123, keep_index= True,sampling_code=sampling["Train"]["code"])  

assert (data['sampling'] != 0).all(), "some rows were not assigned to any split"


size: 9566 population:  13666
sample size: 9564
size: 2051 population:  4102
sample size: 2053
size: 2051 population:  2049
sample size: 2049


In [21]:
# Number of rows whose 'sampling' value is 3 (the code of the "Validation" split).
len(data[data['sampling'] == 3])


2049

In [22]:
# Property-type distribution per province over the whole dataset, as a reference
# to compare against the per-split tables.
get_type_property_per_province_proportion(data)

Unnamed: 0,eastern-cape,free-state,gauteng,kwazulu-natal,limpopo,mpumalanga,north-west,northern-cape,western-cape
House,365.0,340.0,3149.0,709.0,289.0,547.0,545.0,95.0,1132.0
House %,5.089946,4.741319,43.912983,9.887045,4.030121,7.627946,7.600056,1.32478,15.785804
House % over all properties,2.670862,2.487926,23.042587,5.188058,2.114737,4.002634,3.987999,0.695156,8.283331
Apartment / Flat,130.0,84.0,1986.0,905.0,60.0,100.0,352.0,7.0,944.0
Apartment / Flat %,2.845884,1.838879,43.476357,19.811734,1.313485,2.189142,7.705779,0.15324,20.665499
Apartment / Flat % over all properties,0.951266,0.614664,14.532416,6.622274,0.439046,0.731743,2.575735,0.051222,6.907654
Townhouse,48.0,104.0,1014.0,299.0,50.0,130.0,144.0,6.0,131.0
Townhouse %,2.492212,5.399792,52.647975,15.524403,2.596054,6.74974,7.476636,0.311526,6.801661
Townhouse % over all properties,0.351237,0.761013,7.419874,2.187912,0.365872,0.951266,1.05371,0.043905,0.958583


In [23]:
# Same table restricted to rows with sampling code 1 (the "Train" split).
get_type_property_per_province_proportion(data[data['sampling']==1])

Unnamed: 0,eastern-cape,free-state,gauteng,kwazulu-natal,limpopo,mpumalanga,north-west,northern-cape,western-cape
House,255.0,238.0,2204.0,496.0,202.0,383.0,381.0,66.0,792.0
House %,5.082719,4.743871,43.930636,9.886386,4.026311,7.634044,7.59418,1.315527,15.786326
House % over all properties,2.666248,2.488499,23.044751,5.186115,2.112087,4.004601,3.983689,0.690088,8.281054
Apartment / Flat,91.0,59.0,1390.0,633.0,42.0,70.0,246.0,5.0,661.0
Apartment / Flat %,2.846419,1.84548,43.478261,19.799812,1.313732,2.189553,7.694714,0.156397,20.675633
Apartment / Flat % over all properties,0.951485,0.616897,14.533668,6.61857,0.439147,0.731911,2.572146,0.052279,6.911334
Townhouse,34.0,73.0,710.0,209.0,35.0,91.0,101.0,4.0,92.0
Townhouse %,2.520385,5.411416,52.631579,15.492958,2.594514,6.745738,7.487027,0.296516,6.819867
Townhouse % over all properties,0.3555,0.763279,7.423672,2.185278,0.365956,0.951485,1.056043,0.041824,0.961941


In [24]:
# Same table restricted to rows with sampling code 2 (the "Test" split).
get_type_property_per_province_proportion(data[data['sampling']==2])

Unnamed: 0,eastern-cape,free-state,gauteng,kwazulu-natal,limpopo,mpumalanga,north-west,northern-cape,western-cape
House,55.0,51.0,472.0,106.0,44.0,82.0,82.0,14.0,170.0
House %,5.111524,4.739777,43.866171,9.851301,4.089219,7.620818,7.620818,1.301115,15.799257
House % over all properties,2.679006,2.48417,22.990745,5.163176,2.143205,3.994155,3.994155,0.681929,8.280565
Apartment / Flat,20.0,12.0,298.0,136.0,9.0,15.0,53.0,1.0,142.0
Apartment / Flat %,2.915452,1.749271,43.440233,19.825073,1.311953,2.186589,7.725948,0.145773,20.699708
Apartment / Flat % over all properties,0.974184,0.58451,14.515343,6.624452,0.438383,0.730638,2.581588,0.048709,6.916707
Townhouse,7.0,16.0,152.0,45.0,8.0,20.0,22.0,1.0,20.0
Townhouse %,2.405498,5.498282,52.233677,15.463918,2.749141,6.872852,7.560137,0.343643,6.872852
Townhouse % over all properties,0.340964,0.779347,7.403799,2.191914,0.389674,0.974184,1.071603,0.048709,0.974184


In [25]:
# Same table restricted to rows with sampling code 3 (the "Validation" split).
get_type_property_per_province_proportion(data[data['sampling']==3])

Unnamed: 0,eastern-cape,free-state,gauteng,kwazulu-natal,limpopo,mpumalanga,north-west,northern-cape,western-cape
House,55.0,51.0,473.0,107.0,43.0,82.0,82.0,15.0,170.0
House %,5.102041,4.730983,43.877551,9.925788,3.988868,7.606679,7.606679,1.391466,15.769944
House % over all properties,2.684236,2.489019,23.084431,5.22206,2.098585,4.001952,4.001952,0.732064,8.29673
Apartment / Flat,19.0,13.0,298.0,136.0,9.0,15.0,53.0,1.0,141.0
Apartment / Flat %,2.773723,1.89781,43.50365,19.854015,1.313869,2.189781,7.737226,0.145985,20.583942
Apartment / Flat % over all properties,0.927282,0.634456,14.54368,6.637384,0.439239,0.732064,2.586628,0.048804,6.881406
Townhouse,7.0,15.0,152.0,45.0,7.0,19.0,21.0,1.0,19.0
Townhouse %,2.447552,5.244755,53.146853,15.734266,2.447552,6.643357,7.342657,0.34965,6.643357
Townhouse % over all properties,0.34163,0.732064,7.418253,2.196193,0.34163,0.927282,1.02489,0.048804,0.927282


In [26]:
# Write the dataset, including the 'sampling' column, to a new CSV file
# (semicolon-separated, without the dataframe index).
data.to_csv("../../Data/Dataset_v2.csv",sep=";",index=False)