In [13]:
import pandas as pd
import re

In [14]:
def to_snake_case(string):
    """Convert a label to snake_case.

    Every run of whitespace characters is replaced by a single underscore and
    the result is lower-cased, e.g. ``"Type of Property"`` becomes
    ``"type_of_property"``. Leading/trailing whitespace is not stripped: it is
    turned into an underscore like any other whitespace run.

    Parameters
    ----------
    string : str
        The label to convert.

    Returns
    -------
    str
        The converted label.
    """
    # Raw string literal: '\s' inside a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    string = re.sub(r'\s+', '_', string)
    return string.lower()

In [15]:
# Load the semicolon-separated dataset, normalise the column labels with
# to_snake_case and add a `sampling` marker column set to 0 on every row.
data = (
    pd.read_csv("../../Data/Dataset_v1.csv", sep=";")
    .rename(to_snake_case, axis='columns')
    .assign(sampling=0)
)


In [16]:
# Share of rows per property type and per province, relative to the total
# number of rows in `data` (rows with a missing value still count in the total).
tuple(data[column].value_counts() / len(data) for column in ('type_of_property', 'province'))

(House                 0.502214
 Apartment / Flat      0.350076
 Townhouse             0.147629
 Vacant Land / Plot    0.000080
 Name: type_of_property, dtype: float64,
 gauteng          0.453594
 western-cape     0.157772
 kwazulu-natal    0.133140
 north-west       0.079127
 mpumalanga       0.058923
 free-state       0.040248
 eastern-cape     0.039765
 limpopo          0.029220
 northern-cape    0.008211
 Name: province, dtype: float64)

In [17]:
def get_type_property_per_province_proportion(
        data: pd.DataFrame,
        property_types=("House", "Apartment / Flat", "Townhouse")):
    """Cross-tabulate property types against provinces.

    For every property type three rows are produced, one column per province
    (provinces sorted in ascending order):

    - ``"<type>"``: number of rows of that type in the province;
    - ``"<type> %"``: that count as a percentage of all rows of that type;
    - ``"<type> % over all properties"``: that count as a percentage of all
      rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``type_of_property`` and ``province``.
    property_types : sequence of str, optional
        Values of ``type_of_property`` to report, in output order. Defaults to
        the three types reported historically.

    Returns
    -------
    pandas.DataFrame
        Three rows per property type, one column per province. A percentage
        whose denominator is zero (e.g. a property type that does not occur in
        ``data``) is reported as NaN instead of raising ``ZeroDivisionError``.
    """
    def _percent(part, whole):
        # Percentage of `part` in `whole`; undefined (NaN) when `whole` is 0.
        return part * 100 / whole if whole else float("nan")

    total_rows = len(data)
    province_column = data['province']
    type_column = data['type_of_property']
    provinces = province_column.sort_values(ascending=True).unique().tolist()

    labels = []
    rows = []
    for property_type in property_types:
        # The type mask is computed once and reused for every province.
        is_type = type_column == property_type
        type_total = int(is_type.sum())
        counts = [
            int((is_type & (province_column == province)).sum())
            for province in provinces
        ]

        labels.append(property_type)
        rows.append(counts)
        labels.append(property_type + " %")
        rows.append([_percent(count, type_total) for count in counts])
        labels.append(property_type + " % over all properties")
        rows.append([_percent(count, total_rows) for count in counts])

    provinces_data = pd.DataFrame(rows, index=labels, columns=provinces)

    return provinces_data


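La fonction `stratified_sample` ci-dessous provient de https://www.kaggle.com/flaviobossolan/stratified-sampling-python et a été modifiée selon nos besoins : elle ne tire que parmi les lignes non encore échantillonnées (`sampling == 0`) et marque les lignes tirées avec le code `sampling_code` dans la colonne `sampling`.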

In [18]:
def stratified_sample(df, strata, size=None, seed=None, keep_index= True, sampling_code=0):
    '''
    Draw a proportionate stratified sample among the not-yet-sampled rows of a
    pandas dataframe and tag the drawn rows in place.

    Only rows whose ``sampling`` column equals 0 are eligible. Each stratum
    (unique combination of the ``strata`` columns) contributes:
    n1 = (N1/N) * n
    where:
        - n1 is the sample size of stratum 1
        - N1 is the number of eligible rows of stratum 1
        - N is the total number of eligible rows
        - n is the sampling size
    n1 is rounded to the nearest integer, so the total number of drawn rows
    can differ slightly from n. Rows with a missing value in one of the strata
    columns belong to no stratum and are never drawn.

    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled. It must contain a
        ``sampling`` column; it is MODIFIED IN PLACE.
    :strata: list containing columns that will be used in the stratified
        sampling (a single column name is also accepted).
    :size: sampling size. An integer >= 1 is an absolute number of rows; a
        value in [0, 1) is a proportion of the eligible rows. If not informed,
        a sampling size will be calculated using Cochran adjusted sampling
        formula:
        cochran_n = (Z**2 * p * q) / e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error. In this case we use 0.02
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / (1 + ((cochran_n - 1) / N))
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    :seed: sampling seed, forwarded to ``DataFrame.sample(random_state=...)``
        for every stratum.
    :keep_index: kept for backward compatibility; it has no effect because the
        input dataframe itself is returned.
    :sampling_code: value written in the ``sampling`` column of the drawn rows.

    Returns
    -------
    The input dataframe ``df`` (same object), with ``sampling`` set to
    ``sampling_code`` on the drawn rows.

    Raises
    ------
    ValueError if ``size`` is negative.
    KeyError if ``df`` has no ``sampling`` column or lacks a strata column.

    Examples
    --------
    >> df.head()
    	id  sex age city sampling
    0	123 M   20  XYZ  0
    1	456 M   25  XYZ  0
    2	789 M   21  YZX  0
    3	987 F   40  ZXY  0
    4	654 M   45  ZXY  0
    ...
    # This tags with code 1 a sample stratified by sex and city containing
    # 30% of the rows that were not sampled yet
    >> stratified_sample(df=df, strata=['sex', 'city'], size=0.3, sampling_code=1)
    Requirements
    ------------
    - pandas
    '''
    # Constants of the Cochran formula used when `size` is not given.
    # NOTE(review): the 2% margin of error is taken from the Kaggle
    # implementation this function was adapted from -- confirm it suits the study.
    z_value = 1.96
    proportion = 0.5
    margin_of_error = 0.02

    strata = [strata] if isinstance(strata, str) else list(strata)

    # Only the rows of `df` itself (not of a notebook global) that were not
    # sampled yet take part in the draw.
    eligible = df[df['sampling'] == 0]
    population = len(eligible)
    if population == 0:
        # Nothing left to sample; also avoids dividing by zero below.
        return df

    if size is None:
        cochran_n = round((z_value ** 2 * proportion * (1 - proportion)) / margin_of_error ** 2)
        target = round(cochran_n / (1 + (cochran_n - 1) / population))
    elif size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    elif size < 1:
        target = round(population * size)
    else:
        target = size

    selected_index = []
    # groupby yields the strata in sorted key order and keeps the original row
    # order inside each stratum, so a given seed always draws the same rows.
    for _, stratum_rows in eligible.groupby(strata):
        stratum_size = len(stratum_rows)
        n = int(round(target / population * stratum_size))
        # Never ask for more rows than the stratum holds (possible when the
        # requested size exceeds the remaining population).
        n = min(n, stratum_size)
        drawn = stratum_rows.sample(n=n, random_state=seed)
        selected_index.extend(drawn.index.tolist())

    if selected_index:
        # Label-based assignment: assumes the index labels of `df` are unique.
        df.loc[selected_index, 'sampling'] = sampling_code

    return df

In [21]:
# Reload the dataset so that every row starts unsampled (sampling == 0).
data = pd.read_csv("../../Data/Dataset_v1.csv", sep=";").rename(to_snake_case, axis='columns')
data['sampling'] = 0

# Split definition: code written in the `sampling` column and number of rows
# requested, as a percentage of the full dataset (70 / 15 / 15).
sampling = {
    name: {"code": code, "size": round(len(data)*percent /100)}
    for code, (name, percent) in enumerate(
        [("Train", 70), ("Test", 15), ("Validation", 15)], start=1)
}

# Draw the splits one after the other; each pass only draws from the rows still
# marked 0. Per-stratum rounding means a split can differ from the requested
# size by a few rows and a few rows can remain unassigned at the end.
for elt, sample in sampling.items():
    print("size:",sample['size'], "population: ", len(data[data['sampling'] == 0]) )
    sample_df = stratified_sample(
        data,
        ['province','type_of_property'],
        size=sample['size'],
        seed=123,
        keep_index=True,
        sampling_code=sample['code'],
    )
    print("sample size:", len(sample_df[sample_df['sampling']==sample['code']]))


size: 8696 population:  12423
sample size: 8694
size: 1863 population:  3729
sample size: 1857
size: 1863 population:  1872
sample size: 1865


In [22]:
# Number of rows that ended up in no split (still marked 0).
int((data['sampling'] == 0).sum())


7

In [23]:
# Reference table: counts and shares per property type and province computed
# on the whole dataset, to compare against the per-split tables.
get_type_property_per_province_proportion(data)

Unnamed: 0,eastern-cape,free-state,gauteng,kwazulu-natal,limpopo,mpumalanga,north-west,northern-cape,western-cape
House,321.0,315.0,2744.0,566.0,255.0,502.0,490.0,89.0,957.0
House %,5.145055,5.048886,43.981407,9.071967,4.087193,8.046161,7.853823,1.426511,15.338997
House % over all properties,2.583917,2.535619,22.088062,4.556065,2.052644,4.040892,3.944297,0.716413,7.703453
Apartment / Flat,125.0,82.0,1913.0,835.0,59.0,100.0,351.0,7.0,877.0
Apartment / Flat %,2.874224,1.885491,43.987123,19.199816,1.356634,2.299379,8.070821,0.160957,20.165555
Apartment / Flat % over all properties,1.006198,0.660066,15.398857,6.721404,0.474926,0.804959,2.825404,0.056347,7.059486
Townhouse,48.0,103.0,978.0,253.0,49.0,130.0,141.0,6.0,126.0
Townhouse %,2.61723,5.61614,53.326063,13.794984,2.671756,7.088332,7.688113,0.327154,6.870229
Townhouse % over all properties,0.38638,0.829107,7.872495,2.036545,0.39443,1.046446,1.134992,0.048298,1.014248


In [24]:
# Same table restricted to the rows tagged with sampling code 1 (Train).
get_type_property_per_province_proportion(data.loc[data['sampling'] == 1])

Unnamed: 0,eastern-cape,free-state,gauteng,kwazulu-natal,limpopo,mpumalanga,north-west,northern-cape,western-cape
House,225.0,220.0,1921.0,396.0,178.0,351.0,343.0,62.0,670.0
House %,5.153459,5.038937,43.999084,9.070087,4.076958,8.039395,7.856161,1.420064,15.345854
House % over all properties,2.587992,2.530481,22.095698,4.554865,2.047389,4.037267,3.94525,0.713135,7.706464
Apartment / Flat,87.0,57.0,1339.0,584.0,41.0,70.0,246.0,5.0,614.0
Apartment / Flat %,2.859021,1.873151,44.002629,19.191587,1.347355,2.300361,8.084128,0.164312,20.177456
Apartment / Flat % over all properties,1.00069,0.655625,15.401426,6.717276,0.47159,0.805153,2.829538,0.057511,7.062342
Townhouse,34.0,72.0,685.0,177.0,34.0,91.0,99.0,4.0,88.0
Townhouse %,2.647975,5.607477,53.34891,13.785047,2.647975,7.087227,7.71028,0.311526,6.853583
Townhouse % over all properties,0.391074,0.828157,7.878997,2.035887,0.391074,1.046699,1.138716,0.046009,1.012192


In [25]:
# Same table restricted to the rows tagged with sampling code 2 (Test).
get_type_property_per_province_proportion(data.loc[data['sampling'] == 2])

Unnamed: 0,eastern-cape,free-state,gauteng,kwazulu-natal,limpopo,mpumalanga,north-west,northern-cape,western-cape
House,48.0,47.0,411.0,85.0,38.0,75.0,73.0,13.0,143.0
House %,5.144695,5.037513,44.051447,9.110397,4.072883,8.038585,7.824223,1.393355,15.326902
House % over all properties,2.584814,2.530964,22.132472,4.577275,2.046311,4.038772,3.931072,0.700054,7.700592
Apartment / Flat,19.0,12.0,287.0,125.0,9.0,15.0,52.0,1.0,131.0
Apartment / Flat %,2.918587,1.843318,44.086022,19.201229,1.382488,2.304147,7.987711,0.15361,20.122888
Apartment / Flat % over all properties,1.023156,0.646204,15.455035,6.731287,0.484653,0.807754,2.800215,0.05385,7.054389
Townhouse,7.0,15.0,146.0,38.0,7.0,19.0,21.0,1.0,19.0
Townhouse %,2.564103,5.494505,53.479853,13.919414,2.564103,6.959707,7.692308,0.3663,6.959707
Townhouse % over all properties,0.376952,0.807754,7.862143,2.046311,0.376952,1.023156,1.130856,0.05385,1.023156


In [26]:
# Same table restricted to the rows tagged with sampling code 3 (Validation).
get_type_property_per_province_proportion(data.loc[data['sampling'] == 3])

Unnamed: 0,eastern-cape,free-state,gauteng,kwazulu-natal,limpopo,mpumalanga,north-west,northern-cape,western-cape
House,48.0,48.0,410.0,85.0,39.0,76.0,74.0,14.0,143.0
House %,5.122732,5.122732,43.75667,9.071505,4.16222,8.110993,7.897545,1.49413,15.261473
House % over all properties,2.573727,2.573727,21.983914,4.557641,2.091153,4.075067,3.967828,0.75067,7.66756
Apartment / Flat,19.0,13.0,286.0,125.0,9.0,15.0,53.0,1.0,131.0
Apartment / Flat %,2.91411,1.993865,43.865031,19.171779,1.380368,2.300613,8.128834,0.153374,20.092025
Apartment / Flat % over all properties,1.018767,0.697051,15.335121,6.702413,0.482574,0.80429,2.841823,0.053619,7.024129
Townhouse,7.0,16.0,146.0,38.0,8.0,20.0,21.0,1.0,19.0
Townhouse %,2.536232,5.797101,52.898551,13.768116,2.898551,7.246377,7.608696,0.362319,6.884058
Townhouse % over all properties,0.375335,0.857909,7.828418,2.037534,0.428954,1.072386,1.126005,0.053619,1.018767
