# pandas: creating and editing columns using functions

In [2]:
import pandas as pd 

In [3]:
# Load the dataset from CSV, then preview the first three raw address
# strings that the street type is derived from in the cells below.
melb_df = pd.read_csv(filepath_or_buffer="data/melb_df_01.csv")
melb_df["Address"].head(n=3)

0       85 Turner St
1    25 Bloomburg St
2       5 Charles St
Name: Address, dtype: object

### .apply(function)

In [4]:
def get_street_type(address):
    """Return the street-type token of an address string.

    The address is split on single spaces and its last token is taken as the
    street type. If that last token is a one-letter compass suffix
    ('N', 'S', 'W', 'E'), the token before it is used instead. A few long
    spellings are then mapped to short forms: "Avenue"/"Ave" -> "Av",
    "Boulevard" -> "Bvd", "Parade" -> "Pde". Any other token is returned
    unchanged.

    Parameters
    ----------
    address : str
        Address text such as "85 Turner St".

    Returns
    -------
    str
        The (possibly abbreviated) street-type token.
    """
    compass_suffixes = ('N', 'S', 'W', 'E')
    short_forms = {
        "Avenue": "Av",
        "Ave": "Av",
        "Boulevard": "Bvd",
        "Parade": "Pde",
    }
    tokens = address.split(" ")
    # A trailing compass letter is not the street type; the type precedes it.
    candidate = tokens[-2] if tokens[-1] in compass_suffixes else tokens[-1]
    return short_forms.get(candidate, candidate)

In [5]:
# Run get_street_type on every address to obtain one street type per row,
# then print the first five results as a quick check.
address_column = melb_df["Address"]
street_type = address_column.apply(get_street_type)
print(street_type.head(n=5))


0    St
1    St
2    St
3    La
4    St
Name: Address, dtype: object


In [6]:
# Number of distinct street types extracted, followed by the share of rows
# that falls into each one.
distinct_type_count = street_type.nunique()
type_shares = street_type.value_counts(normalize=True)
print(distinct_type_count)
print(type_shares)

52
Address
St           0.589985
Rd           0.208027
Ct           0.045066
Dr           0.032916
Av           0.026657
Gr           0.022901
Pde          0.016642
Pl           0.012445
Cr           0.011193
Cl           0.007364
La           0.004934
Bvd          0.004860
Tce          0.003461
Wy           0.002946
Cct          0.001841
Hwy          0.001767
Sq           0.000810
Crescent     0.000663
Cir          0.000515
Strand       0.000515
Esplanade    0.000442
Grove        0.000368
Gdns         0.000295
Fairway      0.000295
Mews         0.000295
Grn          0.000295
Crossway     0.000221
Righi        0.000221
Ridge        0.000147
Crofts       0.000147
Esp          0.000147
Victoria     0.000147
Athol        0.000074
Highway      0.000074
Grange       0.000074
Res          0.000074
Nook         0.000074
Glade        0.000074
Qy           0.000074
Cove         0.000074
East         0.000074
Dell         0.000074
Loop         0.000074
Terrace      0.000074
Gra          0.000074

In [7]:
# Count rows per street type and keep the labels of the ten largest counts.
type_counts = street_type.value_counts()
popular_types = type_counts.nlargest(n=10).index
print(popular_types)

Index(['St', 'Rd', 'Ct', 'Dr', 'Av', 'Gr', 'Pde', 'Pl', 'Cr', 'Cl'], dtype='object', name='Address')


In [8]:
def _keep_popular_type(value):
    """Return value unchanged if it is in popular_types, otherwise "other"."""
    return value if value in popular_types else "other"


# Store the collapsed street type as a new column and display it.
melb_df["StreetType"] = street_type.apply(_keep_popular_type)
display(melb_df["StreetType"])


0           St
1           St
2           St
3        other
4           St
         ...  
13575       Cr
13576       Dr
13577       St
13578       St
13579       St
Name: StreetType, Length: 13580, dtype: object

In [9]:
# Check the new column: number of categories and the share of rows in each.
street_type_column = melb_df["StreetType"]
print(street_type_column.nunique())
print(street_type_column.value_counts(normalize=True))

11
StreetType
St       0.589985
Rd       0.208027
Ct       0.045066
Dr       0.032916
other    0.026804
Av       0.026657
Gr       0.022901
Pde      0.016642
Pl       0.012445
Cr       0.011193
Cl       0.007364
Name: proportion, dtype: float64


In [10]:
# Remove the raw address column now that StreetType has been derived from it.
melb_df = melb_df.drop(columns=['Address'])

In [11]:
# Preview the first five rows of the frame after the column changes above.
melb_df.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom,...,Lattitude,Longtitude,Regionname,Propertycount,MeanRoomsArea,AreaRatio,MonthSale,AgeBuilding,WeekdaySale,StreetType
0,Abbotsford,2,h,1480000.0,S,Biggin,2016-12-03,2.5,3067,2,...,-37.7996,144.9984,Northern Metropolitan,4019,25.2,-0.231707,12,46,5,St
1,Abbotsford,2,h,1035000.0,S,Biggin,2016-02-04,2.5,3067,2,...,-37.8079,144.9934,Northern Metropolitan,4019,15.8,-0.32766,2,116,3,St
2,Abbotsford,3,h,1465000.0,SP,Biggin,2017-03-04,2.5,3067,3,...,-37.8093,144.9944,Northern Metropolitan,4019,18.75,0.056338,3,117,5,St
3,Abbotsford,3,h,850000.0,PI,Biggin,2017-03-04,2.5,3067,3,...,-37.7969,144.9969,Northern Metropolitan,4019,15.75,0.145455,3,47,5,other
4,Abbotsford,4,h,1600000.0,VB,Nelson,2016-06-04,2.5,3067,3,...,-37.8072,144.9941,Northern Metropolitan,4019,17.75,0.083969,6,2,5,St


In [12]:
def get_weekend(weekday):
    """Return 1 when the weekday number is 5 or greater, otherwise 0.

    Parameters
    ----------
    weekday : number
        Day-of-week code, applied in this notebook to the ``WeekdaySale``
        column. NOTE(review): presumably 0 = Monday ... 6 = Sunday, which
        would make 5 and 6 the weekend; confirm against how the column
        was built.

    Returns
    -------
    int
        1 for a weekend code, 0 for anything else.
    """
    is_weekend = weekday >= 5
    return 1 if is_weekend else 0
# Flag every sale as weekend (1) or not (0), then show the mean price of the
# weekend sales rounded to zero decimal places.
melb_df["Weekend"] = melb_df["WeekdaySale"].apply(get_weekend)
weekend_prices = melb_df.loc[melb_df["Weekend"] == 1, "Price"]
round(weekend_prices.mean(), 0)

np.float64(1081199.0)

In [13]:
# Labels of the 49 most frequent values in SellerG.
# NOTE(review): 49 is an unexplained magic number; consider a named constant.
popular_sellers = melb_df["SellerG"].value_counts().nlargest(49).index
# Relabel every seller outside that set as "other".
# NOTE(review): SellerG is overwritten in place. Running this cell a second
# time recomputes the top 49 from the already-collapsed column, where "other"
# is itself a value, so the result may differ. Re-run from the load cell.
melb_df["SellerG"] = melb_df["SellerG"].apply(lambda x: x if x in popular_sellers else "other")
# Lowest sale price for the seller 'Nelson' and for the pooled 'other' group.
print(melb_df[melb_df["SellerG"] == 'Nelson']["Price"].min())
print(melb_df[melb_df["SellerG"] == 'other']["Price"].min())

170000.0
131000.0


In [14]:
# Write the updated frame to a new CSV file; index=False leaves the row
# index out of the file.
melb_df.to_csv(path_or_buf="data/melb_df_02.csv", index=False)

### practice

In [15]:
# Integer series; these values match the months printed by the final cell of
# this practice section.
expected_months = [99, 41, 21, 3, 72]
test_series = pd.Series(expected_months)
display(test_series)

0    99
1    41
2    21
3     3
4    72
dtype: int64

In [16]:
# Input for the exercise: free-text work-experience strings (in Russian) that
# state years and/or months.
experience_texts = [
    "Опыт работы 8 лет 3 месяца",
    "Опыт работы 3 года 5 месяцев",
    "Опыт работы 1 год 9 месяцев",
    "Опыт работы 3 месяца",
    "Опыт работы 6 лет",
]
test_series = pd.Series(experience_texts)


In [17]:
def get_experience(arg):
    """Convert a Russian work-experience string to a total number of months.

    The text is scanned for ``<number> <unit>`` pairs. Year units
    ("год", "года", "лет") contribute 12 months per unit and month units
    ("месяц", "месяца", "месяцев") contribute 1 month per unit, e.g.
    "Опыт работы 8 лет 3 месяца" -> 99.

    Compared with indexing fixed token positions, this tolerates repeated or
    surrounding whitespace, counts a number only when it is followed by a
    recognised unit, and returns 0 when the text states no duration instead
    of raising IndexError.

    Parameters
    ----------
    arg : str
        Experience description, e.g. "Опыт работы 3 года 5 месяцев".

    Returns
    -------
    int
        Total experience in months.
    """
    year_units = {"год", "года", "лет"}
    month_units = {"месяц", "месяца", "месяцев"}

    # split() without an argument collapses runs of whitespace, so doubled
    # spaces do not produce empty tokens.
    tokens = arg.split()

    total_months = 0
    # Look at every adjacent (amount, unit) pair rather than fixed positions.
    for amount, unit in zip(tokens, tokens[1:]):
        if not amount.isdecimal():
            continue
        if unit in year_units:
            total_months += int(amount) * 12
        elif unit in month_units:
            total_months += int(amount)
    return total_months

In [18]:
# Convert every experience string to a number of months and show the result.
experience_months = test_series.apply(get_experience)
display(experience_months)

0    99
1    41
2    21
3     3
4    72
dtype: int64