# Preprocessing the data before applying machine learning operations and model training.

In [77]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import warnings 
warnings.filterwarnings('ignore')

### Load data

In [78]:
# Read the cleansed listings CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; consider making it configurable.
path = 'C:/Zcommon/trainee/project_1/albany_nyc_cleansed_part_2.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,...,reviews_per_month,has_reviews,first_review_missing,last_review_missing,days_since_first_review,days_since_last_review,has_availability_miss,superhost_missing,host_tenure_days,last_scraped_collected_days
0,unknown,100.0,50.0,0,1,5,1,1,THIRD WARD,42.65789,...,0.07,1,0,0,4164,1195,0,0,4704,50
1,within an hour,100.0,100.0,1,5,5,1,1,SIXTH WARD,42.65222,...,2.29,1,0,0,4119,112,0,0,4127,49
2,within an hour,100.0,99.0,0,2,2,1,1,SECOND WARD,42.64615,...,2.94,1,0,0,3853,70,0,0,3909,49
3,within an hour,100.0,100.0,1,5,5,1,1,SIXTH WARD,42.65222,...,2.64,1,0,0,3823,147,0,0,4127,50
4,unknown,100.0,100.0,1,1,1,1,1,SIXTH WARD,42.65559,...,5.58,1,0,0,3396,113,0,0,4158,50


In [79]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 59 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   host_response_time              425 non-null    object 
 1   host_response_rate              425 non-null    float64
 2   host_acceptance_rate            425 non-null    float64
 3   host_is_superhost               425 non-null    int64  
 4   host_listings_count             425 non-null    int64  
 5   host_total_listings_count       425 non-null    int64  
 6   host_has_profile_pic            425 non-null    int64  
 7   host_identity_verified          425 non-null    int64  
 8   neighbourhood_cleansed          425 non-null    object 
 9   latitude                        425 non-null    float64
 10  longitude                       425 non-null    float64
 11  property_type                   425 non-null    object 
 12  room_type                       425 

## One-hot encoding 'room_type', 'property_type' and 'neighbourhood_cleansed', as they have a limited number of categorical values.

In [80]:
# Replace each listed categorical column with one indicator column per category.
categorical_cols = ['room_type', 'property_type', 'neighbourhood_cleansed']
df = pd.get_dummies(df, columns=categorical_cols)


### Dealing with 'amenities': this column is complex to handle because some Airbnb listings have many amenities, so let's gather the top 40 amenities and create a binary feature for each.

#### Helper function

In [81]:
def parse_amenities(s):
    """Turn one raw amenities cell into a list of amenity names.

    Two textual layouts are understood:

    * a JSON-style list such as ``["Wifi", "Kitchen"]`` -- decoded with the
      ``json`` module so that names containing commas stay intact;
    * a brace-wrapped or bare comma-separated string such as
      ``{TV,"Cable TV",Wifi}`` -- split on commas, with surrounding quotes
      removed from each item.

    Parameters
    ----------
    s : object
        Raw cell value. Missing values (as judged by ``pd.isna``) give ``[]``;
        anything else is converted with ``str`` before parsing.

    Returns
    -------
    list of str
        Amenity names with surrounding whitespace removed; empty names are
        dropped.
    """
    import json  # local import keeps this cell self-contained

    if pd.isna(s):
        return []
    s = str(s).strip()

    if s.startswith("[") and s.endswith("]"):
        try:
            parsed = json.loads(s)
        except ValueError:
            # Not valid JSON (e.g. single-quoted items): fall back to the
            # comma split below.
            parsed = None
        if isinstance(parsed, list):
            cleaned = (str(item).strip() for item in parsed)
            return [item for item in cleaned if item]
        # Remove the brackets so they do not stick to the first/last item.
        s = s[1:-1]
    elif s.startswith("{") and s.endswith("}"):
        s = s[1:-1]

    parts = [p.strip().strip('"').strip("'") for p in s.split(",") if p.strip() != '']
    return [p for p in parts if p]

# Convert every raw amenities string into a list of amenity names.
amen_lists = df['amenities'].apply(parse_amenities)

# Tally amenity occurrences over all listings and keep the K most frequent names.
K = 40
counter = Counter(name for names in amen_lists for name in names)
top_k = [name for name, _count in counter.most_common(K)]

# Add one 0/1 indicator column per retained amenity.
# Column names replace spaces and slashes with underscores and drop colons.
column_name_table = str.maketrans({" ": "_", ":": None, "/": "_"})
for amen in top_k:
    col = "amenity__" + amen.translate(column_name_table)
    df[col] = amen_lists.apply(lambda names: int(amen in names))

# Number of parsed amenities per row.
df['amenities_count'] = amen_lists.map(len)

In [82]:
# The raw 'amenities' text has been turned into features, so remove the original column.
df = df.drop(columns='amenities')

## Next, 'bathrooms_text': from this column we only need to extract the bathroom type (shared or private) and create a new binary feature, 'bathroom_private'.

In [83]:
def extract_bathroom_type(text):
    """Return 0 when the bathroom description mentions "shared", otherwise 1.

    The check is case-insensitive. A missing description (per ``pd.isna``)
    is treated as private and therefore also gives 1.
    """
    if not pd.isna(text) and "shared" in text.lower():
        return 0
    return 1

# Derive the 0/1 'bathroom_private' flag from the free-text bathroom description.
df['bathroom_private'] = df['bathrooms_text'].map(extract_bathroom_type)

# The free-text column is no longer needed once the flag exists.
df = df.drop(columns='bathrooms_text')

## Lastly, 'host_response_time': we use ordinal encoding to encode this feature.

In [84]:
# Ordinal scores for host response time: faster responses score higher, 'unknown' scores 0.
mappings = {
    'within an hour': 4,
    'within a few hours': 3,
    'within a day': 2,
    'a few days or more': 1,
    'unknown': 0,
}

# Add the encoded column at the end of the frame and remove the original text column.
df = (
    df
    .assign(encoded_host_response_time=df['host_response_time'].map(mappings))
    .drop(columns='host_response_time')
)

In [85]:
# Preview the first rows after the encoding steps above.
df.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,...,amenity__Toaster,amenity__Dishwasher,amenity__Hot_water_kettle,amenity__Bathtub,amenity__Shower_gel,amenity__Air_conditioning,amenity__Stove,amenities_count,bathroom_private,encoded_host_response_time
0,100.0,50.0,0,1,5,1,1,42.65789,-73.7537,4,...,0,0,0,0,0,1,0,8,1,0
1,100.0,100.0,1,5,5,1,1,42.65222,-73.76724,3,...,1,1,0,0,1,0,1,39,1,4
2,100.0,99.0,0,2,2,1,1,42.64615,-73.75966,2,...,1,0,1,0,1,1,1,40,1,4
3,100.0,100.0,1,5,5,1,1,42.65222,-73.76724,2,...,0,1,0,0,1,0,1,37,1,4
4,100.0,100.0,1,1,1,1,1,42.65559,-73.76506,4,...,0,1,0,0,1,0,1,33,1,0


In [86]:
# Summarise the final frame's size and dtypes before it is saved.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Columns: 131 entries, host_response_rate to encoded_host_response_time
dtypes: bool(35), float64(19), int64(77)
memory usage: 333.4 KB


In [87]:
# Save the preprocessed feature table to CSV; the row index is not written.
output_file = 'model_training_data.csv'
df.to_csv(path_or_buf=output_file, index=False)

## Next, we move on to the model training process in the predictions notebook.

# End