# ML preprocessing: encoding and feature engineering

In [1]:
import json
import warnings
from collections import Counter

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

### Load data

In [2]:
# Read the dataset CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
path = 'C:/Zcommon/trainee/project_1/dataset_part2_amster.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,id,last_scraped,name,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,...,calculated_host_listings_count_shared_rooms,reviews_per_month,has_reviews,first_review_missing,last_review_missing,days_since_first_review,days_since_last_review,superhost_missing,availability_missing,has_license
0,27886,2025-09-11,"Romantic, stylish B&B houseboat in canal district",2010-03-23,within an hour,100.0,97.0,1,1.0,1.0,...,0,1.87,1,0,0,5072,82,0,0,0
1,28871,2025-09-11,Comfortable double room,2010-05-13,within an hour,100.0,99.0,1,2.0,2.0,...,0,3.99,1,0,0,5577,82,0,0,0
2,29051,2025-09-11,Comfortable single / double room,2010-05-13,within an hour,100.0,99.0,1,2.0,2.0,...,0,4.81,1,0,0,5371,81,0,0,0
3,49552,2025-09-11,Multatuli Luxury Guest Suite in top location,2010-09-06,within a few hours,100.0,93.0,1,1.0,2.0,...,0,3.36,1,0,0,5509,94,0,0,0
4,50263,2025-09-11,Central de Lux 2 bedrooms (4p) apt 125 sqm,2010-09-10,within an hour,100.0,100.0,0,1.0,1.0,...,0,0.97,1,0,0,5525,88,0,0,0


In [3]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5850 entries, 0 to 5849
Data columns (total 62 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            5850 non-null   int64  
 1   last_scraped                                  5850 non-null   object 
 2   name                                          5850 non-null   object 
 3   host_since                                    5850 non-null   object 
 4   host_response_time                            5850 non-null   object 
 5   host_response_rate                            5850 non-null   float64
 6   host_acceptance_rate                          5850 non-null   float64
 7   host_is_superhost                             5850 non-null   int64  
 8   host_listings_count                           5850 non-null   float64
 9   host_total_listings_count                     5850 non-null   f

### One-hot encoding 'room_type', 'property_type' and 'neighbourhood_cleansed', as each has a limited set of categorical values.

In [4]:
# One-hot encode the categorical columns; get_dummies replaces each listed
# column with one indicator column per distinct value.
categorical_cols = ['room_type', 'property_type', 'neighbourhood_cleansed']
df = pd.get_dummies(data=df, columns=categorical_cols)

### Dealing with 'amenities': this column is complex because a single listing can have many amenities, so let's gather the 30 most common amenities and create a binary feature for each.

#### Helper function

In [5]:
def parse_amenities(s):
    """Turn one raw ``amenities`` cell into a list of amenity names.

    Two serialisations are understood:

    * a JSON array, e.g. ``'["Wifi", "Clothing storage: closet, dresser"]'``;
      it is decoded with ``json.loads`` so commas inside a quoted name are
      preserved and no stray ``[``/``]``/quote characters leak into names;
    * the brace form, e.g. ``'{TV,"Cable TV",Wifi}'``, which is split on
      commas and stripped of surrounding quotes.

    Parameters
    ----------
    s : str or missing value
        Raw cell content.

    Returns
    -------
    list of str
        Amenity names with surrounding whitespace removed and empty entries
        dropped. Missing or empty input yields ``[]``.
    """
    if pd.isna(s):
        return []
    s = str(s).strip()

    # Preferred path: a well-formed JSON array.
    if s.startswith("["):
        try:
            decoded = json.loads(s)
        except ValueError:
            # Malformed JSON: fall through to the comma-split fallback below.
            decoded = None
        if isinstance(decoded, list):
            names = (str(item).strip() for item in decoded)
            return [name for name in names if name]

    # Fallback: strip one pair of enclosing braces/brackets, then split.
    if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")):
        s = s[1:-1]
    parts = [p.strip().strip('"').strip("'") for p in s.split(",")]
    return [p for p in parts if p]

# Parse every listing's raw amenities string into a list of names.
amen_lists = df['amenities'].apply(parse_amenities)

# Tally how often each amenity occurs across all listings and keep the K
# most frequent ones.
K = 30
counter = Counter(name for listing in amen_lists for name in listing)
top_k = [name for name, _ in counter.most_common(K)]

# Characters rewritten when building a column name:
# space -> "_", ":" removed, "/" -> "_".
_colname_table = str.maketrans({" ": "_", ":": None, "/": "_"})

# One 0/1 indicator column per top-K amenity.
for amen in top_k:
    col = "amenity__" + amen.translate(_colname_table)
    df[col] = amen_lists.apply(lambda listing: int(amen in listing))

# Total number of amenities per listing; the raw text column is then dropped.
df['amenities_count'] = amen_lists.map(len)
df = df.drop('amenities', axis=1)

### Next, 'bathrooms_text': from this column we only need the bathroom type (shared or private), stored as a new binary feature 'bathroom_private'.

In [6]:
def extract_bathroom_type(text):
    """Return 1 for a private bathroom and 0 for a shared one.

    A value is treated as shared only when it contains the word "shared"
    (case-insensitive). Missing values and all other text count as private.
    """
    is_shared = (not pd.isna(text)) and "shared" in text.lower()
    return 0 if is_shared else 1

# Derive the binary private/shared flag from the free-text column ...
df['bathroom_private'] = df['bathrooms_text'].map(extract_bathroom_type)

# ... and discard the free-text source column afterwards.
df = df.drop('bathrooms_text', axis=1)

### Lastly, 'host_response_time': we use ordinal encoding for this feature.

In [7]:
# Ordinal codes for response speed: faster responses get higher values and
# 'unknown' gets the lowest. Series.map yields NaN for any value that is not
# a key of this dict.
mappings = {
    'within an hour': 4,
    'within a few hours': 3,
    'within a day': 2,
    'a few days or more': 1,
    'unknown': 0,
}

# Append the encoded column, then drop the raw text column it replaces.
df = (
    df
    .assign(encoded_host_response_time=df['host_response_time'].map(mappings))
    .drop(columns=['host_response_time'])
)

### Dealing with 'instant_bookable' and mapping the values.

In [8]:
# Convert the 't'/'f' flag to 1/0.
tf_to_int = {'t': 1, 'f': 0}
df['instant_bookable'] = df['instant_bookable'].map(tf_to_int)

### Dealing with host identity verification and profile pic presence.


In [9]:
# Both host flags use the same 't'/'f' coding; convert each to 1/0.
for flag_col in ('host_identity_verified', 'host_has_profile_pic'):
    df[flag_col] = df[flag_col].map({'t': 1, 'f': 0})

### Dealing with 'last_scraped' and 'host_since' and converting dates to numeric days values.

In [10]:
# Parse both date columns up front so neither is overwritten before it is used.
scraped_on = pd.to_datetime(df['last_scraped'])
host_joined_on = pd.to_datetime(df['host_since'])

# Measure ages against the newest scrape date found in the data instead of the
# wall clock. With pd.Timestamp.today() every value grew by one for each day
# that passed before the notebook was re-run, so the exported features were
# not reproducible. A fixed reference keeps the same "days before reference"
# meaning (values differ from the old ones only by a constant offset).
reference_date = scraped_on.max()

df['last_scraped'] = (reference_date - scraped_on).dt.days
df['host_since'] = (reference_date - host_joined_on).dt.days

In [11]:
# Re-inspect column count and dtypes after all encoding steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5850 entries, 0 to 5849
Columns: 172 entries, id to encoded_host_response_time
dtypes: bool(83), float64(24), int64(64), object(1)
memory usage: 4.4+ MB


In [12]:
# Write the fully encoded frame to disk, leaving out the row index.
df.to_csv(path_or_buf='mlpreprocessed.csv', index=False)

# END