# **Data preparation**

## (1) Data import and inspection

In [14]:
import pandas as pd
import random 

In [15]:
# Load the raw Airbnb listings from the CSV next to the notebook and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer='airbnb.csv')
df.head(n=5)

Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,...,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,city,weekend
0,194.033698,Private room,False,True,2.0,False,1,0,10.0,93.0,...,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam,False
1,344.245776,Private room,False,True,4.0,False,0,0,8.0,85.0,...,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam,False
2,264.101422,Private room,False,True,2.0,False,0,1,9.0,87.0,...,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,amsterdam,False
3,433.529398,Private room,False,True,4.0,False,0,1,9.0,90.0,...,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam,False
4,485.552926,Private room,False,True,2.0,True,0,0,10.0,98.0,...,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,amsterdam,False


In [16]:
# Print the column names, non-null counts, and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51707 entries, 0 to 51706
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   realSum                     51707 non-null  float64
 1   room_type                   51707 non-null  object 
 2   room_shared                 51707 non-null  bool   
 3   room_private                51707 non-null  bool   
 4   person_capacity             51707 non-null  float64
 5   host_is_superhost           51707 non-null  bool   
 6   multi                       51707 non-null  int64  
 7   biz                         51707 non-null  int64  
 8   cleanliness_rating          51707 non-null  float64
 9   guest_satisfaction_overall  51707 non-null  float64
 10  bedrooms                    51707 non-null  int64  
 11  dist                        51707 non-null  float64
 12  metro_dist                  51707 non-null  float64
 13  attr_index                  517

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,realSum,person_capacity,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
count,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0
mean,279.879591,3.161661,0.291353,0.350204,9.390624,92.628232,1.15876,3.191285,0.68154,294.204105,13.423792,626.856696,22.786177,7.426068,45.671128
std,327.948386,1.298545,0.45439,0.477038,0.954868,8.945531,0.62741,2.393803,0.858023,224.754123,9.807985,497.920226,17.804096,9.799725,5.249263
min,34.779339,2.0,0.0,0.0,2.0,20.0,0.0,0.015045,0.002301,15.152201,0.926301,19.576924,0.592757,-9.22634,37.953
25%,148.752174,2.0,0.0,0.0,9.0,90.0,1.0,1.453142,0.24848,136.797385,6.380926,250.854114,8.75148,-0.0725,41.39951
50%,211.343089,3.0,0.0,0.0,10.0,95.0,1.0,2.613538,0.413269,234.331748,11.468305,522.052783,17.542238,4.873,47.50669
75%,319.694287,4.0,1.0,1.0,10.0,99.0,1.0,4.263077,0.73784,385.756381,17.415082,832.628988,32.964603,13.518825,51.471885
max,18545.450285,6.0,1.0,1.0,10.0,100.0,10.0,25.284557,14.273577,4513.563486,100.0,6696.156772,100.0,23.78602,52.64141


In [18]:
# (number of rows, number of columns) of the frame.
df.shape

(51707, 21)

In [19]:
# Count the rows that are exact copies of an earlier row.
df.duplicated().sum()

0

Columns to drop: `attr_index`, `attr_index_norm`, `rest_index`, `rest_index_norm`  
Data types to change: `multi`, `biz` (from int to bool)

Preparation: deliberately introduce missing values, duplicate rows, and an incorrect data type into the dataset.

## (2) Create missing values

In [20]:
# Blank out `realSum` in 100 randomly chosen rows to simulate missing values.
random.seed(42)  # fixed seed so the same rows are selected on every run
# Sample from the frame's actual length instead of a hard-coded row count:
# the previous range(0, 51706) stopped one short, so the last row could never
# be selected. `df` still has its default RangeIndex here, so row positions
# and index labels coincide.
missing = random.sample(range(len(df)), 100)
df.loc[missing, "realSum"] = None  # None is stored as NaN in the float column
df.isna().sum()

realSum                       100
room_type                       0
room_shared                     0
room_private                    0
person_capacity                 0
host_is_superhost               0
multi                           0
biz                             0
cleanliness_rating              0
guest_satisfaction_overall      0
bedrooms                        0
dist                            0
metro_dist                      0
attr_index                      0
attr_index_norm                 0
rest_index                      0
rest_index_norm                 0
lng                             0
lat                             0
city                            0
weekend                         0
dtype: int64

## (3) Create duplicates

In [21]:
# Append copies of 20 randomly chosen rows to simulate duplicate records.
# NOTE: this cell grows `df`; re-running it without reloading the data
# appends another batch of copies.
random.seed(40)  # fixed seed so the same rows are duplicated on every run
# Sample from the frame's actual length instead of a hard-coded row count:
# the previous range(0, 51706) stopped one short, so the last row could never
# be selected.
duplicates = random.sample(range(len(df)), 20)
df_duplicates = df.loc[duplicates]
# ignore_index=True renumbers the rows so the appended copies get fresh labels.
df = pd.concat([df, df_duplicates], ignore_index=True)
df.duplicated().sum()

20

## (4) Create a false data type

In [22]:
# Wrap every capacity value in literal double quotes (e.g. 2.0 -> '"2.0"') so
# the column holds strings instead of numbers.
df['person_capacity'] = df['person_capacity'].map('"{}"'.format)

In [23]:
# Store the capacity column explicitly with the generic `object` dtype; all
# other columns keep their dtypes.
df = df.astype({'person_capacity': object})

In [24]:
# Re-inspect the non-null counts and dtypes after the modifications above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51727 entries, 0 to 51726
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   realSum                     51627 non-null  float64
 1   room_type                   51727 non-null  object 
 2   room_shared                 51727 non-null  bool   
 3   room_private                51727 non-null  bool   
 4   person_capacity             51727 non-null  object 
 5   host_is_superhost           51727 non-null  bool   
 6   multi                       51727 non-null  int64  
 7   biz                         51727 non-null  int64  
 8   cleanliness_rating          51727 non-null  float64
 9   guest_satisfaction_overall  51727 non-null  float64
 10  bedrooms                    51727 non-null  int64  
 11  dist                        51727 non-null  float64
 12  metro_dist                  51727 non-null  float64
 13  attr_index                  517

## (5) Export modified dataframe

In [25]:
# Write the modified frame next to the notebook so the cell below can read it
# back on a fresh kernel. A relative path keeps the notebook runnable on any
# machine (the earlier absolute Windows path also contained invalid escape
# sequences), and index=False keeps the row index out of the file.
df.to_csv('airbnb_modified.csv', index=False)

In [27]:
# Read the exported file back to check what a consumer of the CSV will see.
new_df = pd.read_csv('airbnb_modified.csv')
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the report.
new_df.info()
new_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51727 entries, 0 to 51726
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   realSum                     51627 non-null  float64
 1   room_type                   51727 non-null  object 
 2   room_shared                 51727 non-null  bool   
 3   room_private                51727 non-null  bool   
 4   person_capacity             51727 non-null  object 
 5   host_is_superhost           51727 non-null  bool   
 6   multi                       51727 non-null  int64  
 7   biz                         51727 non-null  int64  
 8   cleanliness_rating          51727 non-null  float64
 9   guest_satisfaction_overall  51727 non-null  float64
 10  bedrooms                    51727 non-null  int64  
 11  dist                        51727 non-null  float64
 12  metro_dist                  51727 non-null  float64
 13  attr_index                  517

Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,...,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,city,weekend
0,194.033698,Private room,False,True,"""2.0""",False,1,0,10.0,93.0,...,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam,False
1,344.245776,Private room,False,True,"""4.0""",False,0,0,8.0,85.0,...,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam,False
2,264.101422,Private room,False,True,"""2.0""",False,0,1,9.0,87.0,...,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,amsterdam,False
3,433.529398,Private room,False,True,"""4.0""",False,0,1,9.0,90.0,...,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam,False
4,485.552926,Private room,False,True,"""2.0""",True,0,0,10.0,98.0,...,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,amsterdam,False
