In [37]:
import pandas as pd

# Try using the 'python' engine, which is more robust to errors
train = pd.read_csv('/train.csv', engine='python', on_bad_lines='skip')
test = pd.read_csv('/train.csv', engine='python', on_bad_lines='skip')

train.head()


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,beds,extra_people,minimum_nights,number_of_reviews,instant_bookable,amenities,property_type,room_type,cancellation_policy,price
0,947924,https://www.airbnb.com/rooms/947924,20180820000000.0,2018-08-16,LARGE LOVELY ROOM GREAT FOR GROUPS,,This large lovely room is perfect for any grou...,This large lovely room is perfect for any grou...,none,,...,4.0,$82.00,1,4.0,f,"{Internet,Wifi,Kitchen,""Free parking on premis...",House,Private room,strict_14_with_grace_period,$229.00
1,2747040,https://www.airbnb.com/rooms/2747040,20190720000000.0,2019-07-16,Alugo para a Copa do mundo,"Excelente apto, localizado a 500 metros de dis...",,"Excelente apto, localizado a 500 metros de dis...",none,,...,3.0,$0.00,10,0.0,f,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",Apartment,Entire home/apt,moderate,"$1,502.00"
2,13734316,https://www.airbnb.com/rooms/13734316,20181120000000.0,2018-11-15,Apartamento para temporada,Meu espaço é perto de Vila olímpica dos atleta...,,Meu espaço é perto de Vila olímpica dos atleta...,none,,...,3.0,$0.00,30,0.0,t,"{TV,""Air conditioning"",Pool,Kitchen,""Free park...",Apartment,Entire home/apt,flexible,$569.00
3,30979175,https://www.airbnb.com/rooms/30979175,20190920000000.0,2019-09-24,2-Ensuite flat with stunning sea view!,Enjoy the most beautiful sight of Recreio dos ...,The flat is newly refurbished and has a modern...,Enjoy the most beautiful sight of Recreio dos ...,none,,...,4.0,$75.00,2,20.0,f,"{TV,""Cable TV"",Wifi,""Air conditioning"",Pool,Ki...",Apartment,Entire home/apt,moderate,$499.00
4,3742926,https://www.airbnb.com/rooms/3742926,20181210000000.0,2018-12-14,Condomino,"Rio-Stadtteil JACAREPAGUA-CURICICA, kein Touri...",,"Rio-Stadtteil JACAREPAGUA-CURICICA, kein Touri...",none,,...,3.0,$88.00,7,0.0,f,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",House,Private room,flexible,$132.00


### Tratamento dos dados Train e Test


In [46]:
colunas_com_nulos_numericas = ['minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews_ltm','calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms', 'host_listings_count', 'accommodates','bathrooms','bedrooms', 'beds','minimum_nights']

for coluna in colunas_com_nulos_numericas:
    train[coluna] = pd.to_numeric(train[coluna], errors='coerce')
    train[coluna] = train[coluna].fillna(train[coluna].median())

colunas_com_nulos_categoricas = ['host_is_superhost','instant_bookable','cancellation_policy']

for coluna in colunas_com_nulos_categoricas:
    modo = train[coluna].mode()[0]
    train[coluna] = train[coluna].fillna(modo)

colunas_com_cifrao = ['extra_people','price']

print(train.dtypes)

train['property_type'] = test['property_type'].apply(lambda x: x.replace('Lighthouse', 'House') if isinstance(x, object) else x)

# ------------------  Tratamento dos dados Teste ----------------------

colunas_com_nulos_numericas = ['minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews_ltm','calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms', 'host_listings_count', 'accommodates','bathrooms','bedrooms', 'beds','minimum_nights']

for coluna in colunas_com_nulos_numericas:
    test[coluna] = pd.to_numeric(test[coluna], errors='coerce')
    test[coluna] = test[coluna].fillna(test[coluna].median())

colunas_com_nulos_categoricas = ['host_is_superhost','instant_bookable','cancellation_policy']

for coluna in colunas_com_nulos_categoricas:
    modo = test[coluna].mode()[0]
    test[coluna] = test[coluna].fillna(modo)

colunas_com_cifrao = ['extra_people']

print(test.dtypes)


id                                                int64
listing_url                                      object
scrape_id                                       float64
last_scraped                                     object
name                                             object
summary                                          object
space                                            object
description                                      object
experiences_offered                              object
neighborhood_overview                            object
minimum_minimum_nights                          float64
maximum_minimum_nights                          float64
minimum_maximum_nights                          float64
maximum_maximum_nights                          float64
minimum_nights_avg_ntm                          float64
maximum_nights_avg_ntm                          float64
number_of_reviews_ltm                           float64
calculated_host_listings_count_entire_homes     

##Criação do Modelo

In [48]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numerical_features = [
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'number_of_reviews_ltm',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'host_listings_count',
    'latitude',
    'longitude',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'extra_people',
    'minimum_nights',
    'number_of_reviews'
]

categorical_features = [
    'experiences_offered',
    'host_is_superhost',
    'instant_bookable',
    'property_type',
    'room_type',
    'cancellation_policy'
]

X_train = train[categorical_features + numerical_features]
y_train = train['price']

X_test = test[categorical_features + numerical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X_train, y_train)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model.fit(X_train_split, y_train_split)

y_val_pred = model.predict(X_val_split)

mse = mean_squared_error(y_val_split, y_val_pred)
r2 = r2_score(y_val_split, y_val_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

y_test_pred = pd.DataFrame(model.predict(X_test), columns=['price'])

test['predicted_price'] = y_test_pred

test.to_csv('predicted_prices.csv', index=False)

resultado = test[['id','predicted_price']]

resultado.rename(columns={'id': 'ID'}, inplace=True)

sequence = range(196030 + 1)

resultado['ID'] = sequence[:len(resultado)]

resultado.head()


Mean Squared Error: 38746.095199462376
R^2 Score: 0.1760992080646575


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resultado.rename(columns={'id': 'ID'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resultado['ID'] = sequence[:len(resultado)]


Unnamed: 0,ID,predicted_price
0,0,203.188033
1,1,362.919891
2,2,372.163111
3,3,330.037674
4,4,253.89068
