In [None]:
import pandas as pd

# Location of the listings export; point this at your own copy of the file.
file_path = 'listings_2.csv'

# Read the whole CSV into a DataFrame.
df = pd.read_csv(file_path)

# Materialise the header as a plain Python list.
column_names = list(df.columns)

# Show every column name, then how many columns there are (one value per line).
print(column_names, len(column_names), sep='\n')

In [None]:
# Columns kept for the rest of the analysis.
important_features = [
    'property_type','description', 'room_type', 'accommodates', 'bedrooms', 'beds', 'bathrooms_text', 'amenities',
    'latitude', 'longitude', 'neighbourhood_cleansed', 'price', 'minimum_nights', 'maximum_nights',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    'number_of_reviews', 'review_scores_rating', 'reviews_per_month',
    'review_scores_cleanliness', 'review_scores_communication', 'review_scores_location', 'review_scores_value',
    'host_response_time', 'host_is_superhost', 'calculated_host_listings_count', 'instant_bookable'
]

# Create a new DataFrame with only the necessary columns
df_selected = df[important_features]

# Display the first few rows of the new DataFrame to verify
print(df_selected.head())

# Print the number of columns retained.
# len() of a DataFrame is its row count, so the column count is read from .shape instead.
print(df_selected.shape[1])

In [None]:
# Keep only the listings that have a value in every selected column.
df_cleaned = df_selected.dropna(axis=0, how='any')

# Preview the rows that survived the filter.
print(df_cleaned.head())

# Report the dimensions of the cleaned frame.
print(f"Rows retained: {df_cleaned.shape[0]}, Columns retained: {df_cleaned.shape[1]}")

In [None]:
import nltk

# Download required NLTK resources
# NOTE(review): the text preprocessing visible in this notebook goes through spaCy, not NLTK —
# confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('punkt')  # This includes the necessary tokenizer for word_tokenize
nltk.download('wordnet')

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import spacy

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Load the data
file_path = 'listings_2.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Specify relevant columns for processing
textual_columns = ['description', 'neighborhood_overview', 'neighbourhood', 'property_type', 'room_type', 'bathrooms_text', 'amenities']
numerical_columns = ['accommodates', 'bedrooms', 'beds', 'price', 'number_of_reviews', 'review_scores_rating']  # Example numerical columns
categorical_columns = ['host_is_superhost', 'instant_bookable']  # Example categorical columns

# Fill NaNs with empty strings for text columns
df[textual_columns] = df[textual_columns].fillna('')

# Convert 'price' to a real number before it is treated as a numerical feature.
# NOTE(review): presumably the export stores price as currency text such as "$1,234.00" (a later
# cell of this notebook strips the same characters); the replace is a no-op if it is already numeric.
df['price'] = pd.to_numeric(df['price'].replace(r'[\$,]', '', regex=True), errors='coerce')

# Fill NaNs (including values that could not be converted above) with 0 for numerical columns
df[numerical_columns] = df[numerical_columns].fillna(0)

# Preprocess textual columns
def preprocess_text(text):
    """Turn a raw text value into a space-separated string of lemmas.

    Square brackets and double quotes are removed, the text is lower-cased and passed
    through the module-level ``nlp`` pipeline. Tokens flagged as stop words, or not
    flagged as alphabetic, are left out of the result.
    """
    stripped = re.sub(r'[\[\]\"]', '', text)
    kept_lemmas = []
    for token in nlp(stripped.lower()):
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

# Store a lemmatised copy of every free-text column under '<column>_cleaned'.
for col in textual_columns:
    df[f'{col}_cleaned'] = df[col].apply(preprocess_text)

# Join the cleaned columns of each row into one space-separated document.
df['combined_text'] = df[[f'{col}_cleaned' for col in textual_columns]].apply(' '.join, axis=1)

# Fit a TF-IDF vocabulary capped at 5000 terms and vectorise every document with it.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Report (number of listings, number of terms).
print(f'TF-IDF Matrix Shape: {tfidf_matrix.shape}')

In [None]:
# Standardize numerical features
scaler = StandardScaler()
numerical_features = scaler.fit_transform(df[numerical_columns])

# One-Hot Encode categorical features.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4;
# sparse output is the default under both names, so the keyword is simply omitted here.
encoder = OneHotEncoder(handle_unknown='ignore')
categorical_features = encoder.fit_transform(df[categorical_columns])

# Combine all features: TF-IDF matrix, numerical features, and categorical features
combined_features = hstack([tfidf_matrix, numerical_features, categorical_features])

# Calculate cosine similarity on the combined feature matrix (listing-to-listing, n x n).
# NOTE(review): the query helper visible in this notebook does not read similarity_matrix —
# confirm it is needed, since it grows quadratically with the number of listings.
similarity_matrix = cosine_similarity(combined_features)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def preprocess_query(query, nlp):
    """
    Clean a user query with the same steps applied to the listing text.

    Parameters:
        query (str): Free-text query typed by the user.
        nlp: Callable NLP pipeline (a loaded spaCy model in this notebook) that yields tokens
            exposing ``lemma_``, ``is_stop`` and ``is_alpha``.

    Returns:
        str: Space-separated lemmas of the tokens that are alphabetic and not stop words.
    """
    without_markup = re.sub(r'[\[\]\"]', '', query)
    lemmas = []
    for token in nlp(without_markup.lower()):
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

def find_most_suitable_listings(query, tfidf_vectorizer, combined_features, df, top_n=5):
    """
    Finds the listings whose feature vectors are most similar to a free-text user query.

    The query only carries text information: it is vectorised with the fitted TF-IDF
    vectorizer and every remaining feature column is set to zero.

    Parameters:
        query (str): The user's input query.
        tfidf_vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer.
        combined_features (sparse matrix): Feature matrix of the listings, one row per row of
            ``df``. Its leading columns must be the TF-IDF columns; it may be the TF-IDF matrix
            alone or the TF-IDF matrix followed by additional feature columns.
        df (pd.DataFrame): DataFrame containing the listings data.
        top_n (int): Number of top listings to return.

    Returns:
        pd.DataFrame: The ``top_n`` most similar listings, best match first
        (an empty frame when ``top_n`` is not positive).

    Raises:
        ValueError: If ``combined_features`` has fewer columns than the TF-IDF vocabulary.
    """
    # A slice of [-0:] would select every row, so handle non-positive requests explicitly.
    if top_n <= 0:
        return df.iloc[0:0]

    # Preprocess the user query (uses the notebook-level preprocess_text helper)
    query_processed = preprocess_text(query)

    # Vectorize the processed query using the fitted TF-IDF vectorizer
    query_vector = tfidf_vectorizer.transform([query_processed])

    # Pad the query with zeros up to the width of the matrix that was actually passed in,
    # instead of relying on notebook globals for the number of extra feature columns.
    n_extra = combined_features.shape[1] - query_vector.shape[1]
    if n_extra < 0:
        raise ValueError(
            f"combined_features has {combined_features.shape[1]} columns but the TF-IDF "
            f"vocabulary has {query_vector.shape[1]}"
        )
    if n_extra:
        query_combined_features = hstack([query_vector, np.zeros((1, n_extra))])
    else:
        query_combined_features = query_vector

    # Calculate cosine similarity between the query and all listings
    similarity_scores = cosine_similarity(query_combined_features, combined_features).flatten()

    # Get indices of the top_n most similar listings (argsort is ascending, hence the reversal)
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    # Return the most similar listings as a DataFrame (positional lookup)
    return df.iloc[top_indices]


# Example user query
user_query = "Looking for a cozy apartment in a quiet neighborhood with modern amenities, with atleast 4 rating"
# Find and display the most suitable listings.
# Pass the full combined feature matrix (TF-IDF + numerical + categorical), which is what the
# `combined_features` parameter of find_most_suitable_listings documents.
recommended_listings = find_most_suitable_listings(user_query, tfidf_vectorizer, combined_features, df)
print(recommended_listings[['description', 'neighborhood_overview', 'neighbourhood', 'property_type', 'room_type', 'review_scores_rating']])

In [None]:
# Show full cell contents instead of truncating long text
pd.set_option('display.max_colwidth', None)

# Example user query
# NOTE(review): preprocess_text keeps only tokens flagged is_alpha, so "3000" presumably never
# reaches the vectorizer and the price limit is not enforced — confirm.
user_query = "looking for house within 3000 SEK"

# Find and display the most suitable listings.
# Pass the full combined feature matrix (TF-IDF + numerical + categorical), which is what the
# `combined_features` parameter of find_most_suitable_listings documents.
recommended_listings = find_most_suitable_listings(user_query, tfidf_vectorizer, combined_features, df)

# Print the DataFrame with full text
print(recommended_listings[['description']])

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import spacy

# Read the listings export.
file_path = 'listings_2.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Bring up the spaCy English pipeline used for text preprocessing.
nlp = spacy.load('en_core_web_sm')

# Column groups, one list per kind of feature.
textual_columns = [
    'description', 'neighborhood_overview', 'neighbourhood',
    'property_type', 'room_type', 'bathrooms_text', 'amenities',
]
numerical_columns = [
    'accommodates', 'bedrooms', 'beds',
    'price', 'number_of_reviews', 'review_scores_rating',
]  # Example numerical columns
categorical_columns = ['host_is_superhost', 'instant_bookable']  # Example categorical columns

# Missing text becomes an empty string; missing numbers become 0.
df[textual_columns] = df[textual_columns].fillna(value='')
df[numerical_columns] = df[numerical_columns].fillna(value=0)

# Clean and convert numerical columns where needed (e.g., 'price')
def clean_numeric(column):
    """Return a column of the notebook-level ``df`` converted to numbers.

    Parameters:
        column (str): Name of the column in ``df`` to convert.

    Returns:
        pd.Series: The column with '$' and ',' characters stripped from its values and parsed
        with ``pd.to_numeric``; values that cannot be parsed become NaN.
    """
    # Raw string: '\$' inside a plain literal is an invalid escape sequence, which newer
    # Python versions warn about. The pattern itself is unchanged.
    stripped = df[column].replace(r'[\$,]', '', regex=True)
    return pd.to_numeric(stripped, errors='coerce')

# Apply cleaning to 'price' column and any other similar columns
df['price'] = clean_numeric('price')
df[numerical_columns] = df[numerical_columns].fillna(0)  # Handle any NaNs resulting from conversion

# Preprocess textual columns
def preprocess_text(text):
    """Reduce a raw text value to a space-separated string of lemmas.

    Square brackets and double quotes are dropped and the text is lower-cased before it is
    handed to the module-level ``nlp`` pipeline. Only tokens that are flagged alphabetic and
    are not flagged as stop words contribute their lemma.
    """
    doc = nlp(re.sub(r'[\[\]\"]', '', text).lower())
    return ' '.join(tok.lemma_ for tok in doc if tok.is_alpha and not tok.is_stop)

# Apply preprocessing to the specified textual columns
for col in textual_columns:
    df[f'{col}_cleaned'] = df[col].apply(preprocess_text)

# Combine cleaned text columns into a single 'combined_text' column
df['combined_text'] = df[[f'{col}_cleaned' for col in textual_columns]].apply(lambda x: ' '.join(x), axis=1)

# Initialize TF-IDF Vectorizer and fit on combined text
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Standardize numerical features
scaler = StandardScaler()
numerical_features = scaler.fit_transform(df[numerical_columns])

# One-Hot Encode categorical features.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4;
# sparse output is the default under both names, so the keyword is simply omitted here.
encoder = OneHotEncoder(handle_unknown='ignore')
categorical_features = encoder.fit_transform(df[categorical_columns])

# Combine all features: TF-IDF matrix, numerical features, and categorical features
combined_features = hstack([tfidf_matrix, numerical_features, categorical_features])

# Calculate cosine similarity on the combined feature matrix (listing-to-listing, n x n).
# NOTE(review): the query helper visible in this notebook does not read similarity_matrix —
# confirm it is needed, since it grows quadratically with the number of listings.
similarity_matrix = cosine_similarity(combined_features)



In [None]:
import numpy as np
def find_most_suitable_listings(query, tfidf_vectorizer, combined_features, df, top_n=5):
    """
    Finds the listings whose feature vectors are most similar to a free-text user query.

    The query only carries text information: it is vectorised with the fitted TF-IDF
    vectorizer and every remaining feature column is set to zero.

    Parameters:
        query (str): The user's input query.
        tfidf_vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer.
        combined_features (sparse matrix): Feature matrix of the listings, one row per row of
            ``df``. Its leading columns must be the TF-IDF columns; it may be the TF-IDF matrix
            alone or the TF-IDF matrix followed by additional feature columns.
        df (pd.DataFrame): DataFrame containing the listings data.
        top_n (int): Number of top listings to return.

    Returns:
        pd.DataFrame: The ``top_n`` most similar listings, best match first
        (an empty frame when ``top_n`` is not positive).

    Raises:
        ValueError: If ``combined_features`` has fewer columns than the TF-IDF vocabulary.
    """
    # A slice of [-0:] would select every row, so handle non-positive requests explicitly.
    if top_n <= 0:
        return df.iloc[0:0]

    # Preprocess the user query (uses the notebook-level preprocess_text helper)
    query_processed = preprocess_text(query)

    # Vectorize the processed query using the fitted TF-IDF vectorizer
    query_vector = tfidf_vectorizer.transform([query_processed])

    # Pad the query with zeros up to the width of the matrix that was actually passed in,
    # instead of relying on notebook globals for the number of extra feature columns.
    n_extra = combined_features.shape[1] - query_vector.shape[1]
    if n_extra < 0:
        raise ValueError(
            f"combined_features has {combined_features.shape[1]} columns but the TF-IDF "
            f"vocabulary has {query_vector.shape[1]}"
        )
    if n_extra:
        query_combined_features = hstack([query_vector, np.zeros((1, n_extra))])
    else:
        query_combined_features = query_vector

    # Calculate cosine similarity between the query and all listings
    similarity_scores = cosine_similarity(query_combined_features, combined_features).flatten()

    # Get indices of the top_n most similar listings (argsort is ascending, hence the reversal)
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    # Return the most similar listings as a DataFrame (positional lookup)
    return df.iloc[top_indices]

# Sample request used to exercise the recommender.
user_query = "I want to party"

# Rank the listings against the request using the full combined feature matrix.
recommended_listings = find_most_suitable_listings(
    query=user_query,
    tfidf_vectorizer=tfidf_vectorizer,
    combined_features=combined_features,
    df=df,
)

# Show the fields most relevant for judging the matches.
print(recommended_listings[['description', 'amenities', 'review_scores_rating']])

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Read the listings and keep only rows where every text field used below is present.
df = pd.read_csv('listings_2.csv').dropna(
    subset=['description', 'neighborhood_overview', 'neighbourhood',
            'property_type', 'room_type', 'bathrooms_text', 'amenities']
)

# Overwrite 'description' with one space-separated string built from all the text fields.
df['description'] = (
    df['description']
    + ' ' + df['neighborhood_overview']
    + ' ' + df['neighbourhood']
    + ' ' + df['property_type']
    + ' ' + df['room_type']
    + ' ' + df['bathrooms_text']
    + ' ' + df['amenities']
)

# Sentence-embedding model used for both the listings and the queries.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# One embedding per remaining listing, in DataFrame row order.
listings_embeddings = model.encode(list(df['description']))

# Define recommendation function
def get_recommendations(query, embeddings, df, top_n=5):
    """Return the rows of ``df`` whose embeddings are most similar to ``query``.

    The query is encoded with the notebook-level ``model`` and compared to every row of
    ``embeddings`` by cosine similarity.

    Parameters:
        query (str): The user's input query.
        embeddings: One embedding per row of ``df``, in the same row order.
        df (pd.DataFrame): Listings the embeddings were computed from.
        top_n (int): Number of listings to return.

    Returns:
        pd.DataFrame: The ``top_n`` most similar rows, best match first
        (an empty frame when ``top_n`` is not positive).

    Raises:
        ValueError: If ``embeddings`` and ``df`` do not have the same number of rows.
    """
    # Rows are looked up by position, so a length mismatch would return the wrong listings.
    if len(embeddings) != len(df):
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but df has {len(df)}"
        )
    # A slice of [-0:] would select every row, so handle non-positive requests explicitly.
    if top_n <= 0:
        return df.iloc[0:0]

    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)
    # argsort is ascending: take the last top_n positions and reverse them.
    top_indices = similarities[0].argsort()[-top_n:][::-1]
    return df.iloc[top_indices]

# Show full cell contents instead of truncating long text.
pd.set_option('display.max_colwidth', None)

# Sample request.
query = "Looking for a modern apartment with a great view"

# Rank the listings against the request.
recommendations = get_recommendations(query=query, embeddings=listings_embeddings, df=df)

# Print the text fields of each recommended listing.
print(
    recommendations[
        ['description', 'neighborhood_overview', 'neighbourhood', 'property_type', 'room_type']
    ]
)

In [None]:
# Show full cell contents instead of truncating long text.
pd.set_option('display.max_colwidth', None)

# Sample request.
query = "Big house"

# Rank the listings against the request.
recommendations = get_recommendations(query=query, embeddings=listings_embeddings, df=df)

# Print the text fields of each recommended listing.
print(
    recommendations[
        ['description', 'neighborhood_overview', 'neighbourhood', 'property_type', 'room_type']
    ]
)

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Read the listings and keep only rows where every text field used below is present.
df = pd.read_csv('listings_2.csv').dropna(
    subset=['description', 'neighborhood_overview', 'neighbourhood',
            'property_type', 'room_type', 'bathrooms_text', 'amenities']
)

# Overwrite 'description' with one space-separated string built from all the text fields.
df['description'] = (
    df['description']
    + ' ' + df['neighborhood_overview']
    + ' ' + df['neighbourhood']
    + ' ' + df['property_type']
    + ' ' + df['room_type']
    + ' ' + df['bathrooms_text']
    + ' ' + df['amenities']
)

# Sentence-embedding model used for both the listings and the queries.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# One embedding per remaining listing, in DataFrame row order.
listings_embeddings = model.encode(list(df['description']))


  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
# Inspect the column index of the loaded listings frame (bare expression so the notebook renders it).
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [19]:
# Define recommendation function
def get_recommendations(query, embeddings, df, top_n=5):
    """Return the rows of ``df`` whose embeddings are most similar to ``query``.

    The query is encoded with the notebook-level ``model`` and compared to every row of
    ``embeddings`` by cosine similarity.

    Parameters:
        query (str): The user's input query.
        embeddings: One embedding per row of ``df``, in the same row order.
        df (pd.DataFrame): Listings the embeddings were computed from.
        top_n (int): Number of listings to return.

    Returns:
        pd.DataFrame: The ``top_n`` most similar rows, best match first
        (an empty frame when ``top_n`` is not positive).

    Raises:
        ValueError: If ``embeddings`` and ``df`` do not have the same number of rows.
    """
    # Rows are looked up by position, so a length mismatch would return the wrong listings.
    if len(embeddings) != len(df):
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but df has {len(df)}"
        )
    # A slice of [-0:] would select every row, so handle non-positive requests explicitly.
    if top_n <= 0:
        return df.iloc[0:0]

    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings)
    # argsort is ascending: take the last top_n positions and reverse them.
    top_indices = similarities[0].argsort()[-top_n:][::-1]
    return df.iloc[top_indices]

# Show full cell contents instead of truncating long text.
pd.set_option('display.max_colwidth', None)

# Sample request.
query = "House in Stockholm with 2 bedrooms"

# Rank the listings against the request.
recommendations = get_recommendations(query=query, embeddings=listings_embeddings, df=df)

# Print the text fields and bedroom count of each recommended listing.
print(
    recommendations[
        ['description', 'neighborhood_overview', 'neighbourhood', 'property_type', 'room_type', 'bedrooms']
    ]
)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [29]:
# Peek at one picture URL.
# NOTE(review): `[3]` on the Series is presumably a lookup by index label rather than by position;
# if `df` is the frame filtered with dropna in the earlier cell, label 3 exists only if that row
# survived the filter — confirm.
df['picture_url'][3]

'https://a0.muscache.com/pictures/2085606/7a706118_original.jpg'

In [25]:
# NOTE(review): no variable with this name is assigned anywhere in the visible notebook. The name
# only appears as the CSV file written by the merge cell further down, so this cell raises the
# NameError recorded in its output.
processed_listings_with_original_descriptions.columns

NameError: name 'processed_listings_with_original_descriptions' is not defined

In [33]:
import pandas as pd
import numpy as np
import re

# Read the processed listings and the raw export they were derived from.
processed_df = pd.read_csv('processed_listings.csv')
original_df = pd.read_csv('listings_2.csv')

# In the raw descriptions, treat missing values as '' and turn <br>, <br/> and <br /> tags into newlines.
original_df['description'] = (
    original_df['description']
    .fillna('')
    .apply(lambda raw: re.sub(r'<br\s*/?>', '\n', str(raw)))
)

# Attach the raw description to each processed row by 'id'; the incoming column gets the
# '_original' suffix because 'description' already exists on the left side.
processed_df = pd.merge(
    processed_df,
    original_df[['id', 'description']],
    how='left',
    on='id',
    suffixes=('', '_original'),
)

# Put the raw text back into 'description', then discard the helper column.
processed_df['description'] = processed_df['description_original']
processed_df = processed_df.drop('description_original', axis=1)

# Write the result without the index column.
processed_df.to_csv('processed_listings_with_original_descriptions.csv', index=False)

In [21]:
#listings_embeddings = model.encode(df['description'].tolist())
import numpy as np
# Persist the embedding array and the DataFrame so later runs can reuse them.
np.save(file='listings_embeddings.npy', arr=listings_embeddings)
df.to_csv(path_or_buf='processed_listings.csv', index=False)