<a href="https://colab.research.google.com/github/ShehanRajapaksha/Techtriathlon-MLmodels/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Model With Cosine simlarity and TF_IDF verctorization
The model is low in accuracy and can be improved - 0.44

In [36]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Load the visitor preferences and places datasets
visitor_pref_df = pd.read_csv('/content/vistor_preference.csv')
places_df = pd.read_csv('/content/places.csv')

# Step 1: Data Preprocessing
# Clean the 'Preferred Activities' and 'Bucket list destinations'
visitor_pref_df['Preferred Activities'] = visitor_pref_df['Preferred Activities'].apply(lambda x: x.strip("[]").replace("'", "").split(", "))
visitor_pref_df['Bucket list destinations Sri Lanka'] = visitor_pref_df['Bucket list destinations Sri Lanka'].apply(lambda x: x.strip("[]").replace("'", "").split(", "))

# Clean the places dataset and handle missing values
places_df['latest_reviews'] = places_df['latest_reviews'].fillna('')
places_df['user_ratings_total'] = places_df['user_ratings_total'].fillna(0)
places_df['rating'] = places_df['rating'].fillna(places_df['rating'].mean())  # Fill missing ratings with the average rating

# Combine the reviews and address to get a full description of the place for better vectorization
places_df['description'] = places_df['latest_reviews'] + " " + places_df['formatted_address']

# Step 2: Prepare the model inputs
# Combine preferred activities and bucket list as user preferences for matching
visitor_pref_df['combined_preferences'] = visitor_pref_df['Preferred Activities'].apply(lambda x: ' '.join(x)) + ' ' + visitor_pref_df['Bucket list destinations Sri Lanka'].apply(lambda x: ' '.join(x))

# Step 3: Train-Test Split
train_data, test_data = train_test_split(visitor_pref_df, test_size=0.2, random_state=42)

# Step 4: TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')

# Fit TF-IDF on training data (user preferences)
train_text = pd.concat([train_data['combined_preferences'], places_df['description']], axis=0)
train_tfidf_matrix = tfidf.fit_transform(train_text)

# Split the vectorized matrix back into user preferences (train) and places
train_user_tfidf_matrix = train_tfidf_matrix[:len(train_data)]
places_tfidf_matrix = train_tfidf_matrix[len(train_data):]

# Step 5: Cosine Similarity Calculation on Training Set
train_cosine_similarities = cosine_similarity(train_user_tfidf_matrix, places_tfidf_matrix)

# Step 6: Generate Training Recommendations
train_recommendations = []
for i, user_similarities in enumerate(train_cosine_similarities):
    # Multiply similarity score with place rating to prioritize higher-rated destinations
    scores = user_similarities * places_df['rating'].values
    top_indices = scores.argsort()[-5:][::-1]  # Get top 5 places
    recommended_places = places_df.iloc[top_indices][['name', 'formatted_address', 'rating']].values.tolist()

    train_recommendations.append({
        'User': train_data.iloc[i]['Name'],
        'Recommended Places': recommended_places
    })

# Step 7: Test the model with the test data
test_text = pd.concat([test_data['combined_preferences'], places_df['description']], axis=0)
test_user_tfidf_matrix = tfidf.transform(test_text[:len(test_data)])

# Calculate cosine similarities between test user preferences and places
test_cosine_similarities = cosine_similarity(test_user_tfidf_matrix, places_tfidf_matrix)

# Step 8: Generate Test Recommendations
test_recommendations = []
for i, user_similarities in enumerate(test_cosine_similarities):
    scores = user_similarities * places_df['rating'].values
    top_indices = scores.argsort()[-5:][::-1]  # Get top 5 places
    recommended_places = places_df.iloc[top_indices][['name', 'formatted_address', 'rating']].values.tolist()

    test_recommendations.append({
        'User': test_data.iloc[i]['Name'],
        'Recommended Places': recommended_places
    })

# Convert test recommendations to DataFrame for display
test_recommendations_df = pd.DataFrame(test_recommendations)

# Step 9: Define evaluation metrics (Precision@K and Recall@K)

def precision_at_k(user_recommendations, user_actual, k=5):
    """
    Calculate Precision@K.
    user_recommendations: List of places recommended for a user.
    user_actual: List of places that the user has actually visited or bucket list destinations (ground truth).
    k: Number of top recommendations to consider (default is 5).
    """
    if not user_actual:
        return 0

    top_k_recommendations = [rec[0] for rec in user_recommendations[:k]]  # Extract the place names from recommendations
    relevant_count = len(set(top_k_recommendations).intersection(set(user_actual)))

    precision = relevant_count / k
    return precision

def recall_at_k(user_recommendations, user_actual, k=5):
    """
    Calculate Recall@K.
    user_recommendations: List of places recommended for a user.
    user_actual: List of places that the user has actually visited or bucket list destinations (ground truth).
    k: Number of top recommendations to consider (default is 5).
    """
    if not user_actual:
        return 0

    top_k_recommendations = [rec[0] for rec in user_recommendations[:k]]  # Extract the place names from recommendations
    relevant_count = len(set(top_k_recommendations).intersection(set(user_actual)))

    recall = relevant_count / len(user_actual)
    return recall

# Step 10: Evaluate the model on the test data

precision_scores = []
recall_scores = []

for idx, (user_data, test_rec) in enumerate(zip(test_data.iterrows(), test_recommendations)):
    user_actual = user_data[1]['Bucket list destinations Sri Lanka']  # Ground truth
    user_recommendations = test_rec['Recommended Places']  # Model predictions

    precision = precision_at_k(user_recommendations, user_actual, k=5)
    recall = recall_at_k(user_recommendations, user_actual, k=5)

    precision_scores.append(precision)
    recall_scores.append(recall)

# Calculate average precision and recall for the test set
avg_precision = sum(precision_scores) / len(precision_scores)
avg_recall = sum(recall_scores) / len(recall_scores)

# Step 11: Output the Recommendations and Model Accuracy
print("Test Recommendations:")
print(test_recommendations_df.head())

print(f'\nAverage Precision@5: {avg_precision}')
print(f'Average Recall@5: {avg_recall}')


Test Recommendations:
             User  \
0  Darrell Parker   
1   Suzanne Olson   
2    Amy Buchanan   
3   April Griffin   
4  Nicole Alvarez   

                                                                                                                                                                                                                                                           Recommended Places  
0      [[Arugam Bay Beach, Arugam Bay Beach, Sri Lanka, 4.8], [Nuwara Eliya, Nuwara Eliya, Sri Lanka, 4.45943661971831], [Ahangama, Ahangama, Sri Lanka, 4.45943661971831], [Ahungalla, Ahungalla, Sri Lanka, 4.45943661971831], [Kandy, Kandy, Sri Lanka, 4.45943661971831]]  
1                                [[Sinharaja Forest Reserve, Sri Lanka, 4.3], [Horton Plains National Park, Sri Lanka, 4.7], [Nuwara Eliya, Nuwara Eliya, Sri Lanka, 4.45943661971831], [Kumana National Park, Okanda, Sri Lanka, 4.5], [Sigiriya, Sigiriya, Sri Lanka, 4.9]]  
2                                  

##Model trained via SK_learn using Random Forest method
High in accuracy for recommending a single place of recommendation and generates a diverse list of Top-N destinations - 0.799

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Load datasets
visitor_pref_df = pd.read_csv('/content/vistor_preference.csv')
places_df = pd.read_csv('/content/places.csv')

# Step 1: Data Preprocessing
# Clean the 'Preferred Activities' and 'Bucket list destinations'
visitor_pref_df['Preferred Activities'] = visitor_pref_df['Preferred Activities'].apply(lambda x: x.strip("[]").replace("'", "").split(", "))
visitor_pref_df['Bucket list destinations Sri Lanka'] = visitor_pref_df['Bucket list destinations Sri Lanka'].apply(lambda x: x.strip("[]").replace("'", "").split(", "))

# Clean the places dataset and handle missing values
places_df['latest_reviews'] = places_df['latest_reviews'].fillna('')  # Fill missing reviews with empty string
places_df['user_ratings_total'] = places_df['user_ratings_total'].fillna(0)  # Fill missing user ratings with 0
places_df['rating'] = places_df['rating'].fillna(places_df['rating'].mean())  # Fill missing ratings with the average rating

# Convert activities to a single string and use TF-IDF for more compact representation
visitor_pref_df['Preferred Activities Str'] = visitor_pref_df['Preferred Activities'].apply(lambda x: ' '.join(x))
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Limit the features for memory efficiency
activities_tfidf = tfidf_vectorizer.fit_transform(visitor_pref_df['Preferred Activities Str'])

# Explode 'Bucket list destinations Sri Lanka' so that each destination gets its own row
visitor_pref_df_exploded = visitor_pref_df.explode('Bucket list destinations Sri Lanka')

# Create target variable for prediction (exploded bucket list destinations)
target = visitor_pref_df_exploded['Bucket list destinations Sri Lanka']

# Join the ratings data from the places dataset
places_df['name'] = places_df['name'].str.strip("'")
places_ratings = places_df[['name', 'rating', 'user_ratings_total']]
combined_data = visitor_pref_df_exploded.merge(places_ratings, left_on='Bucket list destinations Sri Lanka', right_on='name', how='left')

# Drop rows where 'name' is NaN (because these rows do not have a corresponding place in places_df)
combined_data.dropna(subset=['name'], inplace=True)

# Fill NaN values: user_ratings_total -> 0, rating -> mean
combined_data['rating'].fillna(combined_data['rating'].mean(), inplace=True)  # Fill NaNs in rating with mean
combined_data['user_ratings_total'].fillna(0, inplace=True)  # Fill NaNs in user_ratings_total with 0

# Reset the index of combined_data to ensure alignment
combined_data.reset_index(drop=True, inplace=True)

# Standardize the ratings and rating counts so they are on the same scale as the TF-IDF vectors
scaler = StandardScaler()
combined_data[['rating', 'user_ratings_total']] = scaler.fit_transform(combined_data[['rating', 'user_ratings_total']])

# Retain the user names for later mapping
user_names = visitor_pref_df_exploded['Name']  # Assuming 'Name' column holds the user names

# Debugging step: Check for any remaining NaN values
print("Null values in combined_data after dropping NaNs:")
print(combined_data.isnull().sum())

# Convert sparse TF-IDF matrix to a dense format and ensure there are no NaN values in the dense matrix
activities_tfidf_dense = pd.DataFrame(activities_tfidf.toarray())  # Convert to dense and check for NaNs
activities_tfidf_dense.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinities if they exist
activities_tfidf_dense.fillna(0, inplace=True)  # Fill any NaNs in the dense matrix

# Concatenate activities and place attributes (including scaled ratings and rating count)
X = pd.concat([activities_tfidf_dense, combined_data[['rating', 'user_ratings_total']].reset_index(drop=True)], axis=1)

# Convert all column names to strings to avoid the type mismatch error
X.columns = X.columns.astype(str)

# Check if X has any NaN values before continuing
print("Checking NaN values in X:")
print(X.isnull().sum())

# Assert no NaN values in the final X data
X.fillna(0, inplace=True)
assert X.isnull().sum().sum() == 0, "There are NaN values in X!"

# Step 2: Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, combined_data['Bucket list destinations Sri Lanka'])

# Resample user names to match X_resampled
user_names_resampled = np.repeat(user_names.values, np.ceil(len(X_resampled) / len(user_names)).astype(int))[:len(X_resampled)]

# Dimensionality reduction using Truncated SVD (more memory efficient than PCA)
svd = TruncatedSVD(n_components=100)  # Reduce dimensions to 100
X_reduced = svd.fit_transform(X_resampled)

# Train-Test split after applying SMOTE (80-20)
X_train, X_test, y_train, y_test, train_user_names, test_user_names = train_test_split(X_reduced, y_resampled, user_names_resampled, test_size=0.2, random_state=42)

# Step 3: Train a Random Forest model with class weights and L2 regularization (through `min_samples_split` and `min_samples_leaf`)
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42,
                                  min_samples_split=10,  # L2 regularization
                                  min_samples_leaf=5)    # L2 regularization
rf_model.fit(X_train, y_train)

# Step 4: Predictions and evaluation
y_prob = rf_model.predict_proba(X_test)

# Get the top 5 predictions
top_n_predictions = np.argsort(y_prob, axis=1)[:, -4:]  # 4 additional predictions

# Accuracy measurement
accuracy = accuracy_score(y_test, rf_model.predict(X_test))
classification_rep = classification_report(y_test, rf_model.predict(X_test))

# Output the accuracy and classification report
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

# Map the numerical indices back to place names
unique_destinations = np.array(rf_model.classes_)  # Get the destination names in the same order as model's predict_proba output

# Map the indices back to the corresponding destination names
top_n_destination_names = [[unique_destinations[i] for i in top_n_predictions[j]] for j in range(len(top_n_predictions))]

# Get the primary destination for the test users (assuming the first destination in the bucket list is the primary)
primary_destinations = visitor_pref_df_exploded.groupby('Name').first().loc[test_user_names, 'Bucket list destinations Sri Lanka']

# Get the full actual bucket list destinations for the test users
test_actual_bucket_list = visitor_pref_df_exploded.groupby('Name')['Bucket list destinations Sri Lanka'].apply(list).loc[test_user_names]

# Print the results, giving the primary destination and 4 generated by the model
for i, (user, primary_dest, actual_list, preds) in enumerate(zip(test_user_names, primary_destinations, test_actual_bucket_list, top_n_destination_names[:10])):
    final_predictions = [primary_dest] + [dest for dest in preds if dest != primary_dest][:4]  # Ensure primary is included
    match_in_top_5 = any(dest in final_predictions for dest in actual_list)  # Check if any match

    print(f"User: {user}")
    print(f"Primary destination: {primary_dest}")
    print(f"Final 5 destinations: {final_predictions}")
    print(f"Actual bucket list destinations: {actual_list}")
    print(f"Match in top-5 predictions: {match_in_top_5}")
    print("-" * 80)


Null values in combined_data after dropping NaNs:
User ID                               0
Name                                  0
Email                                 0
Preferred Activities                  0
Bucket list destinations Sri Lanka    0
Preferred Activities Str              0
name                                  0
rating                                0
user_ratings_total                    0
dtype: int64
Checking NaN values in X:
0                     26176
1                     26176
2                     26176
3                     26176
4                     26176
                      ...  
99                    26176
100                   26176
101                   26176
rating                    0
user_ratings_total        0
Length: 104, dtype: int64
Accuracy: 0.7992349294288352
Classification Report:
                                               precision    recall  f1-score   support

                                    Ahangama       0.78      0.28      0.41  

##Final model developed by Sonal using Random Forest method
Highly Accurate - input the activities to be done and generates a list of places

In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
user_data = pd.read_csv('/content/vistor_preference.csv')
places_data = pd.read_csv('/content/dataset.csv')

# Clean and prepare the user data
user_data['Preferred Activities Clean'] = user_data['Preferred Activities']\
    .str.strip('[]')\
    .str.replace("'", "", regex=False)\
    .str.replace(',', ' ', regex=False)

# Example input for preferred activities in the required format
preferred_activities_input = ['camping', 'hiking', 'safaris']

# Function to process preferred activities input
def process_user_preference(preferred_activities):
    # Convert list of activities into a single string
    return ' '.join(preferred_activities)

# Convert the list of activities into a string for vectorization
user_preference = [process_user_preference(preferred_activities_input)]

# Step 1: Vectorize the activities using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
activity_vectors = tfidf_vectorizer.fit_transform(user_data['Preferred Activities Clean'].fillna(''))

# Step 2: Create a user preference vector based on the provided input
user_preference_vector = tfidf_vectorizer.transform(user_preference)

# Step 3: Compute cosine similarity between user preferences and activities in the dataset
similarity_scores = cosine_similarity(user_preference_vector, activity_vectors).flatten()

# Step 4: Add similarity scores to the user data for ranking
user_data['Similarity Score'] = similarity_scores

# Step 5: Sort users based on similarity scores to find the best match for the user's input
top_similar_users = user_data.sort_values(by='Similarity Score', ascending=False).head(15)

# Step 6: Extract the top destinations from the users with the highest similarity scores
top_destinations = pd.Series(top_similar_users['Bucket list destinations Sri Lanka'].str.strip('[]').str.replace("'", "").str.split(', ')).explode().value_counts().head(10)

# Step 7: Format the output as a list of recommended places
recommended_places = top_destinations.index.tolist()
print(f'Original Recommended Places: {recommended_places}')

# Step 8: Prepare the place dataset for machine learning (clean and fill missing values)
places_data['rating'].fillna(0, inplace=True)
places_data['user_ratings_total'].fillna(0, inplace=True)
places_data['review_scale'].fillna(0, inplace=True)

# Create a label based on whether a place is in the recommended list (1 if recommended, 0 otherwise)
places_data['is_recommended'] = places_data['name'].apply(lambda x: 1 if x in recommended_places else 0)

# Step 9: Add the similarity score of the recommended places to the places_data
places_data['similarity_score'] = places_data['name'].apply(lambda x: user_data.loc[user_data['Bucket list destinations Sri Lanka'].str.contains(x, na=False, regex=False), 'Similarity Score'].max())

# Replace NaN similarity scores with 0 for places not found in the user data
places_data['similarity_score'].fillna(0, inplace=True)

# Step 10: Create a feature set including rating, user_ratings_total, review_scale, and similarity_score
X = places_data[['rating', 'user_ratings_total', 'review_scale', 'similarity_score']]
y = places_data['is_recommended']

# Step 11: Initialize and train the Random Forest Classifier using the entire dataset
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Step 12: Predict probabilities for all places
place_predictions = model.predict_proba(X)[:, 1]
places_data['predicted_probability'] = place_predictions

# Step 13: Sort places by the predicted probability
sorted_places = places_data.sort_values(by='predicted_probability', ascending=False)

# Step 14: Display the top 5 recommended places with their probabilities and factors
print("Top 5 Recommended Places with Predicted Probabilities and Factors:")
print(sorted_places[['name', 'rating', 'user_ratings_total', 'review_scale', 'similarity_score', 'predicted_probability']].head(5))

Original Recommended Places: ['Knuckles', 'Vaddha Village Camping', 'Meemure', 'Horton Plains', 'Yala National Park', 'Wilpattu National Park', 'Horton Plains National Park', '"Sri Pada / Adams Peak"', 'Bentota River', 'Belihuloya']
Top 5 Recommended Places with Predicted Probabilities and Factors:
                           name  rating  user_ratings_total  review_scale  \
69                   Belihuloya     0.0                 0.0             6   
68                      Meemure     0.0                 0.0             6   
13       Wilpattu National Park     4.7              2708.0             7   
28                     Knuckles     4.9               337.0             7   
17  Horton Plains National Park     4.7              8564.0             7   

    similarity_score  predicted_probability  
69          0.639885                   0.83  
68          0.642868                   0.80  
13          1.000000                   0.79  
28          1.000000                   0.73  
17     