In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

excel_file = 'final_tourism_dataset.xlsx'

# Load the first sheet (or specify the sheet name)
df = pd.read_excel(excel_file, sheet_name=0)

# Save as CSV
df.to_csv('Final_tourism.csv', index=False)


In [7]:
# Drop rows with missing target
df.dropna(subset=['Rating'], inplace=True)

# Fill missing values in important categorical fields
df.fillna({
    'VisitMode': 'Unknown',
    'Continent': 'Unknown',
    'Region': 'Unknown',
    'Country': 'Unknown',
    'CityName': 'Unknown',
    'AttractionType': 'Unknown',
    'Rating': df['Rating'].mean()
}, inplace=True)




In [9]:
# Columns to label encode
categorical_cols = ['VisitMode', 'Continent', 'Region', 'Country', 'CityName', 'AttractionType']

# Dictionary to store encoders (useful for Streamlit or deployment)
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


In [13]:
# Feature set based on your columns
features = [
    'VisitYear', 'VisitMonth', 'ContinentId', 'RegionId', 'CountryId', 'CityId',
    'AttractionCityId', 'AttractionTypeId', 'VisitMode', 'Continent', 'Region',
    'Country', 'CityName', 'AttractionType'
]

# Target variable
X = df[features]
y = df['Rating']


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5)
model.fit(X_train, y_train)


In [17]:
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


RMSE: 0.92
R² Score: 0.10




In [19]:
import joblib

joblib.dump(model, 'rating_predictor_model.pkl')
joblib.dump(encoders, 'label_encoders.pkl')


['label_encoders.pkl']

In [32]:
# Encode the target variable VisitMode
visit_mode_encoder = LabelEncoder()
df['VisitModeEncoded'] = visit_mode_encoder.fit_transform(df['VisitMode'])

# === Step 5: Select Features and Target ===
features = [
    'VisitYear', 'VisitMonth',
    'Continent', 'Region', 'Country', 'CityName', 'AttractionType'
]

X = df[features]
y = df['VisitModeEncoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=150, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [33]:
# Decode predicted and actual visit modes
y_pred_labels = visit_mode_encoder.inverse_transform(y_pred)
y_test_labels = visit_mode_encoder.inverse_transform(y_test)

print("Classification Report:")
print(classification_report(y_test_labels, y_pred_labels))

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.18      0.24       130
           1       0.54      0.64      0.58      4290
           2       0.50      0.49      0.50      3079
           3       0.37      0.31      0.34      2171
           4       0.39      0.22      0.28       916

    accuracy                           0.49     10586
   macro avg       0.43      0.37      0.39     10586
weighted avg       0.48      0.49      0.48     10586

Accuracy: 0.49


In [34]:
import joblib

joblib.dump(clf, "visit_mode_classifier.pkl")
joblib.dump(visit_mode_encoder, "visit_mode_encoder.pkl")
joblib.dump(encoders, "feature_encoders.pkl")

['feature_encoders.pkl']

In [90]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# === Load and Clean Data ===
df = pd.read_csv("Final_tourism.csv")
df = df[['UserId', 'AttractionId', 'Rating', 'AttractionTypeId', 'CityId', 'CountryId']].dropna()

df['UserId'] = df['UserId'].astype(str)
df['AttractionId'] = df['AttractionId'].astype(str)

# === Collaborative Filtering ===
def collaborative_filtering(df, user_id, top_n=5):
    user_item_matrix = df.pivot_table(index='UserId', columns='AttractionId', values='Rating').fillna(0)
    if user_id not in user_item_matrix.index:
        return []

    similarity = cosine_similarity(user_item_matrix)
    similarity_df = pd.DataFrame(similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

    similar_users = similarity_df[user_id].drop(user_id).sort_values(ascending=False)
    top_users = similar_users.head(10).index
    weighted_ratings = user_item_matrix.loc[top_users].T.dot(similar_users[top_users])
    sim_sum = similar_users[top_users].sum()
    recommendation_scores = weighted_ratings / (sim_sum + 1e-9)

    seen = user_item_matrix.loc[user_id]
    unseen = recommendation_scores[seen[seen == 0].index]

    return unseen.sort_values(ascending=False).head(top_n).items()

# === Content-Based Filtering ===
def content_based_filtering(df, user_id, top_n=5):
    # Attraction features
    attraction_features = df.drop_duplicates('AttractionId')[
        ['AttractionId', 'AttractionTypeId', 'CityId', 'CountryId']
    ].set_index('AttractionId')

    scaler = StandardScaler()
    attraction_scaled = scaler.fit_transform(attraction_features)
    attraction_similarity = pd.DataFrame(
        cosine_similarity(attraction_scaled),
        index=attraction_features.index,
        columns=attraction_features.index
    )

    user_data = df[df['UserId'] == user_id]
    if user_data.empty:
        return []

    top_rated = user_data.sort_values(by='Rating', ascending=False).iloc[0]['AttractionId']
    sim_scores = attraction_similarity[top_rated].sort_values(ascending=False)
    sim_scores = sim_scores.drop(labels=user_data['AttractionId'].values, errors='ignore')

    return list(sim_scores.head(top_n).items())

# === Unified Test Function ===
def recommend_all(user_id):
    print(f"📌 Recommendations for User {user_id}\n")

    print("🔷 Collaborative Filtering Recommendations:")
    cf_recs = collaborative_filtering(df, user_id)
    for a, s in cf_recs:
        print(f"Attraction {a}, Predicted Rating: {s:.2f}")

    print("\n🔶 Content-Based Filtering Recommendations:")
    cb_recs = content_based_filtering(df, user_id)
    for a, s in cb_recs:
        print(f"Attraction {a}, Similarity Score: {s:.2f}")

if __name__ == "__main__":
    recommend_all("20977")


📌 Recommendations for User 20977

🔷 Collaborative Filtering Recommendations:
Attraction 1133, Predicted Rating: 0.00
Attraction 748, Predicted Rating: 0.00
Attraction 949, Predicted Rating: 0.00
Attraction 947, Predicted Rating: 0.00
Attraction 937, Predicted Rating: 0.00

🔶 Content-Based Filtering Recommendations:
Attraction 949, Similarity Score: 0.76
Attraction 737, Similarity Score: 0.73
Attraction 749, Similarity Score: 0.68
Attraction 748, Similarity Score: 0.67
Attraction 1171, Similarity Score: 0.66
