In [3]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
import pymongo
import joblib

# Load the dataset
df_turfs = pd.read_csv("D:\\Academics\\SEM 5\\DAV\\Model\\synthetic_turf_data_200.csv")

# Step 1: Sentiment Analysis
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

df_turfs['description_sentiment'] = df_turfs['description'].apply(get_sentiment)
df_turfs['comments_sentiment'] = df_turfs['comments'].apply(get_sentiment)

# Step 2: TF-IDF Vectorization
tfidf_desc = TfidfVectorizer(stop_words='english')
tfidf_amen = TfidfVectorizer(stop_words='english')
tfidf_comments = TfidfVectorizer(stop_words='english')

description_matrix = tfidf_desc.fit_transform(df_turfs['description']).toarray()
amenities_matrix = tfidf_amen.fit_transform(df_turfs['amenities'].apply(lambda x: x.strip("[]").replace("'", "").replace(" ", ""))).toarray()
comments_matrix = tfidf_comments.fit_transform(df_turfs['comments']).toarray()

# Step 3: Feature Normalization
scaler = MinMaxScaler()
numerical_features = df_turfs[['booking_count', 'likes', 'dislikes', 'price', 'averageRating']].values
df_turfs[['booking_norm', 'likes_norm', 'dislikes_norm', 'price_norm', 'rating_norm']] = scaler.fit_transform(numerical_features)

# Step 4: Combine All Features
combined_features = np.hstack([
    amenities_matrix, 
    description_matrix, 
    comments_matrix, 
    df_turfs[['booking_norm', 'likes_norm', 'dislikes_norm', 'price_norm', 'rating_norm', 'description_sentiment', 'comments_sentiment']].values
])

# Step 5: Define Target Variable and Train-Test Split
df_turfs['composite_score'] = (
    0.3 * df_turfs['averageRating'] +
    0.2 * df_turfs['booking_count'] +
    0.2 * df_turfs['likes'] - 
    0.1 * df_turfs['dislikes'] +
    0.2 * df_turfs['price']
)

X_train, X_test, y_train, y_test = train_test_split(combined_features, df_turfs['composite_score'], test_size=0.2, random_state=42)

# Step 6: Train and Save Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
joblib.dump(model, 'random_forest_model.pkl')
joblib.dump(tfidf_desc, 'tfidf_desc.pkl')
joblib.dump(tfidf_amen, 'tfidf_amen.pkl')
joblib.dump(tfidf_comments, 'tfidf_comments.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Prediction Function
def predict_score(description, amenities, comments, averageRating, booking_count, likes, dislikes, price):
    desc_vect = tfidf_desc.transform([description]).toarray()
    amen_vect = tfidf_amen.transform([amenities]).toarray()
    comments_vect = tfidf_comments.transform([comments]).toarray()
    
    desc_sentiment = get_sentiment(description)
    comments_sentiment = get_sentiment(comments)
    
    scaled_features = scaler.transform([[booking_count, likes, dislikes, price, averageRating]])
    booking_norm, likes_norm, dislikes_norm, price_norm, rating_norm = scaled_features[0]
    
    input_features = np.hstack([
        amen_vect[0], 
        desc_vect[0],
        comments_vect[0],
        [booking_norm, likes_norm, dislikes_norm, price_norm, rating_norm, desc_sentiment, comments_sentiment]
    ]).reshape(1, -1)
    
    return model.predict(input_features)[0]

# MongoDB Integration
client = pymongo.MongoClient("mongodb+srv://Sanjeevi555pn:Sanjeevi@cluster0.vq9y9.mongodb.net/")
db = client["turf"]
turf_collection = db["turves"]

# Predict Scores for All Turfs
predicted_scores = []
turfs = turf_collection.find()

for turf in turfs:
    description = turf.get("description", "")
    amenities = ", ".join(turf.get("amenities", []))
    comments = ""  # Assuming you fetch reviews separately for comments
    averageRating = turf.get("averageRating", 0)
    booking_count = turf.get("booking_count", 0)
    likes = turf.get("likes", 0)
    dislikes = turf.get("dislikes", 0)
    price = turf.get("pricePerHour", 0)
    
    predicted_score = predict_score(description, amenities, comments, averageRating, booking_count, likes, dislikes, price)
    predicted_scores.append({"turf_id": turf["_id"], "predicted_score": predicted_score})

top_turfs = sorted(predicted_scores, key=lambda x: x['predicted_score'], reverse=True)[:5]

# Print Top Turfs
for i, turf in enumerate(top_turfs):
    print(f"Rank {i+1}: Turf ID: {turf['turf_id']}, Predicted Score: {turf['predicted_score']}")


ImportError: cannot import name 'joblib' from 'sklearn.externals' (C:\Users\sanjeevi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\externals\__init__.py)