<a href="https://colab.research.google.com/github/SisirBhargav/Zeta-aiml/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

# Load dataset
file_path = "/content/drive/MyDrive/Colab Notebooks/restaurant_recommendation_dataset.csv"
df = pd.read_csv(file_path)

# Drop irrelevant columns
if "user_id" in df.columns:
    df = df.drop(columns=["user_id"])

# Define target variable. X and y are left untouched (raw) below so this cell
# can be re-run without depending on its own previous output.
target_column = "rating"
X = df.drop(columns=[target_column])
y = df[target_column]

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

# Train-test split.
# Done BEFORE any scaler / encoder / feature selector is fitted: every fitted
# step below learns from the training rows only, so the held-out rows cannot
# leak into preprocessing or feature selection and inflate the test metrics.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize ratings: the min/max range is learned from the training targets
# and then applied unchanged to the test targets.
y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train_raw.values.reshape(-1, 1)).flatten()
y_test = y_scaler.transform(y_test_raw.values.reshape(-1, 1)).flatten()

# Drop highly correlated features (> 0.9 absolute correlation).
# Only the strict upper triangle is inspected, so for each correlated pair the
# column that appears LATER in the frame is the one that gets dropped.
thresh = 0.9
corr_matrix = X_train_raw[numerical_cols].corr()
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.abs().where(above_diagonal)
correlated_features = {col for col in upper.columns if (upper[col] > thresh).any()}

# List comprehension (not set arithmetic) keeps the original column order, so
# the feature layout is the same on every run.
numerical_cols = [col for col in numerical_cols if col not in correlated_features]
X_train_reduced = X_train_raw.drop(columns=list(correlated_features), errors='ignore')
X_test_reduced = X_test_raw.drop(columns=list(correlated_features), errors='ignore')

# One-hot encode categorical features and standard-scale numerical ones.
# The scaler lives inside the ColumnTransformer so that a single fitted object
# (`preprocessor`) carries all preprocessing, and an empty column list is
# simply skipped instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ("num", StandardScaler(), numerical_cols),
    ],
    remainder="passthrough",
)
X_train_encoded = preprocessor.fit_transform(X_train_reduced)
X_test_encoded = preprocessor.transform(X_test_reduced)

# Feature Selection using RFE (seeded so the same 10 features are kept each run)
estimator = RandomForestRegressor(random_state=42)
rfe = RFE(estimator, n_features_to_select=10)
X_train = rfe.fit_transform(X_train_encoded, y_train)
X_test = rfe.transform(X_test_encoded)

# Sanity checks: rows and targets must still line up after all transforms.
assert X_train.shape[0] == len(y_train)
assert X_test.shape[0] == len(y_test)
assert X_train.shape[1] == X_test.shape[1]

# Models with Hyperparameter Tuning (Reduced Training Time)
# Both estimators are seeded: without a fixed random_state the fitted trees,
# the selected hyperparameters and the reported metrics change on every run.
random_seed = 42
models = {
    "Decision Tree": GridSearchCV(
        DecisionTreeRegressor(random_state=random_seed),
        param_grid={'max_depth': [5, 10]},
        cv=3,
    ),
    "Random Forest": GridSearchCV(
        RandomForestRegressor(random_state=random_seed),
        param_grid={'n_estimators': [50, 100], 'max_depth': [5, 10]},
        cv=3,
    ),
}

# Train and evaluate models
# Targets are min-max normalised ratings, so MAE / MSE are on that 0-1 scale,
# not in original rating units.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    best_model = model.best_estimator_
    y_pred = best_model.predict(X_test)
    results[name] = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R²": r2_score(y_test, y_pred),
        "Best Params": model.best_params_
    }
    # NOTE: the pickled estimator was trained on the preprocessed, RFE-selected
    # feature matrix; raw rows must go through the same preprocessing and
    # feature selection before being passed to predict().
    joblib.dump(best_model, f"{name.replace(' ', '_')}_model.pkl")

# Print results
print("\nModel Performance:")
for name, metrics in results.items():
    print(f"{name} -> MAE: {metrics['MAE']:.4f}, MSE: {metrics['MSE']:.4f}, R²: {metrics['R²']:.4f}")
    print(f"Best Parameters: {metrics['Best Params']}")


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
# NOTE: this cell rebinds `df` and `numerical_cols`, which the modelling cell
# also defines; here they refer to the freshly loaded, unmodified CSV.
file_path = "/content/drive/MyDrive/Colab Notebooks/restaurant_recommendation_dataset.csv"
df = pd.read_csv(file_path)

# Select only numerical columns
numerical_cols = df.select_dtypes(include=['number']).columns
df_numerical = df[numerical_cols]

# Compute correlation matrix
correlation_matrix = df_numerical.corr()

# Plot the heatmap on an explicit Axes so the figure is self-contained
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
