In [3]:
#IMPROVING THE MODEL PERFORMANCE AND ANALYSIS
#LOAD THE PREPROCESSED DATA
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Raw string: the Windows-style path contains backslashes ("\P", "\c") that are
# not valid escape sequences; a plain literal triggers an invalid-escape warning
# (slated to become an error). The resulting path text is unchanged.
DATA_PATH = r'D:\PardivReddy_Cognifyz\content/preprocessed_data.csv'
data = pd.read_csv(DATA_PATH)

# Target is the aggregate rating; every other column is used as a feature.
# NOTE(review): the feature set includes 'Rating color' and 'Rating text', which
# are presumably derived from the rating itself (possible target leakage), plus
# identifier-like columns such as 'Restaurant ID'. Confirm against the
# preprocessing step before trusting the reported scores.
X = data.drop(columns='Aggregate rating')
y = data['Aggregate rating']

# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data loaded and split. X_train shape:", X_train.shape)

Data loaded and split. X_train shape: (7633, 20)


In [4]:
#TRAIN AND EVALUATE THE MODEL
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Build a 100-tree random forest with a fixed seed and fit it on the training split.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out split and score those predictions.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Emit the three report lines in one call; the output text is unchanged.
print(
    "Model Evaluation:",
    f"Mean Squared Error: {mse}",
    f"R-squared Score: {r2}",
    sep="\n",
)

Model Evaluation:
Mean Squared Error: 0.027794871136720793
R-squared Score: 0.9878626723576668


In [5]:
#FEATURE ANALYSIS
# Pair each feature column with the importance score reported by the fitted
# model, then order the table from most to least important.
feature_names = X.columns
importances = model.feature_importances_
feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the full ranking first, then only the ten highest-ranked rows.
for heading, table in (
    ("\nFeature Importance:", feature_importance),
    ("\nTop 10 Most Important Features:", feature_importance.head(10)),
):
    print(heading)
    print(table)


Feature Importance:
                 Feature  Importance
19                 Votes    0.899632
17          Rating color    0.083550
18           Rating text    0.005334
0          Restaurant ID    0.002279
7              Longitude    0.001365
1        Restaurant Name    0.001287
4                Address    0.001270
9               Cuisines    0.001257
8               Latitude    0.001201
10  Average Cost for two    0.000876
5               Locality    0.000591
6       Locality Verbose    0.000574
13   Has Online delivery    0.000318
3                   City    0.000188
16           Price range    0.000141
12     Has Table booking    0.000070
14     Is delivering now    0.000030
11              Currency    0.000023
2           Country Code    0.000017
15  Switch to order menu    0.000000

Top 10 Most Important Features:
                 Feature  Importance
19                 Votes    0.899632
17          Rating color    0.083550
18           Rating text    0.005334
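0          Restaurant ID    0.002279
7              Longitude    0.001365
1        Restaurant Name    0.001287
4                Address    0.001270
9               Cuisines    0.001257
8               Latitude    0.001201
10  Average Cost for two    0.000876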

In [6]:
#CROSS-VALIDATE THE MODEL
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# Use a dedicated estimator for cross-validation instead of rebinding `model`.
# cross_val_score fits clones of the estimator it receives, so the object passed
# in stays unfitted; reusing the name `model` here would replace the fitted
# model from the training cell with an unfitted one for every later cell.
cv_model = RandomForestRegressor(n_estimators=100, random_state=42)

# 5-fold cross-validation over the full dataset, scored with R-squared.
cv_scores = cross_val_score(cv_model, X, y, cv=5, scoring='r2')
print("\nCross-Validation R-squared Scores:", cv_scores)
print("Average CV R-squared:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())


Cross-Validation R-squared Scores: [0.9852055  0.98486888 0.98808431 0.98711139 0.98888778]
Average CV R-squared: 0.9868315719916888
Standard Deviation: 0.0015730158306778092


In [8]:
import joblib
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# Raw string: "\P" and "\c" are not valid escape sequences, so a plain literal
# triggers an invalid-escape warning. The resulting paths are unchanged.
OUTPUT_DIR = r'D:\PardivReddy_Cognifyz\content'
MODEL_FILENAME = 'Restaurant_rating_improved_model.pkl'
REPORT_FILENAME = 'task2_progress.txt'

# Guard against persisting an unfitted estimator: cross_val_score only fits
# clones, so a `model` that was re-created for cross-validation is still
# unfitted at this point. Fit it on the training split, which is the same data
# the reported mse / r2 were computed against.
try:
    check_is_fitted(model)
except NotFittedError:
    model.fit(X_train, y_train)

joblib.dump(model, f'{OUTPUT_DIR}/{MODEL_FILENAME}')
# Report the name of the file that was actually written.
print(f"Improved model saved as '{MODEL_FILENAME}'.")

with open(f'{OUTPUT_DIR}/{REPORT_FILENAME}', 'w', encoding='utf-8') as f:
    # Terminate the title with a newline so it does not run into the first metric.
    f.write("Task 2 Completion Report\n")
    f.write(f"Mean Squared Error: {mse}\n")
    f.write(f"R-squared Score: {r2}\n")
    f.write("Used RandomForestRegressor and analyzed feature importance.\n")
print(f"Progress saved to '{REPORT_FILENAME}'.")

Improved model saved as 'restaurant_rating_model_improved.pkl'.
Progress saved to 'task2_progress.txt'.
