In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
# Load the dataset.
# The CSV location can be overridden with the RESTAURANT_DATASET_CSV environment
# variable so the notebook is not tied to one machine's directory layout; when
# the variable is unset, the original local path is used unchanged.
DATASET_PATH = os.environ.get(
    "RESTAURANT_DATASET_CSV",
    r"C:\Users\sande\OneDrive\Desktop\data science\Cognifyz ML internship\Dataset .csv",
)
data = pd.read_csv(DATASET_PATH)

In [5]:
# Remove the columns that are excluded from modelling (identifiers, free-text
# location fields, and the rating colour/text labels).
columns_to_drop = [
    'Restaurant ID',
    'Restaurant Name',
    'Address',
    'Locality',
    'Locality Verbose',
    'Rating color',
    'Rating text',
]
data = data.drop(labels=columns_to_drop, axis='columns')

In [7]:
# Handle missing values: replace missing 'Cuisines' entries with the most
# frequent value in that column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data['Cuisines']`: an inplace call on a column
# selection emits a FutureWarning on pandas 2.x and does not update `data`
# when Copy-on-Write is active.
most_common_cuisine = data['Cuisines'].mode()[0]
data['Cuisines'] = data['Cuisines'].fillna(most_common_cuisine)

In [9]:
# Encode categorical variables as integer codes.
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`, so
# the code -> original label mapping stays available for every column
# (e.g. label_encoders['City'].inverse_transform(...)). Refitting one shared
# encoder would discard the mapping of the column encoded first.
# `label_enc` still ends up bound to the encoder fitted on 'Cuisines', as before.
# NOTE(review): the integer codes impose an arbitrary order on nominal values,
# which a linear model treats as numeric — confirm this is intended.
label_encoders = {}
for column in ['City', 'Cuisines']:
    label_enc = LabelEncoder()
    data[column] = label_enc.fit_transform(data[column])
    label_encoders[column] = label_enc

In [11]:
# Expand the remaining categorical columns into indicator (dummy) columns.
# drop_first=True omits the first level of each feature from the output.
categorical_features = [
    'Currency',
    'Has Table booking',
    'Has Online delivery',
    'Is delivering now',
    'Switch to order menu',
]
data = pd.get_dummies(
    data=data,
    columns=categorical_features,
    drop_first=True,
)

In [13]:
# Separate the regression target from the predictors: `y` is the
# 'Aggregate rating' column, `X` is every other column of `data`.
target_column = 'Aggregate rating'
y = data[target_column]
X = data.drop(target_column, axis=1)

In [15]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Fit an ordinary least-squares linear model on the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [19]:
# Generate predictions for the held-out test rows with the fitted model.
y_pred = regressor.predict(X=X_test)

In [23]:
# Score the test-set predictions: R-squared and mean squared error, both
# computed from the true test targets and the model's predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [25]:
# Report both metrics, one per line.
print(
    f"Mean Squared Error: {mse}",
    f"R-squared: {r2}",
    sep="\n",
)

Mean Squared Error: 1.5747957321568016
R-squared: 0.3081193656096952


In [27]:
# Feature importance analysis: pair each predictor column with its fitted
# coefficient and list them from the largest positive coefficient down to the
# most negative one.
# NOTE(review): no feature scaling is applied in the visible cells, so each
# coefficient is expressed in its own feature's units and the magnitudes are
# not directly comparable across features.
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': regressor.coef_,
})
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance:", coefficients, sep="\n")


Feature Importance:
                            Feature   Coefficient
8       Currency_Brazilian Real(R$)  2.166799e+00
9                Currency_Dollar($)  1.062476e+00
11      Currency_Indian Rupees(Rs.)  8.282129e-01
20          Has Online delivery_Yes  7.418221e-01
12  Currency_Indonesian Rupiah(IDR)  5.975659e-01
6                       Price range  5.176688e-01
14              Currency_Pounds(��)  3.812660e-01
18        Currency_Turkish Lira(TL)  2.702895e-01
16                 Currency_Rand(R)  1.707987e-01
19            Has Table booking_Yes  1.476317e-02
0                      Country Code  9.893851e-03
2                         Longitude  7.097821e-03
3                          Latitude  1.579175e-03
7                             Votes  5.811799e-04
5              Average Cost for two  1.767351e-07
4                          Cuisines -1.156033e-04
1                              City -4.009516e-03
15         Currency_Qatari Rial(QR) -6.205527e-02
21            Is delivering n