## Task of Predictive Modeling Level 3, Task 1

In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
dataset = pd.read_csv('Dataset.csv')

# Display the first few rows of the dataset
print(dataset.head())


   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, Ortigas, Mandaluyong City   
3      SM 

In [10]:
# Drop non-numeric columns and the target variable
X = dataset.drop(['Aggregate rating', 'Cuisines'], axis=1)

# Target variable
y = dataset['Aggregate rating']


In [11]:
# Identify missing values
missing_values = X.isnull().sum()

# Display missing values
print("Missing Values:\n", missing_values)

# Handle missing values using SimpleImputer
# For simplicity, let's fill missing values with the mean for numeric columns
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
numeric_transformer = SimpleImputer(strategy='mean')

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Apply the preprocessor to the features
X_preprocessed = preprocessor.fit_transform(X)

# Display the preprocessed features
print("\nDataset after handling missing values:\n", pd.DataFrame(X_preprocessed, columns=X.columns))


Missing Values:
 Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Rating color            0
Rating text             0
Votes                   0
dtype: int64


ValueError: Shape of passed values is (9551, 7), indices imply (9551, 19)

In [12]:
# Choose a single categorical column for one-hot encoding (e.g., 'City')
categorical_column = 'City'

# Create a OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit and transform the categorical column
X_encoded_column = encoder.fit_transform(X[[categorical_column]])

# Display the shape of the encoded column
print(f"\nEncoded {categorical_column} column:", X_encoded_column.shape)



Encoded City column: (9551, 141)




In [13]:
# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Training set - Features:", X_train.shape)
print("Testing set - Features:", X_test.shape)
print("Training set - Target:", y_train.shape)
print("Testing set - Target:", y_test.shape)


Training set - Features: (7640, 7)
Testing set - Features: (1911, 7)
Training set - Target: (7640,)
Testing set - Target: (1911,)


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the testing set
y_pred = model.predict(X_test)

# Evaluate the model's performance using appropriate metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 1.5759038064948894
R-squared: 0.30763253727994244


In [15]:
from sklearn.tree import DecisionTreeRegressor

# Initialize Decision Tree Regressor model
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Predict on the testing set
dt_y_pred = dt_model.predict(X_test)

# Evaluate the model's performance using appropriate metrics
dt_mse = mean_squared_error(y_test, dt_y_pred)
dt_r2 = r2_score(y_test, dt_y_pred)

print("Decision Tree - Mean Squared Error:", dt_mse)
print("Decision Tree - R-squared:", dt_r2)


Decision Tree - Mean Squared Error: 0.15012558869701725
Decision Tree - R-squared: 0.9340428822450173


In [16]:
from sklearn.ensemble import RandomForestRegressor

# Initialize Random Forest Regressor model
rf_model = RandomForestRegressor(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the testing set
rf_y_pred = rf_model.predict(X_test)

# Evaluate the model's performance using appropriate metrics
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

print("Random Forest - Mean Squared Error:", rf_mse)
print("Random Forest - R-squared:", rf_r2)


Random Forest - Mean Squared Error: 0.07602709052851911
Random Forest - R-squared: 0.9665977811905301
