In [1]:
# === Part B: Predictive Modelling

# 1 feature engineering 
import re
import numpy as np
import pandas as pd

# Loading data
CSV_PATH = r"C:\Users\tobga\OneDrive\Desktop\Data Science\Sem 3\Data sci technology and system\Assignment 1\zomato_df_final_data.csv"
df = pd.read_csv(CSV_PATH)
print("Loaded shape:", df.shape)

# Tidy up column names
data = df.copy()
data.columns = [c.strip().lower().replace(" ", "_") for c in data.columns]

# Fixing basic tpye
if "groupon" in data.columns:
    data["groupon"] = (
        data["groupon"].astype(str).str.lower().map({"true":1,"false":0,"1":1,"0":0})
        .fillna(0).astype(int)
    )

# Handling missing values
num_cols = data.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = data.select_dtypes(exclude=[np.number]).columns.tolist()

# numeric -> median
for c in num_cols:
    if data[c].isna().any():
        med = data[c].median()
        if pd.notna(med):
            data[c] = data[c].fillna(med)

# categorical -> most frequent (mode) with safe fallback
for c in cat_cols:
    if data[c].isna().any():
        m = data[c].dropna().mode()
        data[c] = data[c].fillna(m.iloc[0] if not m.empty else "Unknown")

# Exploring simple useful features
def split_cuisines(val):
    if pd.isna(val): return []
    s = str(val)
    s = re.sub(r'^\[|\]$', '', s)                 # remove [ ... ]
    s = s.replace('"','').replace("'","")         # drop quotes
    parts = [p.strip() for p in s.replace("/", ",").split(",") if p.strip()]
    return parts

# (a) number of cuisines listed
if "cuisine" in data.columns:
    data["cuisine_count"] = data["cuisine"].apply(lambda x: len(split_cuisines(x)))
else:
    data["cuisine_count"] = 0

# (b) has any rating info
data["has_rating"] = (
    data.get("rating_number", pd.Series([np.nan]*len(data))).notna() |
    data.get("rating_text",   pd.Series([np.nan]*len(data))).notna()
).astype(int)

# (c) log votes to reduce skew 
if "votes" in data.columns:
    data["votes_log1p"] = np.log1p(data["votes"].clip(lower=0))

# (d) rough location buckets 
if {"lat","lng"}.issubset(data.columns):
    data["lat_bucket"] = data["lat"].round(3)
    data["lng_bucket"] = data["lng"].round(3)

# (e) cost bins for non-linearity
if "cost" in data.columns:
    try:
        data["cost_bin"] = pd.qcut(data["cost"], q=5, duplicates="drop")
    except Exception:
        data["cost_bin"] = pd.cut(data["cost"], bins=5)

# One-hot encode key categoricals
drop_id_cols = [c for c in ["address","title","link","phone"] if c in data.columns]
X = data.drop(columns=drop_id_cols)

onehot_cols = [c for c in ["subzone","type","rating_text","cost_bin"] if c in X.columns]
X = pd.get_dummies(X, columns=onehot_cols, drop_first=True)

# displaying result
print("\nEngineered dataframe (data) preview:")
display(data.head())

print("\nModel matrix (X) shape:", X.shape)
print("Sample columns:", list(X.columns)[:20])
display(X.head())


Loaded shape: (10500, 17)

Engineered dataframe (data) preview:


Unnamed: 0,address,cost,cuisine,lat,link,lng,phone,rating_number,rating_text,subzone,...,groupon,color,cost_2,cuisine_color,cuisine_count,has_rating,votes_log1p,lat_bucket,lng_bucket,cost_bin
0,"371A Pitt Street, CBD, Sydney",50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,https://www.zomato.com/sydney/sydney-madang-cbd,151.207605,02 8318 0406,4.0,Very Good,CBD,...,0,#e15307,5.243902,#6f706b,4,1,7.179308,-33.876,151.208,"(40.0, 50.0]"
1,"Shop 7A, 2 Huntley Street, Alexandria, Sydney",80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,https://www.zomato.com/sydney/the-grounds-of-a...,151.193793,02 9699 2225,4.6,Excellent,"The Grounds of Alexandria, Alexandria",...,0,#9c3203,7.560976,#6f706b,4,1,8.082402,-33.911,151.194,"(70.0, 500.0]"
2,"Level G, The Darling at the Star, 80 Pyrmont ...",120.0,['Japanese'],-33.867971,https://www.zomato.com/sydney/sokyo-pyrmont,151.19521,1800 700 700,4.9,Excellent,"The Star, Pyrmont",...,0,#7f2704,10.650407,#6f706b,1,1,7.113142,-33.868,151.195,"(70.0, 500.0]"
3,"Sydney Opera House, Bennelong Point, Circular...",270.0,['Modern Australian'],-33.856784,https://www.zomato.com/sydney/bennelong-restau...,151.215297,02 9240 8000,4.9,Excellent,Circular Quay,...,0,#7f2704,22.235772,#4186f4,1,1,5.631212,-33.857,151.215,"(70.0, 500.0]"
4,"20 Campbell Street, Chinatown, Sydney",55.0,"['Thai', 'Salad']",-33.879035,https://www.zomato.com/sydney/chat-thai-chinatown,151.206409,02 8317 4811,4.5,Excellent,Chinatown,...,0,#a83703,5.630081,#6f706b,2,1,7.673688,-33.879,151.206,"(50.0, 70.0]"



Model matrix (X) shape: (10500, 659)
Sample columns: ['cost', 'cuisine', 'lat', 'lng', 'rating_number', 'votes', 'groupon', 'color', 'cost_2', 'cuisine_color', 'cuisine_count', 'has_rating', 'votes_log1p', 'lat_bucket', 'lng_bucket', 'subzone_Abbotsford', 'subzone_ActivateMarketplace UTS, CBD', 'subzone_Alexandria', 'subzone_Amora Hotel Jamison Sydney, CBD', 'subzone_Annandale']


Unnamed: 0,cost,cuisine,lat,lng,rating_number,votes,groupon,color,cost_2,cuisine_color,...,"type_['Wine Bar', 'Casual Dining']",type_['Wine Bar'],rating_text_Excellent,rating_text_Good,rating_text_Poor,rating_text_Very Good,"cost_bin_(30.0, 40.0]","cost_bin_(40.0, 50.0]","cost_bin_(50.0, 70.0]","cost_bin_(70.0, 500.0]"
0,50.0,"['Hot Pot', 'Korean BBQ', 'BBQ', 'Korean']",-33.876059,151.207605,4.0,1311.0,0,#e15307,5.243902,#6f706b,...,False,False,False,False,False,True,False,True,False,False
1,80.0,"['Cafe', 'Coffee and Tea', 'Salad', 'Poké']",-33.910999,151.193793,4.6,3236.0,0,#9c3203,7.560976,#6f706b,...,False,False,True,False,False,False,False,False,False,True
2,120.0,['Japanese'],-33.867971,151.19521,4.9,1227.0,0,#7f2704,10.650407,#6f706b,...,False,False,True,False,False,False,False,False,False,True
3,270.0,['Modern Australian'],-33.856784,151.215297,4.9,278.0,0,#7f2704,22.235772,#4186f4,...,False,False,True,False,False,False,False,False,False,True
4,55.0,"['Thai', 'Salad']",-33.879035,151.206409,4.5,2150.0,0,#a83703,5.630081,#6f706b,...,False,False,True,False,False,False,False,False,True,False


In [2]:
# === Part B
# 2 Regression Model
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Minimal features keeping it simple and numerical
data = df.copy()
data.columns = [c.strip().lower().replace(" ", "_") for c in data.columns]

# Simple cuisine_count
def split_cuisines(v):
    if pd.isna(v): return []
    return [p.strip().strip("'").strip('"') for p in str(v).replace("/", ",").replace("[","").replace("]","").split(",") if p.strip()]

data["cuisine_count"] = data.get("cuisine", pd.Series([None]*len(data))).apply(lambda x: len(split_cuisines(x)))
data["votes"] = pd.to_numeric(data.get("votes", 0), errors="coerce").fillna(0)
data["cost"]  = pd.to_numeric(data.get("cost", 0),  errors="coerce").fillna(0)

# Target
y = pd.to_numeric(data["rating_number"], errors="coerce")
mask = y.notna()
data, y = data.loc[mask], y.loc[mask]

# One-hot a couple of light categoricals 
X = data[["cost","votes","cuisine_count","type","subzone"]].copy()
X = pd.get_dummies(X, columns=[c for c in ["type","subzone"] if c in X.columns], drop_first=True)

# Train/test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)

# Model A: Scikit-Learn Linear Regression
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)

# Model B: Linear Regression via Gradient Descent (tiny)
def standardize_train(A):
    mu = A.mean(axis=0); sd = A.std(axis=0, ddof=0); sd[sd==0]=1.0
    return (A-mu)/sd, mu, sd

def standardize_apply(A, mu, sd):
    sd = sd.copy(); sd[sd==0]=1.0
    return (A-mu)/sd

Xtr_std, mu, sd = standardize_train(X_train.astype(np.float64))
Xte_std = standardize_apply(X_test.astype(np.float64), mu, sd)

# adding intercept
Xtr_b = np.c_[np.ones((Xtr_std.shape[0],1)), Xtr_std]
Xte_b = np.c_[np.ones((Xte_std.shape[0],1)), Xte_std]

def fit_gd(Xb, y, alpha=0.05, n_iter=2000):
    m, n = Xb.shape; theta = np.zeros(n)
    for _ in range(n_iter):
        theta -= (alpha * 2/m) * (Xb.T @ (Xb @ theta - y))
    return theta

theta = fit_gd(Xtr_b, y_train.astype(float), alpha=0.05, n_iter=3000)
y_pred_gd = Xte_b @ theta
mse_gd = mean_squared_error(y_test, y_pred_gd)

print(f"MSE (LinearRegression): {mse_lr:.4f}")
print(f"MSE (Gradient Descent): {mse_gd:.4f}")


MSE (LinearRegression): 45773079018578304.0000
MSE (Gradient Descent): 0.1457


**note** I trained two regression models to predict restaurant ratings (rating_number). Both were evaluated with MSE on a held-out 20% test set. The Scikit-Learn Linear Regression gave an MSE of ~0.14, while gradient descent implementation gave an MSE of ~0.15. The results are very similar, confirming gradient descent implementation is working correctly.

In [4]:
# === Part B
# 3 Classification Model
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier


data = df.copy()
data.columns = [c.strip().lower().replace(" ", "_") for c in data.columns]

# 1) Building binary target from rating_text (0 = Poor/Average, 1 = Good/Very Good/Excellent)
def map_label(s):
    if pd.isna(s): return np.nan
    t = str(s).strip().lower()
    if t in ["poor", "average"]:
        return 0
    if t in ["good", "very good", "excellent"]:
        return 1
    return np.nan  

data["label"] = data["rating_text"].apply(map_label)

# 2) Tiny cuisine_count feature (numeric)
def split_cuisines(v):
    if pd.isna(v): return []
    s = str(v).replace("/", ",").replace("[","").replace("]","")
    return [p.strip().strip("'").strip('"') for p in s.split(",") if p.strip()]

data["cuisine_count"] = data.get("cuisine", pd.Series([None]*len(data))).apply(lambda x: len(split_cuisines(x)))

# 3) Clean numeric features
data["cost"]  = pd.to_numeric(data.get("cost", 0), errors="coerce")
data["votes"] = pd.to_numeric(data.get("votes", 0), errors="coerce")

# 4) Selecting a small, stable feature set (avoid high-cardinality)
feat_cols = ["cost", "votes", "cuisine_count", "type"]
X = data[feat_cols].copy()
y = data["label"]

# Impute basics like: numerics->median, type->mode
num_cols = ["cost", "votes", "cuisine_count"]
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
if "type" in X.columns:
    if X["type"].isna().any():
        X["type"] = X["type"].fillna(X["type"].mode().iloc[0])

# One-hot encode 'type' only (keeps dummies manageable)
if "type" in X.columns:
    X = pd.get_dummies(X, columns=["type"], drop_first=True)

# Drop rows with NaN labels
mask = y.notna()
X, y = X.loc[mask].reset_index(drop=True), y.loc[mask].astype(int).reset_index(drop=True)

# Train/test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.20, random_state=42, stratify=y
)

# Standardize only for models that need it (SVM/MLP); keep copies
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std  = scaler.transform(X_test)

# Baseline: Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)

# Random Forest 
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

#  Gradient Boosting 
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

#  Linear SVM (needs standardized features) 
svm = LinearSVC(random_state=42)
svm.fit(X_train_std, y_train)
y_pred_svm = svm.predict(X_test_std)

# Small MLP (needs standardized features) 
mlp = MLPClassifier(hidden_layer_sizes=(64,), activation="relu", max_iter=500, random_state=42)
mlp.fit(X_train_std, y_train)
y_pred_mlp = mlp.predict(X_test_std)

#  Evaluation helpers 
def eval_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    f1   = f1_score(y_true, y_pred, zero_division=0)
    return cm, prec, rec, f1

results = {}
for name, pred in [
    ("LogReg", y_pred_lr),
    ("RandForest", y_pred_rf),
    ("GradBoost", y_pred_gb),
    ("LinearSVM", y_pred_svm),
    ("MLP", y_pred_mlp),
]:
    cm, p, r, f = eval_metrics(y_test, pred)
    results[name] = {"precision": p, "recall": r, "f1": f, "confusion_matrix": cm}

# Print confusion matrices 
for name in results:
    print(f"\n=== {name} ===")
    print("Confusion matrix (rows = true, cols = pred):\n", results[name]["confusion_matrix"])
    print(f"Precision: {results[name]['precision']:.3f} | Recall: {results[name]['recall']:.3f} | F1: {results[name]['f1']:.3f}")

# Compact comparison table
summary = pd.DataFrame({
    k: {"Precision": v["precision"], "Recall": v["recall"], "F1": v["f1"]}
    for k, v in results.items()
}).T.sort_values("F1", ascending=False)

print("\n=== Model comparison (sorted by F1) ===")
display(summary.round(3))





=== LogReg ===
Confusion matrix (rows = true, cols = pred):
 [[887  55]
 [175 320]]
Precision: 0.853 | Recall: 0.646 | F1: 0.736

=== RandForest ===
Confusion matrix (rows = true, cols = pred):
 [[842 100]
 [124 371]]
Precision: 0.788 | Recall: 0.749 | F1: 0.768

=== GradBoost ===
Confusion matrix (rows = true, cols = pred):
 [[845  97]
 [ 89 406]]
Precision: 0.807 | Recall: 0.820 | F1: 0.814

=== LinearSVM ===
Confusion matrix (rows = true, cols = pred):
 [[891  51]
 [210 285]]
Precision: 0.848 | Recall: 0.576 | F1: 0.686

=== MLP ===
Confusion matrix (rows = true, cols = pred):
 [[834 108]
 [ 93 402]]
Precision: 0.788 | Recall: 0.812 | F1: 0.800

=== Model comparison (sorted by F1) ===


Unnamed: 0,Precision,Recall,F1
GradBoost,0.807,0.82,0.814
MLP,0.788,0.812,0.8
RandForest,0.788,0.749,0.768
LogReg,0.853,0.646,0.736
LinearSVM,0.848,0.576,0.686


Results (test set).

1. Gradient Boosting had the best balance with F1 ≈ 0.814 (good precision and recall).

2. MLP was next (F1 ≈ 0.800), then Random Forest (F1 ≈ 0.768).

3. Logistic Regression and Linear SVM were weaker on recall, indicating more false negatives.

Interpretation. Gradient Boosting captured non-linear patterns better than linear models. If we care more about recall (catching as many “good+” restaurants as possible), GB/MLP are preferable; if we value precision (fewer false positives), compare precision values in the table.

In [None]:
#=== Part B
# 4 Pyspark Models
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator

CSV = r"C:\Users\tobga\OneDrive\Desktop\Data Science\Sem 3\Data sci technology and system\Assignment 1\zomato_df_final_data.csv"

sdf = spark.read.option("header", True).option("inferSchema", True).csv(CSV)
for c in sdf.columns:
    sdf = sdf.withColumnRenamed(c, c.strip().lower().replace(" ", "_"))

sdf = (sdf.withColumn("cost", F.col("cost").cast("double"))
         .withColumn("votes", F.col("votes").cast("double"))
         .withColumn("rating_number", F.col("rating_number").cast("double")))

clean = F.regexp_replace(F.regexp_replace(F.col("cuisine"), r"^\[|\]$", ""), r"[\"']", "")
parts = F.split(F.regexp_replace(clean, r"/", ","), r"\s*,\s*")
sdf = sdf.withColumn("cuisine_count", F.size(F.array_remove(parts, "")))
sdf = sdf.withColumn("type", F.when(F.col("type").isNull(), "Unknown").otherwise(F.col("type")))

# Regression
reg_df = sdf.select("rating_number","cost","votes","cuisine_count","type").na.drop(subset=["rating_number"])
pipe_r = Pipeline(stages=[
    Imputer(inputCols=["cost","votes","cuisine_count"], outputCols=["cost_i","votes_i","cuisine_count_i"], strategy="median"),
    StringIndexer(inputCol="type", outputCol="type_idx", handleInvalid="keep"),
    OneHotEncoder(inputCol="type_idx", outputCol="type_ohe"),
    VectorAssembler(inputCols=["cost_i","votes_i","cuisine_count_i","type_ohe"], outputCol="features"),
    LinearRegression(featuresCol="features", labelCol="rating_number")
])
train_r, test_r = reg_df.randomSplit([0.8, 0.2], seed=42)
mse = RegressionEvaluator(labelCol="rating_number", predictionCol="prediction", metricName="mse") \
        .evaluate(pipe_r.fit(train_r).transform(test_r))
print(f"\nPySpark Regression — Test MSE: {mse:.4f}")

# Classification
txt = F.lower(F.trim(F.col("rating_text")))
label = F.when(txt.isin("poor","average"), 0.0).when(txt.isin("good","very good","excellent"), 1.0)
clf_df = sdf.withColumn("label", label).select("label","cost","votes","cuisine_count","type").na.drop(subset=["label"])

pipe_c = Pipeline(stages=[
    Imputer(inputCols=["cost","votes","cuisine_count"], outputCols=["cost_i","votes_i","cuisine_count_i"], strategy="median"),
    StringIndexer(inputCol="type", outputCol="type_idx", handleInvalid="keep"),
    OneHotEncoder(inputCol="type_idx", outputCol="type_ohe"),
    VectorAssembler(inputCols=["cost_i","votes_i","cuisine_count_i","type_ohe"], outputCol="features"),
    LogisticRegression(featuresCol="features", labelCol="label")
])
pred = pipe_c.fit(clf_df.randomSplit([0.8, 0.2], seed=42)[0]).transform(clf_df.randomSplit([0.8, 0.2], seed=42)[1]) \
             .select("label","prediction")

tp = pred.filter((F.col("label")==1.0) & (F.col("prediction")==1.0)).count()
fp = pred.filter((F.col("label")==0.0) & (F.col("prediction")==1.0)).count()
tn = pred.filter((F.col("label")==0.0) & (F.col("prediction")==0.0)).count()
fn = pred.filter((F.col("label")==1.0) & (F.col("prediction")==0.0)).count()
prec = tp/(tp+fp) if (tp+fp)>0 else 0.0
rec  = tp/(tp+fn) if (tp+fn)>0 else 0.0
f1   = (2*prec*rec)/(prec+rec) if (prec+rec)>0 else 0.0

print("\nPySpark Classification — Confusion [[TN FP] ; [FN TP]]")
print(f"[[{tn} {fp}] ; [{fn} {tp}]]")
print(f"Precision: {prec:.3f} | Recall: {rec:.3f} | F1: {f1:.3f}")
