In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset=pd.read_csv("clean_mall_data.csv")
dataset

Unnamed: 0,Shop_ID,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales,Rent
0,S001,2,2,2,352,6311,2,0,962727,67310.0
1,S002,1,1,4,2961,802,16,0,549722,45221.0
2,S003,4,4,0,1402,1138,9,0,1783693,3704.0
3,S004,1,4,4,1849,555,12,0,608924,68503.0
4,S005,1,0,3,1400,9217,13,1,1834033,85666.0
...,...,...,...,...,...,...,...,...,...,...
195,S196,4,3,4,1430,845,3,1,1352549,44571.0
196,S197,4,3,4,2205,9054,10,0,1381673,65858.0
197,S198,3,1,4,774,2943,10,1,1475402,41831.0
198,S199,1,0,4,2040,8283,9,0,1612833,49625.0


In [3]:
dataset.shape

(200, 10)

In [4]:
dataset.isnull().sum()

Shop_ID             0
Mall_Name           0
City                0
Floor               0
Shop_Size_sqft      0
Footfall_per_day    0
Nearby_Brands       0
Has_Food_Court      0
Monthly_Sales       0
Rent                0
dtype: int64

In [5]:
dataset=dataset.drop("Shop_ID",axis=1)
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)
dataset

Unnamed: 0,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales,Rent
0,2,2,2,352,6311,2,0,962727,67310.0
1,1,1,4,2961,802,16,0,549722,45221.0
2,4,4,0,1402,1138,9,0,1783693,3704.0
3,1,4,4,1849,555,12,0,608924,68503.0
4,1,0,3,1400,9217,13,1,1834033,85666.0
...,...,...,...,...,...,...,...,...,...
195,4,3,4,1430,845,3,1,1352549,44571.0
196,4,3,4,2205,9054,10,0,1381673,65858.0
197,3,1,4,774,2943,10,1,1475402,41831.0
198,1,0,4,2040,8283,9,0,1612833,49625.0


In [6]:
dataset.columns

Index(['Mall_Name', 'City', 'Floor', 'Shop_Size_sqft', 'Footfall_per_day',
       'Nearby_Brands', 'Has_Food_Court', 'Monthly_Sales', 'Rent'],
      dtype='object')

In [7]:
indep_X=dataset.drop("Rent", axis=1)
indep_X

Unnamed: 0,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales
0,2,2,2,352,6311,2,0,962727
1,1,1,4,2961,802,16,0,549722
2,4,4,0,1402,1138,9,0,1783693
3,1,4,4,1849,555,12,0,608924
4,1,0,3,1400,9217,13,1,1834033
...,...,...,...,...,...,...,...,...
195,4,3,4,1430,845,3,1,1352549
196,4,3,4,2205,9054,10,0,1381673
197,3,1,4,774,2943,10,1,1475402
198,1,0,4,2040,8283,9,0,1612833


In [8]:
dep_Y=dataset["Rent"]
dep_Y

0      67310.0
1      45221.0
2       3704.0
3      68503.0
4      85666.0
        ...   
195    44571.0
196    65858.0
197    41831.0
198    49625.0
199    54938.0
Name: Rent, Length: 200, dtype: float64

## Model Creation

In [9]:
def selectkbest(indep_X,dep_Y,n):
    selector = SelectKBest(score_func=f_regression, k=n)
    X_new = selector.fit_transform(indep_X, dep_Y)
    return X_new

def split_scalar(indep_X,dep_Y):
      
    X_train,X_test,Y_train,Y_test=train_test_split(indep_X,dep_Y, test_size=0.30,random_state=0)
    sc=StandardScaler()
    X_train=sc.fit_transform(X_train)
    X_test=sc.transform(X_test)
    return X_train,X_test,Y_train,Y_test

def r2_prediction(regressor,X_test,Y_test):
     y_pred = regressor.predict(X_test)
     from sklearn.metrics import r2_score
     r2=r2_score(Y_test,y_pred)
     return r2
 
def Linear(X_train,Y_train,X_test):       
        
        from sklearn.linear_model import Ridge
        param_dist = {'alpha': uniform(0, 10)}
        search = RandomizedSearchCV(
            Ridge(),
            param_distributions=param_dist,
            n_iter=10,
            scoring='r2',
            cv=3,
            random_state=42
        )
        search.fit(X_train, Y_train)
        best_model = search.best_estimator_
        r2 = r2_prediction(best_model, X_test, Y_test)
        return r2  
    
def svm_linear(X_train,Y_train,X_test):
                
        from sklearn.svm import SVR
        param_dist = {'C': np.logspace(-2, 2, 5),'epsilon': [0.01, 0.1, 0.2],'kernel': ['linear']}
        search = RandomizedSearchCV(SVR(), param_distributions=param_dist, n_iter=10, cv=3,
                                scoring='r2', random_state=42, n_jobs=-1)
        search.fit(X_train, Y_train)
        best_model = search.best_estimator_
        r2=r2_prediction(best_model, X_test, Y_test)
        return  r2  
    
def svm_NL(X_train,Y_train,X_test):
                
        from sklearn.svm import SVR
        param_dist = {'C': np.logspace(-2, 2, 5),'gamma': ['scale', 0.01, 0.1, 1],'epsilon': [0.01, 0.1, 0.2],'kernel': ['rbf']}
        search = RandomizedSearchCV(SVR(), param_distributions=param_dist, n_iter=10, cv=3,
                                scoring='r2', random_state=42, n_jobs=-1)
        search.fit(X_train, Y_train)
        best_model = search.best_estimator_
        r2=r2_prediction(best_model,X_test,Y_test)
        return  r2       

def Decision(X_train,Y_train,X_test):
                
        from sklearn.tree import DecisionTreeRegressor
        param_dist = {'max_depth': [None, 5, 10, 20],'min_samples_split': [2, 5, 10],'min_samples_leaf': [1, 2, 4],
                      'criterion': ['squared_error', 'friedman_mse']}
        search = RandomizedSearchCV(DecisionTreeRegressor(random_state=42),param_distributions=param_dist, n_iter=10, cv=3,
                                    scoring='r2', random_state=42, n_jobs=-1)
        search.fit(X_train, Y_train)
        best_model = search.best_estimator_
        r2=r2_prediction(best_model,X_test,Y_test)
        return  r2       

def random(X_train,Y_train,X_test):       
        
        from sklearn.ensemble import RandomForestRegressor
        param_dist = {'n_estimators': [50, 100, 200],'max_depth': [None, 10, 20],'min_samples_split': [2, 5],'min_samples_leaf': [1, 2],
                       'bootstrap': [True, False]}   
        search = RandomizedSearchCV(RandomForestRegressor(random_state=42),param_distributions=param_dist, n_iter=10, cv=3,
                                    scoring='r2', random_state=42, n_jobs=-1)
        search.fit(X_train, Y_train)
        best_model = search.best_estimator_
        r2=r2_prediction(best_model,X_test,Y_test)
        return  r2 
        
def selectk_regression(acclin,accsvml,accsvmnl,accdes,accrf): 
    
    dataframe=pd.DataFrame(index=['ChiSquare'],columns=['Linear','SVMl','SVMnl','Decision','Random'])

    for number,idex in enumerate(dataframe.index):
        
        dataframe['Linear'][idex]=acclin[number]       
        dataframe['SVMl'][idex]=accsvml[number]
        dataframe['SVMnl'][idex]=accsvmnl[number]
        dataframe['Decision'][idex]=accdes[number]
        dataframe['Random'][idex]=accrf[number]
    return dataframe

In [25]:
kbest=selectkbest(indep_X,dep_Y,9)      
X_train, X_test, Y_train, Y_test=split_scalar(kbest,dep_Y)  

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]

In [26]:
r2_lin=Linear(X_train,Y_train,X_test)
acclin.append(r2_lin)
    
r2_sl=svm_linear(X_train,Y_train,X_test)    
accsvml.append(r2_sl)
    
r2_NL=svm_NL(X_train,Y_train,X_test)
accsvmnl.append(r2_NL)
    
r2_d=Decision(X_train,Y_train,X_test)
accdes.append(r2_d)
    
r2_r=random(X_train,Y_train,X_test)
accrf.append(r2_r)

    
result=selectk_regression(acclin,accsvml,accsvmnl,accdes,accrf)

In [12]:
result1 = result
result1
#4

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.212954,0.097613,-0.005042,-0.006418,0.147429


In [15]:
result2 = result
result2
#5

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.24524,0.114094,-0.004692,-0.184979,0.125163


In [18]:
result3 = result
result3
#7

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
ChiSquare,0.23315,0.11545,-0.009554,-0.166505,0.087239


In [19]:
results = [result1, result2, result3]   # add as many as we have

summary = []

for i, df in enumerate(results, start=5):   # since numbering starts at #5
    best_model = df.max().idxmax()          # best column name (model)
    best_score = df.max().max()             # best R² or accuracy
    summary.append({
        "Result_No": f"# {i}",
        "Best_Model": best_model,
        "Best_Score": best_score
    })

summary_df = pd.DataFrame(summary)
print(summary_df)

  Result_No Best_Model  Best_Score
0       # 5     Linear    0.212954
1       # 6     Linear    0.245240
2       # 7     Linear    0.233150
