In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset=pd.read_csv("clean_mall_data.csv")
dataset

Unnamed: 0,Shop_ID,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales,Rent
0,S001,2,2,2,352,6311,2,0,962727,67310.0
1,S002,1,1,4,2961,802,16,0,549722,45221.0
2,S003,4,4,0,1402,1138,9,0,1783693,3704.0
3,S004,1,4,4,1849,555,12,0,608924,68503.0
4,S005,1,0,3,1400,9217,13,1,1834033,85666.0
...,...,...,...,...,...,...,...,...,...,...
195,S196,4,3,4,1430,845,3,1,1352549,44571.0
196,S197,4,3,4,2205,9054,10,0,1381673,65858.0
197,S198,3,1,4,774,2943,10,1,1475402,41831.0
198,S199,1,0,4,2040,8283,9,0,1612833,49625.0


In [3]:
dataset.shape

(200, 10)

In [4]:
dataset.isnull().sum()

Shop_ID             0
Mall_Name           0
City                0
Floor               0
Shop_Size_sqft      0
Footfall_per_day    0
Nearby_Brands       0
Has_Food_Court      0
Monthly_Sales       0
Rent                0
dtype: int64

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Shop_ID           200 non-null    object 
 1   Mall_Name         200 non-null    int64  
 2   City              200 non-null    int64  
 3   Floor             200 non-null    int64  
 4   Shop_Size_sqft    200 non-null    int64  
 5   Footfall_per_day  200 non-null    int64  
 6   Nearby_Brands     200 non-null    int64  
 7   Has_Food_Court    200 non-null    int64  
 8   Monthly_Sales     200 non-null    int64  
 9   Rent              200 non-null    float64
dtypes: float64(1), int64(8), object(1)
memory usage: 15.8+ KB


In [6]:
dataset=dataset.drop("Shop_ID",axis=1)
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)
dataset

Unnamed: 0,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales,Rent
0,2,2,2,352,6311,2,0,962727,67310.0
1,1,1,4,2961,802,16,0,549722,45221.0
2,4,4,0,1402,1138,9,0,1783693,3704.0
3,1,4,4,1849,555,12,0,608924,68503.0
4,1,0,3,1400,9217,13,1,1834033,85666.0
...,...,...,...,...,...,...,...,...,...
195,4,3,4,1430,845,3,1,1352549,44571.0
196,4,3,4,2205,9054,10,0,1381673,65858.0
197,3,1,4,774,2943,10,1,1475402,41831.0
198,1,0,4,2040,8283,9,0,1612833,49625.0


In [7]:
dataset.columns

Index(['Mall_Name', 'City', 'Floor', 'Shop_Size_sqft', 'Footfall_per_day',
       'Nearby_Brands', 'Has_Food_Court', 'Monthly_Sales', 'Rent'],
      dtype='object')

In [8]:
indep_X=dataset.drop("Rent", axis=1)
indep_X

Unnamed: 0,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales
0,2,2,2,352,6311,2,0,962727
1,1,1,4,2961,802,16,0,549722
2,4,4,0,1402,1138,9,0,1783693
3,1,4,4,1849,555,12,0,608924
4,1,0,3,1400,9217,13,1,1834033
...,...,...,...,...,...,...,...,...
195,4,3,4,1430,845,3,1,1352549
196,4,3,4,2205,9054,10,0,1381673
197,3,1,4,774,2943,10,1,1475402
198,1,0,4,2040,8283,9,0,1612833


In [9]:
dep_Y=dataset["Rent"]
dep_Y

0      67310.0
1      45221.0
2       3704.0
3      68503.0
4      85666.0
        ...   
195    44571.0
196    65858.0
197    41831.0
198    49625.0
199    54938.0
Name: Rent, Length: 200, dtype: float64

## Model Creation

In [10]:
def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, y_train, y_test
    
def r2_prediction(regressor,X_test,y_test):
     y_pred = regressor.predict(X_test)
     from sklearn.metrics import r2_score
     r2=r2_score(y_test,y_pred)
     return r2
 
def Linear(X_train,y_train,X_test):       
      
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        param_grid = {}
        grid = GridSearchCV(regressor, param_grid, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        r2 = r2_prediction(best_model, X_test, y_test)
        return  r2   
    
def svm_linear(X_train,y_train,X_test):
                
        from sklearn.svm import LinearSVR
        param_grid = {
            'C': [0.1, 1, 10, 50, 100],
            'epsilon': [0.0, 0.1, 0.2]
        }
        regressor = LinearSVR(random_state=0)
        grid = GridSearchCV(regressor, param_grid, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
    
        best_model = grid.best_estimator_
        r2 = r2_prediction(best_model, X_test, y_test)
        return  r2  
    
def svm_NL(X_train,y_train,X_test):
                
        from sklearn.svm import SVR
        param_grid = {
            'C': [1, 10, 50, 100],
            'gamma': [0.001, 0.01, 0.05, 0.1],
            'kernel': ['rbf']
        }
        regressor = SVR()
        grid = GridSearchCV(regressor, param_grid, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
    
        best_model = grid.best_estimator_
        r2 = r2_prediction(best_model, X_test, y_test)
        return  r2  
     
def Decision(X_train,y_train,X_test):
        
        
        from sklearn.tree import DecisionTreeRegressor
        param_grid = {
            'max_depth': [3, 5, 10, None],
            'min_samples_split': [2, 5, 10]
        }
        regressor = DecisionTreeRegressor(random_state=0)
        grid = GridSearchCV(regressor, param_grid, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
    
        best_model = grid.best_estimator_
        r2 = r2_prediction(best_model, X_test, y_test)
        return  r2  
     

def random(X_train,y_train,X_test):       
        
        from sklearn.ensemble import RandomForestRegressor
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15, None],
            'min_samples_split': [2, 5, 10]
        }
        regressor = RandomForestRegressor(random_state=0)
        grid = GridSearchCV(regressor, param_grid, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
    
        best_model = grid.best_estimator_
        r2 = r2_prediction(best_model, X_test, y_test)
        return  r2 

def gradient(X_train,y_train,X_test):
    from sklearn.ensemble import GradientBoostingRegressor
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [2, 3, 4, 5]
    }
    regressor = GradientBoostingRegressor(random_state=0)
    grid = GridSearchCV(regressor, param_grid, scoring='r2', cv=5)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    r2 = r2_prediction(best_model, X_test, y_test)
    return r2
    
def rfeFeature(indep_X,dep_Y,n):
        rfelist=[]
        
        from sklearn.linear_model import LinearRegression
        lin = LinearRegression()
        
        from sklearn.svm import LinearSVR
        SVRl = LinearSVR()
        
        from sklearn.svm import SVR
        SVRnl = SVR(kernel = 'linear')
        
        from sklearn.tree import DecisionTreeRegressor
        dec = DecisionTreeRegressor(random_state = 0)
        
        from sklearn.ensemble import RandomForestRegressor
        rf = RandomForestRegressor(n_estimators = 100, random_state = 0)

        from sklearn.ensemble import GradientBoostingRegressor
        gbr = GradientBoostingRegressor(n_estimators=100, random_state=0)
        
        rfemodellist=[lin,SVRl,SVRnl,dec,rf,gbr] 
        for i in   rfemodellist:
            print(i)
            log_rfe = RFE(estimator=i, n_features_to_select=n)
            log_fit = log_rfe.fit(indep_X, dep_Y)
            log_rfe_feature=log_fit.transform(indep_X)
            rfelist.append(log_rfe_feature)
        return rfelist
    
def rfe_regression(acclog,accsvml,accsvmnl,accdes,accrf,accgb): 
    
    rfedataframe=pd.DataFrame(index=['Linear','SVMl','SVMNl','Random','DecisionTree','Gradient'],columns=['Linear','SVMl','SVMNl',
                                                                                        'Decision','Random','Gradient'])

    for number,idex in enumerate(rfedataframe.index):
        
        rfedataframe['Linear'][idex]=acclog[number]       
        rfedataframe['SVMl'][idex]=accsvml[number]
        rfedataframe['SVMNl'][idex]=accsvml[number]
        rfedataframe['Decision'][idex]=accdes[number]
        rfedataframe['Random'][idex]=accrf[number]
        rfedataframe['Gradient'][idex]=accrf[number]
    return rfedataframe

In [17]:
rfelist=rfeFeature(indep_X,dep_Y,7)       

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]
accgb=[]

LinearRegression()
LinearSVR()
SVR(kernel='linear')
DecisionTreeRegressor(random_state=0)
RandomForestRegressor(random_state=0)
GradientBoostingRegressor(random_state=0)


In [18]:
for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)  
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
    
    r2_NL=svm_NL(X_train,y_train,X_test)
    accsvmnl.append(r2_NL)
    
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=random(X_train,y_train,X_test)
    accrf.append(r2_r)

    r2_r=gradient(X_train,y_train,X_test)
    accgb.append(r2_r)
    
    
result=rfe_regression(acclin,accsvml,accsvmnl,accdes,accrf,accgb)

In [13]:
result1 = result
result1
#5

Unnamed: 0,Linear,SVMl,SVMNl,Decision,Random,Gradient
Linear,-0.06288,-2.088764,-2.088764,0.029438,0.006111,0.006111
SVMl,0.15121,-2.130667,-2.130667,-0.248457,0.008033,0.008033
SVMNl,0.1865,-2.050927,-2.050927,-0.447964,-0.01434,-0.01434
Random,0.187926,-2.102256,-2.102256,-0.281935,0.106983,0.106983
DecisionTree,0.230164,-2.03106,-2.03106,-0.201119,0.1024,0.1024
Gradient,0.187926,-2.102256,-2.102256,-0.281935,0.106983,0.106983


In [16]:
result2 = result
result2
#6

Unnamed: 0,Linear,SVMl,SVMNl,Decision,Random,Gradient
Linear,0.116926,-2.036921,-2.036921,-0.171851,-0.042057,-0.042057
SVMl,0.212238,-2.031158,-2.031158,-0.201119,0.078552,0.078552
SVMNl,0.151088,-2.044373,-2.044373,-0.359863,0.014855,0.014855
Random,0.180614,-2.091874,-2.091874,-0.313153,0.093166,0.093166
DecisionTree,0.186254,-2.024489,-2.024489,-0.201119,0.094301,0.094301
Gradient,0.180614,-2.091874,-2.091874,-0.313153,0.093166,0.093166


In [19]:
result3 = result
result3
#7

Unnamed: 0,Linear,SVMl,SVMNl,Decision,Random,Gradient
Linear,0.1374,-2.044466,-2.044466,-0.359863,-0.012175,-0.012175
SVMl,0.1374,-2.044466,-2.044466,-0.359863,-0.012175,-0.012175
SVMNl,0.1374,-2.044466,-2.044466,-0.359863,-0.012175,-0.012175
Random,0.170685,-2.092885,-2.092885,-0.313153,0.073406,0.073406
DecisionTree,0.171329,-2.024587,-2.024587,-0.201119,0.070951,0.070951
Gradient,0.170685,-2.092885,-2.092885,-0.313153,0.073406,0.073406


In [20]:
# --- Store all results DataFrames here ---
results = [result1, result2, result3]   # add as many as we have

summary = []

for i, df in enumerate(results, start=5):   # since numbering starts at #5
    best_model = df.max().idxmax()          # best column name (model)
    best_score = df.max().max()             # best R² or accuracy
    summary.append({
        "Result_No": f"# {i}",
        "Best_Model": best_model,
        "Best_Score": best_score
    })

summary_df = pd.DataFrame(summary)
print(summary_df)

  Result_No Best_Model  Best_Score
0       # 5     Linear    0.230164
1       # 6     Linear    0.212238
2       # 7     Linear    0.171329
