In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [11]:
dataset=pd.read_csv("clean_mall_data.csv")
dataset

Unnamed: 0,Shop_ID,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales,Rent
0,S001,2,2,2,352,6311,2,0,962727,67310.0
1,S002,1,1,4,2961,802,16,0,549722,45221.0
2,S003,4,4,0,1402,1138,9,0,1783693,3704.0
3,S004,1,4,4,1849,555,12,0,608924,68503.0
4,S005,1,0,3,1400,9217,13,1,1834033,85666.0
...,...,...,...,...,...,...,...,...,...,...
195,S196,4,3,4,1430,845,3,1,1352549,44571.0
196,S197,4,3,4,2205,9054,10,0,1381673,65858.0
197,S198,3,1,4,774,2943,10,1,1475402,41831.0
198,S199,1,0,4,2040,8283,9,0,1612833,49625.0


In [12]:
dataset.shape

(200, 10)

In [13]:
dataset.isnull().sum()

Shop_ID             0
Mall_Name           0
City                0
Floor               0
Shop_Size_sqft      0
Footfall_per_day    0
Nearby_Brands       0
Has_Food_Court      0
Monthly_Sales       0
Rent                0
dtype: int64

In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Shop_ID           200 non-null    object 
 1   Mall_Name         200 non-null    int64  
 2   City              200 non-null    int64  
 3   Floor             200 non-null    int64  
 4   Shop_Size_sqft    200 non-null    int64  
 5   Footfall_per_day  200 non-null    int64  
 6   Nearby_Brands     200 non-null    int64  
 7   Has_Food_Court    200 non-null    int64  
 8   Monthly_Sales     200 non-null    int64  
 9   Rent              200 non-null    float64
dtypes: float64(1), int64(8), object(1)
memory usage: 15.8+ KB


In [15]:
dataset=dataset.drop("Shop_ID",axis=1)
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)
dataset

Unnamed: 0,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales,Rent
0,2,2,2,352,6311,2,0,962727,67310.0
1,1,1,4,2961,802,16,0,549722,45221.0
2,4,4,0,1402,1138,9,0,1783693,3704.0
3,1,4,4,1849,555,12,0,608924,68503.0
4,1,0,3,1400,9217,13,1,1834033,85666.0
...,...,...,...,...,...,...,...,...,...
195,4,3,4,1430,845,3,1,1352549,44571.0
196,4,3,4,2205,9054,10,0,1381673,65858.0
197,3,1,4,774,2943,10,1,1475402,41831.0
198,1,0,4,2040,8283,9,0,1612833,49625.0


In [16]:
dataset.columns

Index(['Mall_Name', 'City', 'Floor', 'Shop_Size_sqft', 'Footfall_per_day',
       'Nearby_Brands', 'Has_Food_Court', 'Monthly_Sales', 'Rent'],
      dtype='object')

In [17]:
indep_X=dataset.drop("Rent", axis=1)
indep_X

Unnamed: 0,Mall_Name,City,Floor,Shop_Size_sqft,Footfall_per_day,Nearby_Brands,Has_Food_Court,Monthly_Sales
0,2,2,2,352,6311,2,0,962727
1,1,1,4,2961,802,16,0,549722
2,4,4,0,1402,1138,9,0,1783693
3,1,4,4,1849,555,12,0,608924
4,1,0,3,1400,9217,13,1,1834033
...,...,...,...,...,...,...,...,...
195,4,3,4,1430,845,3,1,1352549
196,4,3,4,2205,9054,10,0,1381673
197,3,1,4,774,2943,10,1,1475402
198,1,0,4,2040,8283,9,0,1612833


In [18]:
dep_Y=dataset["Rent"]
dep_Y

0      67310.0
1      45221.0
2       3704.0
3      68503.0
4      85666.0
        ...   
195    44571.0
196    65858.0
197    41831.0
198    49625.0
199    54938.0
Name: Rent, Length: 200, dtype: float64

## Model Creation

In [19]:
def pca_features(indep_X, n):
    pca = PCA(n_components=n)
    pca_features = pca.fit_transform(indep_X)
    return pca_features, pca

def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, y_train, y_test
    
def r2_prediction(regressor,X_test,y_test):
     y_pred = regressor.predict(X_test)
     from sklearn.metrics import r2_score
     r2=r2_score(y_test,y_pred)
     return r2
 
def Linear(X_train,y_train,X_test,y_test):       
        from sklearn.linear_model import Ridge
        params = {'alpha': [0.01, 0.1, 1, 10, 100]}
        grid = GridSearchCV(estimator=Ridge(), param_grid=params, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
        r2 = r2_prediction(grid.best_estimator_, X_test, y_test)
        return  r2   
    
def svm_linear(X_train,y_train,X_test,y_test):
                
        from sklearn.svm import SVR
        params = {'C': [0.1, 1, 10, 100], 'kernel': ['linear']}
        grid = GridSearchCV(estimator=SVR(), param_grid=params, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
        r2 = r2_prediction(grid.best_estimator_, X_test, y_test)
        return  r2  
    
def svm_NL(X_train,y_train,X_test,y_test):
                
        from sklearn.svm import SVR
        params = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['rbf']}
        grid = GridSearchCV(estimator=SVR(), param_grid=params, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
        r2 = r2_prediction(grid.best_estimator_, X_test, y_test)
        return  r2  
     
def Decision(X_train,y_train,X_test,y_test):
              
        from sklearn.tree import DecisionTreeRegressor
        params = {'max_depth': [None, 3, 5, 10], 'min_samples_split': [2, 5, 10]}
        grid = GridSearchCV(estimator=DecisionTreeRegressor(random_state=0), param_grid=params, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
        r2=r2_prediction(grid.best_estimator_,X_test,y_test)
        return  r2  
     
def random(X_train,y_train,X_test,y_test):       
        
        from sklearn.ensemble import RandomForestRegressor
        params = {'n_estimators': [10, 50, 100], 'max_depth': [None, 5, 10], 'min_samples_split': [2, 5]}
        grid = GridSearchCV(estimator=RandomForestRegressor(random_state=0), param_grid=params, scoring='r2', cv=5)
        grid.fit(X_train, y_train)
        r2 = r2_prediction(grid.best_estimator_, X_test, y_test)
        return  r2 
        
def PCA_regression(acclin, accsvml, accsvmnl, accdes, accrf, index_names):
    dataframe = pd.DataFrame(index=index_names,
                             columns=['Linear', 'SVMl', 'SVMnl', 'Decision', 'Random'])
    for number, idex in enumerate(dataframe.index):
        dataframe['Linear'][idex] = acclin[number]
        dataframe['SVMl'][idex] = accsvml[number]
        dataframe['SVMnl'][idex] = accsvmnl[number]
        dataframe['Decision'][idex] = accdes[number]
        dataframe['Random'][idex] = accrf[number]
    return dataframe

In [22]:
kbest = [2, 3, 5, 6, 8] 

acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]
index_names=[]

In [23]:
for k in kbest:
    pca_data, pca = pca_features(indep_X, k)
    X_train, X_test, y_train, y_test = split_scalar(pca_data, dep_Y)

    r2_lin = Linear(X_train, y_train, X_test, y_test)
    acclin.append(r2_lin)

    r2_svm_l = svm_linear(X_train, y_train, X_test, y_test)
    accsvml.append(r2_svm_l)

    r2_svm_nl = svm_NL(X_train, y_train, X_test, y_test)
    accsvmnl.append(r2_svm_nl)

    r2_d = Decision(X_train, y_train, X_test, y_test)
    accdes.append(r2_d)

    r2_r = random(X_train, y_train, X_test, y_test)
    accrf.append(r2_r)

    index_names.append(f'PCA_{k}')

In [24]:
result = PCA_regression(acclin, accsvml, accsvmnl, accdes, accrf, index_names)
result

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
PCA_2,0.06118,0.011406,-0.020982,-0.085937,-0.023898
PCA_3,0.198645,0.076955,-0.019833,-0.008697,0.105562
PCA_5,0.22846,0.089114,-0.020315,-0.126417,0.019328
PCA_6,0.196274,0.086964,-0.020694,-0.26776,0.023948
PCA_8,0.167547,0.092507,-0.020994,-0.278576,-0.049534


In [25]:
best_score = result.max().max()                  # highest score in the table
best_model = result.max().idxmax()               # column (model) with best score
best_pca = result.loc[:, best_model].idxmax()    # row (PCA component) with best score

print("Best Model and PCA Component:")
print(f"Best Model     : {best_model}")
print(f"Best PCA       : {best_pca}")
print(f"Best R² Score  : {best_score:.4f}")

Best Model and PCA Component:
Best Model     : Linear
Best PCA       : PCA_5
Best R² Score  : 0.2285
