In [34]:
# IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# IMPORTING THE DATASET
data= pd.read_csv('Company_Data.csv')
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [5]:
data['ShelveLoc']=data['ShelveLoc'].map({'Medium':0,'Bad':1,'Good':2})
data['Urban']=data['Urban'].map({"Yes":1,"No":1})
data['US']=data['US'].map({"Yes":1,"No":0})
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,1,42,17,1,1
1,11.22,111,48,16,260,83,2,65,10,1,1
2,10.06,113,35,10,269,80,0,59,12,1,1
3,7.40,117,100,4,466,97,0,55,14,1,1
4,4.15,141,64,3,340,128,1,38,13,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,2,33,14,1,1
396,6.14,139,23,3,37,120,0,55,11,1,1
397,7.41,162,26,12,368,159,0,40,18,1,1
398,5.94,100,79,7,284,95,1,50,12,1,1


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    int64  
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    int64  
 10  US           400 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 34.5 KB


In [8]:
# TOP SALES
data.sort_values(by='Sales',ascending=False).head(10)

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
376,16.27,141,60,19,319,92,2,44,11,1,1
316,15.63,122,36,5,369,72,2,35,10,1,1
25,14.9,139,32,0,176,82,2,54,11,1,0
367,14.37,95,106,0,256,53,2,52,17,1,0
18,13.91,110,110,0,408,68,2,46,17,1,1
30,13.55,125,94,0,447,89,2,30,12,1,0
352,13.44,133,103,14,288,122,2,61,17,1,1
68,13.39,149,69,20,366,134,2,60,13,1,1
357,13.36,103,73,3,276,72,0,34,15,1,1
193,13.28,139,70,7,71,96,2,61,10,1,1


In [9]:
X=data.iloc[:,1:]
Y=data.iloc[:,0]

In [10]:
# SPLITTING DATA INTO TRAIN DATA & TEST DATA
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=4)

In [11]:
# HYPER PARAMETER TUNING
param_grid =[{'n_estimators':[3,10,30,100,1200],'max_features':[2,4,6,8],'max_depth':[3,4,6]},{'bootstrap':[False,True],'max_features':[2,3,4,6,8],'n_estimators':[3,10,30,100,120],'max_depth':[3,4,6,8]},]
RF_params= GridSearchCV(estimator=RandomForestRegressor(),param_grid=param_grid,n_jobs=4)
RF_params.fit(X_train,Y_train)

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=4,
             param_grid=[{'max_depth': [3, 4, 6], 'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30, 100, 1200]},
                         {'bootstrap': [False, True], 'max_depth': [3, 4, 6, 8],
                          'max_features': [2, 3, 4, 6, 8],
                          'n_estimators': [3, 10, 30, 100, 120]}])

In [12]:
RF_params.best_params_,RF_params.best_score_

({'bootstrap': True, 'max_depth': 8, 'max_features': 8, 'n_estimators': 120},
 0.6487992096587701)

In [24]:
# MODEL BUILDING
regressor= RandomForestRegressor(n_estimators=1200,bootstrap=True)
regressor.fit(X_train,Y_train)

RandomForestRegressor(n_estimators=1200)

In [33]:
# ACCURACY OF MODEL ON TRAIN DATA
pred_train=regressor.predict(X_train)
mae_train=mean_absolute_error(Y_train,pred_train)
mae_train

0.45880723958333725

In [37]:
score_train=r2_score(Y_train,pred_train)*100
score_train

95.71109807022098

 - Accuracy on Train Data is 95.7%

In [38]:
# ACCURACY OF MODEL ON TEST DATA
pred_test=regressor.predict(X_test)
mae_test=mean_absolute_error(Y_test,pred_test)
mae_test

1.3684045833333354

In [39]:
score_test=r2_score(Y_test,pred_test)*100
score_test

68.12364018552843

 - Accuracy on Test Data is 68.12%

In [40]:
# MODEL VALIDATION (CROSS VALIDATION SCORE)
kfold=KFold(n_splits=10,random_state=7,shuffle=True)
cross_val_score(regressor,X,Y,cv=kfold).mean()*100

69.54790332302271

In [41]:
# CONVERTING SALES VARIABLE INTO CATEGORICAL FEATURE
def get_categorical_data(X):
    sales_status=[]
    for i in X:
        if i>=9:
            sales_status.append('1')
        elif (i<9):
            sales_status.append('0')
    return sales_status

In [43]:
data['sales_status']=get_categorical_data(data['Sales'])
data

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US,sales_status
0,9.50,138,73,11,276,120,1,42,17,1,1,1
1,11.22,111,48,16,260,83,2,65,10,1,1,1
2,10.06,113,35,10,269,80,0,59,12,1,1,1
3,7.40,117,100,4,466,97,0,55,14,1,1,0
4,4.15,141,64,3,340,128,1,38,13,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,2,33,14,1,1,1
396,6.14,139,23,3,37,120,0,55,11,1,1,0
397,7.41,162,26,12,368,159,0,40,18,1,1,0
398,5.94,100,79,7,284,95,1,50,12,1,1,0


In [45]:
Xn = data.drop(columns=['Sales','sales_status'])
Yn = data['sales_status']
Xn_train,Xn_test,Yn_train,Yn_test = train_test_split(Xn,Yn,test_size=0.15,random_state=4)

In [48]:
# HYPERPARAMETER TUNING
CF_params= GridSearchCV(estimator=RandomForestClassifier(),param_grid=param_grid,n_jobs=4)
CF_params.fit(Xn,Yn)
CF_params.best_estimator_,CF_params.best_score_

(RandomForestClassifier(max_depth=8, max_features=3, n_estimators=120),
 0.8574999999999999)

In [49]:
# MODEL BUILDING
classifier = RandomForestClassifier()
classifier.fit(Xn_train,Yn_train)

RandomForestClassifier()

In [53]:
predn_test=classifier.predict(Xn_test)
accuracy_score(predn_test,Yn_test)

0.9166666666666666

 - Accuracy of this Model is 91.66%

In [52]:
# MODEL VALIDATION (CROSS VALIDATION SCORE)
kfold=KFold(n_splits=10,random_state=7,shuffle=True)
cross_val_score(classifier,Xn,Yn,cv=kfold).mean()*100

86.25