## Import Necessary Modules 

In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd



#import the necessary modelling algos.

#classification.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
 

#regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor


#model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#preprocessing
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer,LabelEncoder

#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification

## Loading the Data File  

In [2]:
# Read the raw training and test tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Work on an independent copy: a plain `data = train` would only alias the
# frame, so every cleaning step applied to `data` would also rewrite `train`.
data = train.copy()
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# Report (rows, columns) for the working frame, the training frame and the test frame.
print(*(frame.shape for frame in (data, train, test)))

(8523, 12) (8523, 12) (5681, 11)


In [4]:
# List the column labels of the working frame.
data.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [5]:
# Show the distinct labels present in the Item_Fat_Content column.
data['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [6]:
# Collapse the inconsistent spellings into the two canonical labels.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `data['Item_Fat_Content']`: that chained
# in-place form is deprecated and leaves `data` unchanged when pandas
# Copy-on-Write is active. One pass is sufficient, so the statement is not repeated.
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

# Confirm that only the canonical labels remain.
data['Item_Fat_Content'].unique()


array(['Low Fat', 'Regular'], dtype=object)

In [7]:
data.isnull().sum() # number of missing values per column of `data` (loaded from train.csv, not the test set)

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

### Replacing null values in Item_Weight with the column mean

In [8]:
# Fill the missing item weights with the mean of the observed weights.
# The mean is taken over every row of `data`, i.e. before any train/test split.
item_weight = data["Item_Weight"]
data["Item_Weight"] = item_weight.fillna(item_weight.mean())


In [9]:
# Show the distinct labels present in the Outlet_Size column (including NaN, if any).
data['Outlet_Size'].unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [10]:
# Label every outlet whose size is missing as "Medium".
data.loc[data["Outlet_Size"].isna(), "Outlet_Size"] = "Medium"


### Checking for Null values again

In [11]:
# Re-count missing values per column after the two fillna steps.
data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [12]:
# Count the column labels that repeat an earlier label (0 means all labels are unique).
data.columns.duplicated().sum()

0

In [13]:
# List the column labels of `data` again.
data.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

### Counting the unique values in each column

In [14]:
# Number of distinct values in each column (axis 0, the default, is the row axis).
data.nunique(axis=0)

Item_Identifier              1559
Item_Weight                   416
Item_Fat_Content                2
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

In [15]:
# Preview the first rows of the Item_Outlet_Sales column, which is used as the target below.
data['Item_Outlet_Sales'].head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

### Splitting data into Train and Test data sets  

In [16]:
# Only the categorical columns are used as predictors; each is one-hot encoded.
categorical_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
# Target: sales of each item at each outlet.
y = data['Item_Outlet_Sales']
x = pd.get_dummies(data[categorical_cols])
print("Total features: {}".format(len(x.columns)))

Total features: 28


In [17]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=0,
)

# Shapes of the four resulting pieces, in the order they were unpacked.
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

(5966, 28) (2557, 28) (5966,) (2557,)


# Applying Machine Learning Algorithms  



In [18]:
# Candidate models, keyed by the label shown in the results table.
# Dict insertion order keeps each label aligned with its estimator.
_regressor_by_name = {
    'Linear Regression': LinearRegression(),
    'KNN': KNeighborsRegressor(n_neighbors=20),
    'Support Vector Regression': SVR(kernel='rbf'),
    'Decision Tree Regression': DecisionTreeRegressor(max_depth=15, min_samples_leaf=100),
}

# Parallel lists consumed by the evaluation loop.
Regressor_names = list(_regressor_by_name)
Regressors = list(_regressor_by_name.values())

In [19]:
# Fit each candidate on the training split and score it on the held-out split.
# RMSE is the square root of the mean squared error, so it is in the units of the target.
rmse = [
    np.sqrt(
        mean_squared_error(
            test_y,
            model.fit(train_x, train_y).predict(test_x),
        )
    )
    for model in Regressors
]

# Column-oriented summary: one label and one RMSE per model, in matching order.
d = {'Modelling Algorithm': Regressor_names, 'RMSE': rmse}
d

{'Modelling Algorithm': ['Linear Regression',
  'KNN',
  'Support Vector Regression',
  'Decision Tree Regression'],
 'RMSE': [1519.445722431191,
  1560.1768541560475,
  1787.772204001196,
  1519.8488574304517]}

In [20]:
# Turn the column-oriented summary dict into a table (keys become column names).
result = pd.DataFrame.from_dict(d)

In [21]:
# Display the table of per-model RMSE values.
result

Unnamed: 0,Modelling Algorithm,RMSE
0,Linear Regression,1519.445722
1,KNN,1560.176854
2,Support Vector Regression,1787.772204
3,Decision Tree Regression,1519.848857
