## Import Necessary Modules 

In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd



#import the necessary modelling algos.

#classifiaction.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
 

#regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

#model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#preprocessing
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer,LabelEncoder

#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification

## Loading the Data File  

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
data = train
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
print(data.shape , train.shape, test.shape)

(8523, 12) (8523, 12) (5681, 11)


In [4]:
data.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [5]:
data['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [6]:
train['Item_Fat_Content'].replace(['low fat','LF','reg'],['Low Fat','Low Fat','Regular'],inplace = True)
test['Item_Fat_Content'].replace(['low fat','LF','reg'],['Low Fat','Low Fat','Regular'],inplace = True)

test['Item_Fat_Content'].unique()


array(['Low Fat', 'Regular'], dtype=object)

In [7]:
data.isnull().sum() # in normal way finding no.of null values in test data

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

### replacing mean in the place of Null values in Item weight 

In [8]:
data["Item_Weight"] = data["Item_Weight"].fillna(data["Item_Weight"].mean())


In [9]:
data['Outlet_Size'].unique()

array(['Medium', nan, 'High', 'Small'], dtype=object)

In [10]:
data["Outlet_Size"] = data["Outlet_Size"].fillna("Medium") 


### Checking for Null values again

In [11]:
train.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [12]:
train.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
6,FDO10,13.65,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
7,FDP10,12.857645,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,Medium,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,Medium,Tier 2,Supermarket Type1,4710.535


In [13]:
train.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

### Finding Unique Terms in data set 

In [14]:
temp_dict = {}
for i in train.columns:
    print(i)
    temp_dict[i] = len(set(train[i]))
    print(temp_dict[i])
    print("**********") 

Item_Identifier
1559
**********
Item_Weight
416
**********
Item_Fat_Content
2
**********
Item_Visibility
7880
**********
Item_Type
16
**********
Item_MRP
5938
**********
Outlet_Identifier
10
**********
Outlet_Establishment_Year
9
**********
Outlet_Size
3
**********
Outlet_Location_Type
3
**********
Outlet_Type
4
**********
Item_Outlet_Sales
3493
**********


In [15]:
train['Item_Outlet_Sales'].head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

### Encoding data with labelss 

In [16]:
from sklearn.preprocessing import LabelEncoder

data = data.apply(LabelEncoder().fit_transform)


In [17]:
len(train.columns)

12

### Splitting data into Train and Test data sets  

In [18]:
from sklearn.model_selection import train_test_split
from sklearn import metrics # for checking the model accuracy


train, test = train_test_split(data, test_size=0.3) 



print(train.shape, test.shape)

(5966, 12) (2557, 12)


In [19]:
train_x = train[['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']]
train_y = train.Item_Outlet_Sales

test_x = test[['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']]
test_y = test.Item_Outlet_Sales

# Applying Machine Learning Algorithms on this Regression problem 

### Linear Regression

In [20]:
model = LinearRegression()
model.fit(train_x, train_y)

# predicting the  test set results
y_pred = model.predict(test_x)
print(y_pred)

# finding the mean squared error and variance
mse = mean_squared_error(test_y, y_pred)
print('RMSE :', np.sqrt(mse))
print('Variance score: %.2f' % r2_score(test_y, y_pred))



[1784.17239196 1500.20529935 1535.86266578 ... 2367.12809724 1404.39228222
 1238.45614137]
RMSE : 806.3796781245247
Variance score: 0.21


### KNN Regression

In [21]:
model = KNeighborsRegressor(n_neighbors=20)
model.fit(train_x, train_y)

# predicting the  test set results
y_pred = model.predict(test_x)
print(y_pred)

# finding the mean squared error and variance
mse = mean_squared_error(test_y, y_pred)
print('RMSE :', np.sqrt(mse))
print('Variance score: %.2f' % r2_score(test_y, y_pred))


[1554.35 1825.15 1411.1  ... 2456.1  1422.25 1744.35]
RMSE : 780.9787268029847
Variance score: 0.26


### Support Vector Regression

In [22]:
model = SVR(kernel='rbf')
model.fit(train_x, train_y)

# predicting the  test set results
y_pred = model.predict(test_x)
print(y_pred)

# finding the mean squared error and variance
mse = mean_squared_error(test_y, y_pred)
print('RMSE :', np.sqrt(mse))
print('Variance score: %.2f' % r2_score(test_y, y_pred))

[1477.60092944 1418.56529269 1427.70393536 ... 1518.73676388 1408.86162134
 1437.95134844]
RMSE : 896.5979100265183
Variance score: 0.02
