In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy import stats


import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
## Importing Datasets
# Both CSV files are read straight from the public GitHub repository.
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/dsrscientist/bigdatamart_rep/master/bigdatamart_Train.csv'
TEST_CSV_URL = 'https://raw.githubusercontent.com/dsrscientist/bigdatamart_rep/master/bigdatamart_Test.csv'

train_data = pd.read_csv(TRAIN_CSV_URL)
test_data = pd.read_csv(TEST_CSV_URL)

In [None]:
# Preview the first rows of the training frame.
train_data.head()


In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# (rows, columns) of the train and test frames, one per line.
print(train_data.shape, test_data.shape, sep="\n")

In [None]:
# Summary statistics of the training frame, transposed so each described column is a row.
train_data.describe().T

In [None]:
# Number of missing values per column in the training frame.
train_data.isnull().sum()

In [None]:
# Number of missing values per column in the test frame.
test_data.isnull().sum()


In [None]:
# Tag each frame with its origin so the combined frame can be split back
# into train and test rows after the shared preprocessing.
train_data = train_data.assign(source='train')
test_data = test_data.assign(source='test')
df = pd.concat((train_data, test_data), ignore_index=True)

In [None]:
# Missing values per column in the combined train+test frame.
# NOTE(review): presumably the test rows carry no Item_Outlet_Sales, so that
# column is expected to show nulls here — confirm against the output.
df.isnull().sum()

In [None]:
# Distribution of every numeric training column.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
numeric_columns = train_data.select_dtypes(include=np.number).columns
for column in numeric_columns:
    sns.histplot(train_data[column].dropna(), kde=True, stat="density")
    plt.show()

In [None]:
# Box plot of every numeric training column to eyeball outliers.
# The vector is passed as x= explicitly: newer seaborn releases treat the first
# positional argument as `data`, not `x`.
numeric_columns = train_data.select_dtypes(include=np.number).columns
for column in numeric_columns:
    sns.boxplot(x=train_data[column].dropna())
    plt.show()

In [None]:
# Number of training rows per Item_Type.
# The column is named via x=/data= because newer seaborn releases treat the
# first positional argument as `data`, not as the x vector.
plt.figure(figsize=(15,10))
sns.countplot(x='Item_Type', data=train_data)
plt.xticks(rotation=90)  # category names are long; rotate so they do not overlap
plt.show()

In [None]:
# Row count per Item_Type value in the training frame.
train_data.Item_Type.value_counts()

In [None]:
# Distribution of Outlet_Size in the training frame.
# x=/data= keywords are used because newer seaborn releases treat the first
# positional argument as `data`, not as the x vector.
plt.figure(figsize=(10,8))
sns.countplot(x='Outlet_Size', data=train_data)
plt.show()

In [None]:
# Distribution of Outlet_Location_Type in the training frame.
# x=/data= keywords are used because newer seaborn releases treat the first
# positional argument as `data`, not as the x vector.
plt.figure(figsize=(10,8))
sns.countplot(x='Outlet_Location_Type', data=train_data)
plt.show()

In [None]:
# Row count per Outlet_Location_Type value in the training frame.
train_data.Outlet_Location_Type.value_counts()

In [None]:
# Distribution of Outlet_Type in the training frame.
# x=/data= keywords are used because newer seaborn releases treat the first
# positional argument as `data`, not as the x vector.
plt.figure(figsize=(10,8))
sns.countplot(x='Outlet_Type', data=train_data)
plt.xticks(rotation=90)  # rotate the category labels so they stay readable
plt.show()

In [None]:
# Row count per Outlet_Type value in the training frame.
train_data.Outlet_Type.value_counts()

In [None]:

# Item weight against outlet sales, coloured by item type and sized by weight.
plt.figure(figsize=(10,8))
plt.xlabel("Item_Weight")
plt.ylabel("Item_Outlet_Sales")
plt.title("Item Weight and Item Outlet Sales")  # fixed typo: was "Itam Weight"
sns.scatterplot(x='Item_Weight', y='Item_Outlet_Sales', hue='Item_Type',size='Item_Weight',data=train_data)

In [None]:
# Item visibility against outlet sales, coloured by item type and sized by weight.
plt.figure(figsize=(13, 9))
plt.title("Item Visibility and Item Outlet Sales", fontsize=15)
plt.xlabel("Item_Visibility")
plt.ylabel("Item_Outlet_Sales")
sns.scatterplot(
    data=train_data,
    x="Item_Visibility",
    y="Item_Outlet_Sales",
    hue="Item_Type",
    size='Item_Weight',
)

In [None]:
# Item visibility against maximum retail price, drawn as translucent dots.
plt.figure(figsize=(12, 7))
plt.title("Item_visibility and Maximum Retail Price")
plt.xlabel("Item_visibility")
plt.ylabel("Maximum Retail Price")
plt.plot(train_data['Item_Visibility'], train_data['Item_MRP'], ".", alpha=0.3)

In [None]:
# Median Item_Outlet_Sales per Outlet_Type.
# The aggregation is named as the string 'median': passing the np.median
# callable triggers a pandas FutureWarning about a planned behaviour change,
# and the string form keeps the current result.
Outlet_Type_pivot = train_data.pivot_table(index='Outlet_Type',values='Item_Outlet_Sales', aggfunc='median')

Outlet_Type_pivot.plot(kind='bar', color='pink', figsize=(12,8))
plt.xlabel("Outlet_Type")
plt.ylabel("Item_Outlet_Sales")
plt.title("Impact of Outlet_type on Item_Outlet_Sales")
plt.show()

In [None]:
# Median Item_Outlet_Sales per raw Item_Fat_Content label.
# The aggregation is named as the string 'median': passing the np.median
# callable triggers a pandas FutureWarning about a planned behaviour change,
# and the string form keeps the current result.
Item_Fat_Content_pivot = train_data.pivot_table(index='Item_Fat_Content', values='Item_Outlet_Sales', aggfunc='median')

Item_Fat_Content_pivot.plot(kind='bar',color='green', figsize=(12,7))
plt.xlabel("Item_Fat_Content")
plt.ylabel("Item_Outlet_Sales")
plt.title("Impact of Item_Fat_Content on Item_outlet_Sales")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Row count per Item_Fat_Content label in the combined frame.
df['Item_Fat_Content'].value_counts()

In [None]:
# Collapse the alternate spellings onto the two canonical labels.
fat_content_aliases = {'LF':'Low Fat','reg':'Regular','low fat':'Low Fat'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Row count per Item_Fat_Content label in the combined frame.
df['Item_Fat_Content'].value_counts()

In [None]:
# Apply the same label clean-up to the training frame, which the plots read.
fat_content_aliases = {'LF':'Low Fat','reg':'Regular','low fat':'Low Fat'}
train_data['Item_Fat_Content'] = train_data['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Median Item_Outlet_Sales per Item_Fat_Content label in the training frame.
# The aggregation is named as the string 'median': passing the np.median
# callable triggers a pandas FutureWarning about a planned behaviour change,
# and the string form keeps the current result.
Item_Fat_Content_pivot = train_data.pivot_table(index='Item_Fat_Content', values='Item_Outlet_Sales', aggfunc='median')

Item_Fat_Content_pivot.plot(kind='bar',color='brown', figsize=(12,7))
plt.xlabel("Item_Fat_Content")
plt.ylabel("Item_Outlet_Sales")
plt.title("Impact of Item_Fat_Content on Item_outlet_Sales")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Pairwise correlation of the numeric training columns.
# The frame also holds string columns, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so restrict to numeric dtypes first.
train_data.select_dtypes(include=np.number).corr()

In [None]:
# Heat map of the pairwise correlations between the numeric training columns.
# Non-numeric columns are excluded up front: DataFrame.corr() raises on string
# data in pandas >= 2.0.
plt.figure(figsize=(35,15))
numeric_corr = train_data.select_dtypes(include=np.number).corr()
sns.heatmap(numeric_corr, vmax=1,square=True, cmap='viridis')
plt.title("Correlation between different attributes")
plt.show()

In [None]:
# Treating the missing values
# Item_Weight: show the column mean, which a later cell uses to fill the gaps.
 
df['Item_Weight'].mean()

In [None]:
# Impute missing Item_Weight values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df['Item_Weight'] selection is chained assignment, which pandas
# deprecates and which does not update df under copy-on-write.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

In [None]:
# Remaining null counts per column; the NaN values in Outlet_Size are replaced
# with "Medium" in a later cell.
df.isnull().sum()

In [None]:
# Row count per Outlet_Size value (value_counts skips NaN by default).
df['Outlet_Size'].value_counts()

In [None]:
# Impute missing Outlet_Size values with the "Medium" category.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df['Outlet_Size'] selection is chained assignment, which pandas
# deprecates and which does not update df under copy-on-write.
df['Outlet_Size'] = df['Outlet_Size'].fillna("Medium")


In [None]:
# Re-check null counts after the Item_Weight and Outlet_Size fills.
# NOTE(review): df also contains the test rows, so Item_Outlet_Sales may still
# report nulls here — confirm against the output.
df.isnull().sum()

In [None]:
# Shapes of the training frame and of the combined frame, one per line.
print(train_data.shape, df.shape, sep="\n")

In [None]:
# Item_Visibility: how many rows report a visibility of exactly zero.
zero_visibility = df['Item_Visibility'] == 0
zero_visibility.sum()

In [None]:
# Treat zero visibility as a missing measurement and impute it with the median
# of the non-zero values. The previous cell counts the zero rows, but a plain
# fillna() never touches them because they are 0 rather than NaN; it was also
# a chained in-place call, which does not update df under copy-on-write.
visibility = df['Item_Visibility'].replace(0, np.nan)
df['Item_Visibility'] = visibility.fillna(visibility.median())

In [None]:
# Outlet years: row count per Outlet_Establishment_Year value.
df['Outlet_Establishment_Year'].value_counts()

In [None]:
# Age of each outlet relative to a fixed reference year.
# NOTE(review): 2009 is hard-coded; confirm it is the intended reference year.
REFERENCE_YEAR = 2009
df['Outlet_Years'] = REFERENCE_YEAR - df['Outlet_Establishment_Year']
df['Outlet_Years'].describe()

In [None]:
# Item type: row count per Item_Type value in the combined frame.
df['Item_Type'].value_counts()

In [None]:
# Row count per Item_Identifier. The identifiers begin with a two-character
# code (FD / NC / DR) that later cells map to Food, Non_Consumable and Drinks.
df['Item_Identifier'].value_counts()

In [None]:
# Keep only the first two characters of the identifier (the category code).
# The vectorized .str accessor replaces the row-wise lambda and yields NaN
# instead of raising if an identifier is missing.
df['New_Item_Type'] = df['Item_Identifier'].str[:2]

In [None]:
# Translate the two-letter category codes into readable names, then show the
# resulting category sizes.
item_type_names = {'FD':'Food','NC':'Non_Consumable','DR':'Drinks'}
df['New_Item_Type'] = df['New_Item_Type'].map(item_type_names)

df['New_Item_Type'].value_counts()

In [None]:
# A fat-content label is not meaningful for a non-consumable product, so rows
# whose New_Item_Type is "Non_Consumable" get their own Item_Fat_Content
# category, "Non-Edible".

df.loc[df['New_Item_Type']=="Non_Consumable","Item_Fat_Content"] = "Non-Edible"
df['Item_Fat_Content'].value_counts()

In [None]:
# Mean Item_Visibility per Item_Identifier ('mean' is pivot_table's default aggregation).
item_visib_avg = df.pivot_table(
    index='Item_Identifier',
    values='Item_Visibility',
    aggfunc='mean',
)

In [None]:
# Display the per-item visibility table.
item_visib_avg

In [None]:
# If a product is more visible, it is likely to get higher sales.
# Feature: each row's visibility divided by the mean visibility of the same
# Item_Identifier across all rows of df.
# groupby().transform('mean') broadcasts the per-item mean back onto every row
# in one vectorized pass. It replaces a row-wise apply that rebuilt a boolean
# mask over the whole pivot table for each row and then used [0] positionally
# on a string-indexed Series, which is deprecated in pandas.
item_mean_visibility = df.groupby('Item_Identifier')['Item_Visibility'].transform('mean')
df['item_visib_avg'] = (df['Item_Visibility'] / item_mean_visibility).astype(float)

In [None]:
# Display the per-item visibility table again.
item_visib_avg

In [None]:
# Integer-encode the categorical columns with scikit-learn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

# 'Outlet' is an integer-coded copy of Outlet_Identifier; the identifier column
# itself is left untouched.
df['Outlet'] = label.fit_transform(df['Outlet_Identifier'])
varib = ['Item_Fat_Content','Outlet_Location_Type', 'Outlet_Size','New_Item_Type','Outlet_Type','Outlet']

# The single encoder is re-fitted for each column in list order, so afterwards
# it only remembers the classes of the last column.
df = df.assign(**{column: label.fit_transform(df[column]) for column in varib})

In [None]:
# Preview the combined frame after encoding.
df.head()

In [None]:
# Model building: check the column dtypes before assembling the model inputs.
df.dtypes

In [None]:
# Split the combined frame back into its train and test rows.
# .copy() makes each part an independent frame, so the column drops that
# follow modify real frames rather than slices of df (no SettingWithCopyWarning).
train_data = df.loc[df['source']=='train'].copy()
test_data = df.loc[df['source']=='test'].copy()

In [None]:
# Remove the helper 'source' column from the training rows.
# drop() returns a new frame that is rebound to the name, which avoids an
# in-place mutation of what may be a slice of df.
train_data = train_data.drop(columns=['source'])

In [None]:
# Remove the target column and the helper 'source' column from the test rows.
# drop() returns a new frame that is rebound to the name, which avoids an
# in-place mutation of what may be a slice of df.
test_data = test_data.drop(columns=['Item_Outlet_Sales', 'source'])

In [None]:
# Assemble the model inputs: the identifier columns are excluded from the
# features, and Item_Outlet_Sales is the regression target.
identifier_columns = ['Item_Identifier', 'Outlet_Identifier']
target_column = 'Item_Outlet_Sales'

X_train = train_data.drop(columns=[target_column] + identifier_columns).copy()
y_train = train_data[target_column]
X_test = test_data.drop(columns=identifier_columns).copy()


In [None]:
# Linear Regression: ordinary least squares fitted on the training features.
from sklearn.linear_model import LinearRegression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises TypeError. It is dropped rather than replaced:
# rescaling the features does not change the predictions of an unregularized
# least-squares fit.
lr = LinearRegression()

lr.fit(X_train , y_train)

In [None]:
# Predict sales for the test features with the fitted linear model.
lr_pred = lr.predict(X_test)

In [None]:
# Display the linear-model predictions.
lr_pred

In [None]:
# Training-set score as a rounded percentage. For scikit-learn regressors
# score() is R^2, not a classification accuracy, and here it is measured on the
# same rows the model was fitted on.
# NOTE(review): consider a held-out split or cross-validation.
lr_accuracy = round(lr.score(X_train,y_train) * 100)
lr_accuracy

In [None]:
# DecisionTreeRegressor: depth-limited tree with at least 100 samples per leaf.
from sklearn.tree import DecisionTreeRegressor
# random_state pins the tie-breaking between equally good splits so a
# Restart & Run All reproduces the same tree.
tree = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100, random_state=42)

tree.fit(X_train,y_train)

tree_pred = tree.predict(X_test)

In [None]:
# Display the decision-tree predictions for the test features.
tree_pred

In [None]:
# Training-set score as a rounded percentage. For scikit-learn regressors
# score() is R^2, not a classification accuracy, and here it is measured on the
# same rows the model was fitted on.
# NOTE(review): consider a held-out split or cross-validation.
tree_accuracy = round(tree.score(X_train, y_train)*100)
tree_accuracy

In [None]:
# RandomForestRegressor: 400 shallow trees, at least 100 samples per leaf.
from sklearn.ensemble import RandomForestRegressor

# random_state fixes the bootstrap sampling so the forest, its predictions and
# its score are reproducible across kernel restarts.
rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=6,
    min_samples_leaf=100,
    n_jobs=4,
    random_state=42,
)

rf.fit(X_train,y_train)

rf_pred = rf.predict(X_test)

# Training-set R^2 as a rounded percentage (scored on the rows used for fitting).
rf_accuracy = round(rf.score(X_train,y_train) * 100)
rf_accuracy

In [None]:
# XGBoost Regressor: 1000 boosting rounds with a learning rate of 0.05.
from xgboost import XGBRegressor

xgb_params = {'n_estimators': 1000, 'learning_rate': 0.05}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

In [None]:
# Predict sales for the test features with the fitted XGBoost model and display them.
pred = model.predict(X_test)
pred

In [None]:
# Training-set score of the XGBoost model as a percentage, measured on the same
# rows the model was fitted on.
# NOTE(review): presumably R^2, as for the scikit-learn regressors above — confirm.
model.score(X_train,y_train)*100