In [None]:
import os
print(os.getcwd())
# NOTE(review): machine-specific absolute path. The relative read_csv calls below
# resolve against it, so this cell fails wherever this directory does not exist.
os.chdir('C:/Users/puran/Desktop/DS material/EDA')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set()

# Load the train and test CSV files from the current working directory.
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")

# 1.Variable Identification 

In [None]:
# Preview the first 10 rows of the training frame.
train.head(10)

In [None]:
# List the column labels of the training frame.
train.columns

In [None]:
# Print dtypes and non-null counts per column (reveals which columns have missing values).
train.info()

# 2.Uni-Variate Analysis

##### Dependent Variable (Continuous)

In [None]:
# Summary statistics of the target column.
train.Item_Outlet_Sales.describe()

In [None]:
# Histogram of the target on an explicit Axes: 100 bins, bars touching (rwidth=1).
fig, ax = plt.subplots()
ax.hist(train.Item_Outlet_Sales, bins=100, rwidth=1)
ax.set_xlabel("Item_Outlet_Sales")
ax.set_ylabel("Count")
plt.show()

As we can see, Item_Outlet_Sales is right-skewed and will need a transformation to reduce its skewness.

##### Independent Variables (numeric variables)

In [None]:
# Summary statistics of the three numeric predictors; a lower `count` indicates missing values.
train[["Item_Weight","Item_Visibility","Item_MRP"]].describe()

We can see from above that Item_Weight has a lot of missing values. We need to keep this in mind while visualizing this variable.

In [None]:
# Histograms of the three numeric predictors in a 2x2 grid (the fourth panel stays empty).
plt.subplot(221)
plt.hist(train.Item_Weight.dropna(),rwidth=1,bins=150) # missing weights are dropped first; rwidth=1 makes adjacent bars touch
plt.xlabel("Item_Weight")
plt.ylabel("count")
plt.subplot(222)
plt.hist(train.Item_Visibility,bins=50) # no rwidth passed here, so matplotlib's default bar width is used
plt.xlabel("Item_Visibility")
plt.ylabel("count")
plt.subplot(223)
plt.hist(train.Item_MRP,bins=100)
plt.xlabel("Item_MRP")
plt.ylabel("count")
plt.tight_layout()          # keeps the subplots from overlapping; comment it out and re-run to see the difference
plt.show()

__Observations__

- There seems to be no clear-cut pattern in Item_Weight.<br/>
- Item_Visibility is right-skewed and should be transformed to curb its skewness.<br/>
- We can clearly see 4 different distributions for Item_MRP. It is an interesting insight.

__Independent Variables (categorical variables)__ 

In [None]:
# Frequency of each raw Item_Fat_Content label, as a bar chart.
fat_content_counts = train["Item_Fat_Content"].value_counts()
fat_content_counts.plot.bar()
plt.show()

In the figure above, ‘LF’, ‘low fat’, and ‘Low Fat’ are the same category and can be combined into one.<br/> Similarly, ‘reg’ and ‘Regular’ can be combined into one. After making these corrections we’ll plot the same figure again.

In [None]:
# Collapse the alternative spellings into the two canonical labels, then re-plot.
low_fat_aliases = ["LF", "low fat"]
train.loc[train["Item_Fat_Content"].isin(low_fat_aliases), "Item_Fat_Content"] = "Low Fat"
train.loc[train["Item_Fat_Content"].eq("reg"), "Item_Fat_Content"] = "Regular"

fat_content_counts = train["Item_Fat_Content"].value_counts()
fat_content_counts.plot.bar()
plt.show()

In [None]:
# Item_Type: number of rows per category.
item_type_counts = train["Item_Type"].value_counts()
item_type_counts.plot.bar()
plt.show()

In [None]:
# Outlet_Identifier: number of rows per outlet.
outlet_id_counts = train["Outlet_Identifier"].value_counts()
outlet_id_counts.plot.bar()
plt.show()

In [None]:
# Outlet_Size frequencies; dropna=False keeps missing values as their own bar.
outlet_size_counts = train["Outlet_Size"].value_counts(dropna=False)
outlet_size_counts.plot.bar()
plt.show()

In [None]:
# Outlet_Establishment_Year: number of rows per year.
year_counts = train["Outlet_Establishment_Year"].value_counts()
year_counts.plot.bar()
plt.show()

In [None]:
# Outlet_Type: number of rows per category.
train["Outlet_Type"].value_counts().plot(kind='bar')
# Explicit show(), as in every other plot cell, so the cell does not echo the Axes repr.
plt.show()

# 3.Bivariate Analysis

In [None]:
# Item_Weight against the target, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(train.Item_Weight, train.Item_Outlet_Sales)
ax.set_xlabel("Item_Weight")
ax.set_ylabel("Item_Outlet_Sales")
plt.show()

In [None]:
# Item_Visibility against the target, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(train.Item_Visibility, train.Item_Outlet_Sales)
ax.set_xlabel("Item_Visibility")
ax.set_ylabel("Item_Outlet_Sales")
plt.show()

In [None]:
# Item_MRP against the target, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(train.Item_MRP, train.Item_Outlet_Sales)
ax.set_xlabel("Item_MRP")
ax.set_ylabel("Item_Outlet_Sales")
plt.show()

__Observations__

- Item_Outlet_Sales is spread well across the entire range of the Item_Weight without any obvious pattern.
- In Item_Visibility vs Item_Outlet_Sales, there is a string of points at Item_Visibility = 0.0 which seems strange as item visibility cannot be completely zero. We will take note of this issue and deal with it in the later stages.
- In the third plot of Item_MRP vs Item_Outlet_Sales, we can clearly see 4 segments of prices that can be used in feature engineering to create a new variable.

In [None]:
### Target Variable vs Independent Categorical Variables ###

In [None]:
# Horizontal boxplots of the target, one per Item_Type.
sns.boxplot(data=train, x="Item_Outlet_Sales", y="Item_Type")
plt.show()

In [None]:
# Vertical boxplots of the target, one per Item_Fat_Content label.
sns.boxplot(data=train, x="Item_Fat_Content", y="Item_Outlet_Sales")
plt.show()

In [None]:
# Horizontal boxplots of the target, one per outlet.
sns.boxplot(data=train, x="Item_Outlet_Sales", y="Outlet_Identifier")
plt.show()

__Observations__

- Distribution of Item_Outlet_Sales across the categories of Item_Type is not very distinct and same is the case with Item_Fat_Content.
- The distributions for the OUT010 and OUT019 categories of Outlet_Identifier are quite similar to each other and very different from the rest of the categories of Outlet_Identifier.

In [None]:
# Vertical boxplots of the target, one per Outlet_Size.
sns.boxplot(data=train, x="Outlet_Size", y="Item_Outlet_Sales")
plt.show()

In [None]:
# Vertical boxplots of the target, one per Outlet_Location_Type.
sns.boxplot(data=train, x="Outlet_Location_Type", y="Item_Outlet_Sales")
plt.show()

In [None]:
# Horizontal boxplots of the target, one per Outlet_Type.
sns.boxplot(data=train, x="Item_Outlet_Sales", y="Outlet_Type")
plt.show()

In [None]:
__Observations__

- Tier 1 and Tier 3 locations of Outlet_Location_Type look similar.
- In the Outlet_Type plot, Grocery Store has most of its data points around the lower sales values as compared to the other categories.

# 4. Missing Value Treatment 

In [None]:
# Two ways to see how many Item_Weight values are missing:
# 1) count the NaN entries directly
print(train["Item_Weight"].isna().sum())
# 2) read the non-null count from info(). info() prints its report itself and
#    returns None, so wrapping it in print() would emit a stray "None".
train.info()

In [None]:
# Impute each missing Item_Weight with the mean weight of the same Item_Identifier.
# A single groupby-transform replaces the row-by-row loop, which recomputed a
# filtered mean for every missing row.
ind = train["Item_Weight"].isna()  # mask of rows that were missing before imputation
item_mean_weight = train.groupby("Item_Identifier")["Item_Weight"].transform("mean")
# Items with no known weight at all have a NaN group mean and therefore stay NaN.
train["Item_Weight"] = train["Item_Weight"].fillna(item_mean_weight)

In [None]:
# Display the Item_Identifier column (the grouping key used for the weight imputation).
train.Item_Identifier

In [None]:
# Missing Item_Weight values that remain after the imputation step.
remaining_missing = train.Item_Weight.isna().sum()
print(remaining_missing)

In [None]:
# Outlet_Size frequencies before imputation; missing values get their own bar.
outlet_size_counts = train["Outlet_Size"].value_counts(dropna=False)
outlet_size_counts.plot.bar()
plt.show()

Looking at the distribution, we conclude that the missing values are distributed in the same way as the "Small" category. Therefore we impute the missing values with "Small", using the pandas `fillna` method.

In [None]:
# Fill missing Outlet_Size with "Small" and re-plot to confirm no missing bar remains.
train["Outlet_Size"] = train["Outlet_Size"].fillna("Small")

outlet_size_counts = train["Outlet_Size"].value_counts(dropna=False)
outlet_size_counts.plot.bar()
plt.show()

# 5.Outlier Treatment

In [None]:
# Treat Item_Visibility == 0 as invalid and replace each zero with the mean
# visibility of the same Item_Identifier.
# The per-item mean is computed before any replacement, so the zero rows
# themselves are part of it; an item whose rows are all zero stays at zero.
ind = train["Item_Visibility"].eq(0)
avg_Item_Visibility = train.groupby('Item_Identifier').Item_Visibility.mean()
print("Number of zero records: "+str(ind.sum()))
# Vectorised lookup of each zero row's item mean instead of a row-by-row loop.
train.loc[ind, "Item_Visibility"] = train.loc[ind, "Item_Identifier"].map(avg_Item_Visibility)
ind = train["Item_Visibility"].eq(0)
print("Number of zero records after imputing: "+str(ind.sum()))

In [None]:
# Show the per-Item_Identifier mean visibility that was used for the imputation.
print(avg_Item_Visibility)

# 6&7.Variable Transformation/Feature Engineering

In [None]:
# Item_Type values that are labelled "perishable" when Item_Type_New is built.
perishable = ["Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood"]

In [None]:
# Item_Type values that are labelled "non_perishable" when Item_Type_New is built.
non_perishable = ["Baking Goods", "Canned", "Frozen Foods", "Hard Drinks", "Health and Hygiene", "Household", "Soft Drinks"]

In [None]:
# Placeholder column for the engineered feature. It is created with object dtype
# because it is filled with string labels; a float NaN column would make those
# string assignments a dtype-incompatible setitem.
train['Item_Type_New'] = pd.Series(np.nan, index=train.index, dtype=object)

In [None]:
# Create the feature 'Item_Type_New' from Item_Type:
#   listed in `perishable`      -> "perishable"
#   listed in `non_perishable`  -> "non_perishable"
#   anything else               -> "not_sure"
# np.select evaluates the conditions in order for the whole column at once,
# replacing the row-by-row loop.
item_type = train["Item_Type"]
train["Item_Type_New"] = np.select(
    [item_type.isin(perishable), item_type.isin(non_perishable)],
    ["perishable", "non_perishable"],
    default="not_sure",
)

In [None]:
from pandas import Series, DataFrame

import matplotlib.pyplot as plt

%matplotlib inline

# importing linear regression

from sklearn.linear_model import LinearRegression

lreg = LinearRegression()

# for cross validation

from sklearn.model_selection import train_test_split
    
X = train.drop('Item_Outlet_Sales',1)


x_train, x_cv, y_train, y_cv = train_test_split(X,train.Item_Outlet_Sales, test_size =0.3)

# training a linear regression model on train

lreg.fit(x_train,y_train)

# predicting on cv

pred_cv = lreg.predict(x_cv)

# calculating mse

mse = np.mean((pred_cv - y_cv)**2)

# evaluation using r-square

print(lreg.score(x_cv,y_cv))
print(mse)

In [None]:
# Same two metrics on the training split, for comparison with the validation scores.
pred_t = lreg.predict(x_train)
squared_error = (pred_t - y_train) ** 2
mse = squared_error.mean()
print(lreg.score(x_train, y_train))
print(mse)

In [None]:
# Bar chart of the fitted linear-regression coefficients, sorted ascending.
sns.reset_orig()  # restore matplotlib's original style for this plot
predictors = x_train.columns
coef = Series(lreg.coef_, predictors).sort_values()

# Title typo fixed: "Modal" -> "Model".
coef.plot(kind='bar', title='Model Coefficients')
plt.show()

In [None]:
from sklearn.linear_model import Lasso

# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions this constructor call raises and the scaling
# has to be done explicitly before fitting.
lassoReg = Lasso(alpha=0.05, normalize=True)

lassoReg.fit(x_train, y_train)

pred = lassoReg.predict(x_cv)

# calculating mse from the Lasso predictions. The original used `pred_cv`, the
# linear-regression predictions from an earlier cell, so it reported the wrong
# model's error.
mse = np.mean((pred - y_cv)**2)
print(mse)
print(lassoReg.score(x_cv, y_cv))

In [None]:
# Bar chart of the fitted Lasso coefficients, sorted ascending.
sns.reset_orig()  # restore matplotlib's original style for this plot
predictors = x_train.columns

coef = Series(lassoReg.coef_, predictors).sort_values()

# Title typo fixed: "Modal" -> "Model".
coef.plot(kind='bar', title='Model Coefficients')
plt.show()

In [None]:
# NOTE(review): `train3` is never assigned in the visible cells, so this raises
# NameError on a fresh kernel (Restart & Run All). Presumably it was a saved copy
# of `train` from a deleted cell; confirm and restore its definition.
train=train3

In [None]:
#######Dummy Variable Creation######
# Keep only the rows whose Item_Weight is known. `~` is the boolean negation of
# the mask; unary minus on a boolean array is not a supported operation.
ind = np.isnan(train["Item_Weight"])
train = train[~ind]

# Every object-typed (string) column is one-hot encoded.
mylist = list(train.select_dtypes(include=['object']).columns)

dummies = pd.get_dummies(train[mylist], prefix=mylist)

# Reassign instead of dropping in place: `train` is a filtered slice here, and an
# in-place drop on a slice triggers pandas' copy warnings.
train = train.drop(columns=mylist)

# Numeric columns (including the target) side by side with the dummy columns.
X = pd.concat([train, dummies], axis=1)

In [None]:
# Preview the first rows of the encoded frame.
X.head()

In [None]:
from pandas import Series, DataFrame

import matplotlib.pyplot as plt

%matplotlib inline

# importing linear regression

from sklearn.linear_model import LinearRegression

lreg = LinearRegression()

# for cross validation

from sklearn.model_selection import train_test_split

X = X.drop('Item_Outlet_Sales',1)


x_train, x_cv, y_train, y_cv = train_test_split(X,train.Item_Outlet_Sales, test_size =0.3)

# training a linear regression model on train

lreg.fit(x_train,y_train)

# predicting on cv

pred_cv = lreg.predict(x_cv)

# calculating mse

mse = np.mean((pred_cv - y_cv)**2)

# evaluation using r-square

print(lreg.score(x_cv,y_cv))
print(mse)


In [None]:
# Bar chart of the coefficients of the model fitted on the dummy-encoded data,
# sorted ascending.
sns.reset_orig()  # restore matplotlib's original style for this plot
predictors = x_train.columns

coef = Series(lreg.coef_, predictors).sort_values()

# Title typo fixed: "Modal" -> "Model".
coef.plot(kind='bar', title='Model Coefficients')
plt.show()

In [None]:
# Print dtypes and non-null counts of the predictor frame.
X.info()

In [None]:
#Choosing the best features
import statsmodels.api as sm

# Cast the predictors to float so that dummy columns (boolean or small-integer
# typed, depending on the pandas version) and the float columns form one
# homogeneous numeric matrix; a mixed-dtype frame converts to an object array.
X2 = sm.add_constant(x_train.astype(float))
est = sm.OLS(y_train, X2)
est2 = est.fit()
# The summary table lists a coefficient, standard error and p-value per predictor.
print(est2.summary())