In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score,KFold

In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Preview the first rows of the training table.
train.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [7]:
# Mean item visibility per outlet type. The column is selected *before*
# aggregating so that only Item_Visibility is averaged; averaging the whole
# frame first would also try to average the string columns.
item_type_visibility = train.groupby("Outlet_Type")["Item_Visibility"].mean()

# Number of rows whose visibility is exactly zero, then all remaining rows.
print(len(train[train.Item_Visibility==0]))
print(len(train[train.Item_Visibility!=0]))
item_type_visibility

526
7997


Outlet_Type
Grocery Store        0.104862
Supermarket Type1    0.060723
Supermarket Type2    0.061014
Supermarket Type3    0.058615
Name: Item_Visibility, dtype: float64

In [None]:
# Replace visibilities of exactly 0 with the column mean. The mean is taken
# over all rows, including the zero entries being replaced.
# The write goes through train.loc[...] so it is applied to `train` itself
# rather than to an intermediate Series (chained assignment), which pandas
# warns about and which has no effect on `train` under copy-on-write.
zero_visibility = train["Item_Visibility"] == 0
train.loc[zero_visibility, "Item_Visibility"] = train["Item_Visibility"].mean()

# Distribution of the visibility column after the replacement.
item_visibility = train["Item_Visibility"]
sns.distplot(item_visibility)

In [None]:
# Create a separate dataframe of item identifier and item weight.
item = train[["Item_Identifier","Item_Weight"]]

# Both figures are printed so that neither is silently discarded (only the
# last expression of a cell is displayed automatically):
#   1. distinct identifiers having at least one row with a missing weight
#   2. total number of rows
print(item.loc[item.Item_Weight.isnull(), "Item_Identifier"].nunique())
print(len(item))

In [None]:
# Separate dataframe of unique items and their weights.
# Rows without a weight are dropped, then exactly one row per Item_Identifier
# is kept. De-duplicating on the identifier alone (rather than on the
# identifier/weight pair) guarantees the lookup has unique keys, so a later
# left-merge on Item_Identifier cannot multiply rows.
unique_weights = (
    item.sort_values("Item_Identifier")
        .dropna()
        .drop_duplicates(subset="Item_Identifier")
)
unique_weights.head()

In [None]:
# Left-join every row of `item` to the weight lookup on Item_Identifier.
# Both inputs carry an Item_Weight column, so the result holds
# Item_Weight_x (from `item`) and Item_Weight_y (from `unique_weights`).
item_non_missing = item.merge(unique_weights, on="Item_Identifier", how="left")

In [None]:
# Per-column null counts on an identifier-sorted copy. This is not the last
# expression of the cell, so the result is not displayed.
item_non_missing.sort_values(by="Item_Identifier").isna().sum()

# Keep only the identifier and the looked-up weight.
item_non_missing = item_non_missing.loc[:, ["Item_Identifier", "Item_Weight_y"]]
item_non_missing.dtypes

In [None]:
# Preview of the training table without its identifier column. Only the first
# rows are shown instead of the entire frame; `train` itself is not modified.
train.drop("Item_Identifier",axis=1).head()

In [None]:
# Place the looked-up weights next to the remaining training columns
# (column-wise concat, aligned on the row index), then discard the original
# weight column and the outlet identifier.
training_Set = pd.concat(
    [item_non_missing, train.drop(columns="Item_Identifier")],
    axis="columns",
).drop(columns=["Item_Weight", "Outlet_Identifier"])
training_Set.head(n=5)

In [None]:
# Normalise the spelling variants of the item fat-content labels.
def item_fat(x):
    """Return the canonical fat-content label for ``x``.

    ``"low fat"`` and ``"LF"`` map to ``"Low Fat"`` and ``"reg"`` maps to
    ``"Regular"``. Every other value is returned unchanged.
    """
    if x in ("low fat", "LF"):
        return "Low Fat"
    if x == "reg":
        return "Regular"
    return x
    
# Rewrite every fat-content label through item_fat, element by element.
training_Set["Item_Fat_Content"] = training_Set["Item_Fat_Content"].map(item_fat)

In [None]:
# Missing outlet sizes get the explicit label "Unknown".
training_Set["Outlet_Size"] = training_Set["Outlet_Size"].fillna(value="Unknown")

In [None]:
# Rows whose weight is still missing after the identifier lookup.
training_Set.loc[training_Set["Item_Weight_y"].isna()]

In [None]:
# Fill the weights that are still missing with the mean weight of the row's
# Item_Type. This replaces four writes of hard-coded constants at hard-coded
# row positions, made through chained indexing (`df[col].iloc[i] = v`), which
# pandas warns about and which does not update the frame under copy-on-write.
# NOTE(review): the old constants were labelled by item type and have more
# decimals than a median of the weights would, so they look like per-type
# means; confirm that the mean is the intended statistic.
item_type = training_Set.groupby('Item_Type')
type_mean_weight = item_type["Item_Weight_y"].transform("mean")
training_Set["Item_Weight_y"] = training_Set["Item_Weight_y"].fillna(type_mean_weight)

# Number of weights still missing after the fill.
training_Set["Item_Weight_y"].isnull().sum()


In [None]:
# Keep the weights to two decimal places.
training_Set["Item_Weight_y"] = training_Set["Item_Weight_y"].round(decimals=2)
training_Set.head(n=5)

In [None]:
# Target vector: the sales column.
y = training_Set["Item_Outlet_Sales"]

# Feature matrix: every remaining column except the identifier, with the
# columns listed below one-hot encoded (first level of each one dropped).
categorical_columns = [
    "Item_Fat_Content",
    "Item_Type",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
    "Outlet_Establishment_Year",
]
X = pd.get_dummies(
    training_Set.drop(columns=["Item_Identifier", "Item_Outlet_Sales"]),
    columns=categorical_columns,
    drop_first=True,
)

In [None]:
# Split the feature matrix into the three numeric columns (standardised
# below) and the one-hot columns (left as they are).
numeric_columns = ["Item_Weight_y", "Item_Visibility", "Item_MRP"]
num_var = X[numeric_columns]
cat_var = X.drop(columns=numeric_columns)

# Standardise the numeric columns. The scaled frame is rebuilt with the
# original row index so that the column-wise concat below lines rows up by
# label even if X does not carry a default 0..n-1 index.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# every CV fold sees statistics computed from its own validation rows.
scaler_train = StandardScaler()
num_scaled = pd.DataFrame(
    scaler_train.fit_transform(num_var),
    columns=num_var.columns,
    index=num_var.index,
)
X = pd.concat([num_scaled, cat_var], axis=1)

In [None]:
# First rows of the assembled feature matrix.
X.head(n=5)

In [None]:
# Shared 10-fold splitter for the model comparisons below. The seed is fixed
# so the folds, and therefore the reported scores, are the same on every run.
shuffle = KFold(n_splits=10, shuffle=True, random_state=42)

# Regression Models


### Linear Regression

In [None]:
# Linear regression with default settings, cross-validated on the shared folds.
lm_model = LinearRegression()
lm_scores = cross_val_score(lm_model, X, y, scoring="neg_mean_squared_error", cv=shuffle)
# The scores are negated MSEs: average them, drop the sign, take the root.
print(np.sqrt(np.abs(lm_scores.mean())))

### Lasso Regression

In [None]:
# Lasso regression with default settings. It is scored on the same shared
# `shuffle` splitter as every other model in this section (previously it used
# cv=10), so its score is directly comparable with theirs.
lasso_model = Lasso()
lasso_scores = cross_val_score(estimator=lasso_model,X= X,y=y,cv=shuffle,scoring='neg_mean_squared_error')
# The scores are negated MSEs: average them, drop the sign, take the root.
print(np.sqrt(np.absolute(lasso_scores.mean())))

### Ridge Regression 

In [None]:
# Ridge regression with default settings, cross-validated on the shared folds.
ridge_model = Ridge()
ridge_scores = cross_val_score(ridge_model, X, y, scoring="neg_mean_squared_error", cv=shuffle)
# The scores are negated MSEs: average them, drop the sign, take the root.
print(np.sqrt(np.abs(ridge_scores.mean())))

### SVR 

In [None]:
# Support vector regression with a polynomial kernel, cross-validated on the
# shared folds.
svr_model = SVR(kernel="poly")
svr_scores = cross_val_score(svr_model, X, y, scoring="neg_mean_squared_error", cv=shuffle)
# The scores are negated MSEs: average them, drop the sign, take the root.
print(np.sqrt(np.abs(svr_scores.mean())))

### Random Forest

In [None]:
# Random forest of 500 trees, cross-validated on the shared folds.
# random_state fixes the forest's randomness so the score (and the later
# refit on the full data) is the same on every run. n_jobs=-1 builds the
# trees on all available cores and does not change the fitted model.
rf_model = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
rf_scores = cross_val_score(estimator=rf_model,X= X,y=y,cv=shuffle,scoring='neg_mean_squared_error')
# The scores are negated MSEs: average them, drop the sign, take the root.
print(np.sqrt(np.absolute(rf_scores.mean())))


### Elastic Net

In [None]:
# Elastic net with default settings, cross-validated on the shared folds.
en_model = ElasticNet()
en_scores = cross_val_score(en_model, X, y, scoring="neg_mean_squared_error", cv=shuffle)
# The scores are negated MSEs: average them, drop the sign, take the root.
print(np.sqrt(np.abs(en_scores.mean())))

### XGBoost 

In [None]:
# Gradient-boosted trees with default settings, cross-validated on the shared
# folds. `xgb` comes from the notebook's import cell, so it is not imported
# again here.
xgb_model = xgb.XGBRegressor()
xgb_scores = cross_val_score(estimator=xgb_model,X= X,y=y,cv=shuffle,scoring='neg_mean_squared_error')
# The scores are negated MSEs: average them, drop the sign, take the root.
print(np.sqrt(np.absolute(xgb_scores.mean())))

# Test set

In [None]:
# Look at the first test rows (not displayed: only the last expression of a
# cell is shown), then show the number of test rows.
test.head(n=5)
len(test.index)

In [None]:
# Missing values per column of the test table.
test.isna().sum()

In [None]:
# Normalise the fat-content labels with the same helper used on the training
# data.
test["Item_Fat_Content"] = test["Item_Fat_Content"].map(item_fat)

# Missing outlet sizes get the explicit label "Unknown".
test["Outlet_Size"] = test["Outlet_Size"].fillna(value="Unknown")

In [None]:
# Clean item weights: start from the identifier / weight pairs of the test set.
item_test = test[["Item_Identifier","Item_Weight"]]

# Both figures are printed so that neither is silently discarded (only the
# last expression of a cell is displayed automatically):
#   1. distinct identifiers having at least one row with a missing weight
#   2. total number of rows
print(item_test.loc[item_test.Item_Weight.isnull(), "Item_Identifier"].nunique())
print(len(item_test))

In [None]:
# Weight lookup for the test set: one known weight per Item_Identifier.
# The training pairs (`item`) come first and the test set's own pairs
# (`item_test`) are appended, so an identifier weighed only in the test data
# is covered too. Previously only `item` was consulted. When both sources
# know an identifier, drop_duplicates keeps the first occurrence, i.e. the
# training value. De-duplicating on the identifier alone keeps the lookup
# keys unique, so a left-merge on Item_Identifier cannot multiply test rows.
unique_weights_test = (
    pd.concat([item, item_test], ignore_index=True)
      .dropna()
      .drop_duplicates(subset="Item_Identifier")
      .sort_values("Item_Identifier")
)
# Size of the lookup that was just built.
len(unique_weights_test)

In [None]:
# Left-join each test row to the weight lookup and drop the row's own weight
# column (Item_Weight_x), leaving the identifier and the looked-up
# Item_Weight_y.
item_test_non_missing = (
    item_test.merge(unique_weights_test, on="Item_Identifier", how="left")
             .drop(columns="Item_Weight_x")
)
# Values still missing per column after the lookup.
item_test_non_missing.isna().sum()

In [None]:
# Place the looked-up weights next to the remaining test columns (column-wise
# concat, aligned on the row index), then discard the original weight column
# and the outlet identifier.
test_Set = pd.concat(objs=[item_test_non_missing,test.drop(labels="Item_Identifier",axis=1)],axis=1)
test_Set = test_Set.drop(labels=["Item_Weight","Outlet_Identifier"],axis=1)

# Per-item-type means of the numeric columns. numeric_only=True keeps the
# string columns out of the aggregation, which would otherwise raise on
# recent pandas versions.
item_test_type =test_Set.groupby('Item_Type')
item_test_type.mean(numeric_only=True)

In [None]:
# Fallback weights for the four items whose weight could not be looked up.
# The fill is one column-level assignment, not chained indexing
# (`df.col[mask] = v`), which pandas warns about and which does not update
# the frame under copy-on-write. Only missing weights are filled, so a weight
# that was found by the lookup is never overwritten by a fallback.
fallback_weights = {
    "FDE52": 13.03,
    "FDN52": 12.30,
    "FDQ60": 12.38,
    "FDK57": 12.69,
}
test_Set["Item_Weight_y"] = test_Set["Item_Weight_y"].fillna(
    test_Set["Item_Identifier"].map(fallback_weights)
)

In [None]:
# Missing values per column of the assembled test frame.
test_Set.isna().sum()

In [None]:
# Split the test frame into the numeric columns (to be scaled) and the
# categorical columns (to be one-hot encoded). Item_Identifier is left out of
# the categorical part: it was dropped from the training features too, and
# encoding it would add one dummy column per item.
num_cols_test = ["Item_Weight_y", "Item_Visibility", "Item_MRP"]
test_Set_num = test_Set[num_cols_test]
test_Set_cat = test_Set.drop(columns=num_cols_test + ["Item_Identifier"])

# Reuse the scaler fitted on the training data so that train and test
# features are on the same scale. Fitting a second scaler on the test set
# would standardise it with different means and variances. The name
# `scaler_test` is kept because a later cell calls scaler_test.transform.
scaler_test = scaler_train

# One-hot encode, then align with the training dummy columns (`cat_var`):
# same columns, same order. Columns absent from the test set are added as
# all-zero; columns the models were never trained on are dropped.
test_Set_cat = pd.get_dummies(data=test_Set_cat,columns=test_Set_cat.columns,drop_first=True)
test_Set_cat = test_Set_cat.reindex(columns=cat_var.columns, fill_value=0)

In [None]:
# Scale the numeric test columns. The result keeps the original row index so
# that a later column-wise concat with the categorical columns lines rows up
# by label.
# NOTE: this cell overwrites its own input, so running it twice applies the
# scaler twice.
test_Set_num = pd.DataFrame(
    scaler_test.transform(test_Set_num),
    columns=test_Set_num.columns,
    index=test_Set_num.index,
)
# Show only the first rows instead of the whole frame.
test_Set_num.head()

In [None]:
# Scaled numeric columns first, one-hot columns after, joined column-wise.
Final_test_set = pd.concat(objs=[test_Set_num, test_Set_cat], axis="columns")

In [None]:
# First rows of the final test feature matrix.
Final_test_set.head(n=5)

In [None]:
# Refit both tree ensembles on the complete training matrix.
xgb_model.fit(X=X, y=y)
rf_model.fit(X=X, y=y)

In [None]:
# Predict sales for the test set with the random forest. The XGBoost model is
# fitted as well but is not used for prediction here.
rf_pred = rf_model.predict(X=Final_test_set)

In [None]:
# Wrap the predictions in a DataFrame and write them to rfpred.csv in the
# working directory.
rf_pred = pd.DataFrame(data=rf_pred)
rf_pred.to_csv(path_or_buf="rfpred.csv")

In [None]:
# First rows of the feature matrix the predictions were made from.
Final_test_set.head(n=5)

In [None]:
# First rows of the cleaned test table.
test.head(n=5)