In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings emitted by the
# libraries used in later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the training set from the current working directory into `data`.
data = pd.read_csv('train.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Missing-value count per column.
data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

# Filling Missing Values

In [None]:
# Numerical column (Item_Weight)   -> fill missing values with the mean
# Categorical column (Outlet_Size) -> fill missing values with the mode

In [None]:
# Impute missing Item_Weight values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection: the in-place form operates on an
# intermediate object, is flagged by pandas as chained assignment, and leaves
# `data` unchanged when Copy-on-Write is active.
data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].mean())

In [None]:
# Re-check the per-column missing-value counts after the Item_Weight fill.
data.isnull().sum()

In [None]:
# Distinct values present in Outlet_Size.
data['Outlet_Size'].unique()

In [None]:
# Most frequent Outlet_Size over the whole dataset.
data['Outlet_Size'].mode()

In [None]:
# Distinct values present in Outlet_Type.
data['Outlet_Type'].unique()

In [None]:
def _most_common(values):
    """Return the most frequent entry of `values` (first of the modes on ties)."""
    return values.mode()[0]


# One-row table: for each Outlet_Type (columns), the modal Outlet_Size.
mode_of_outlet_size = data.pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=_most_common,
)

In [None]:
# Display the modal Outlet_Size found for each Outlet_Type.
mode_of_outlet_size

In [None]:
# Boolean mask: True where Outlet_Size has no value.
missing_value = data['Outlet_Size'].isna()

In [None]:
# How many rows are missing Outlet_Size (True) versus populated (False).
missing_value.value_counts()

In [None]:
# Boolean mask of rows whose Outlet_Size is missing (recomputed here so the
# cell can be re-run on its own).
missing_value = data['Outlet_Size'].isnull()

# `mode_of_outlet_size` is a one-row table with one column per Outlet_Type.
# Take that row as a Series (index = outlet type, value = modal size) so every
# outlet type maps to a scalar. Indexing the table by column inside apply()
# returned a Series per row, which made the right-hand side a DataFrame and
# relied on implicit alignment during assignment.
size_by_outlet_type = mode_of_outlet_size.iloc[0]
data.loc[missing_value, 'Outlet_Size'] = (
    data.loc[missing_value, 'Outlet_Type'].map(size_by_outlet_type)
)

In [None]:
# Re-check the per-column missing-value counts after the Outlet_Size fill.
data.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
data.describe()

# Exploratory Data Analysis

In [None]:
# Item_Weight Distribution
sns.set()
plt.figure(figsize=(6,6))
# histplot(kde=True, stat='density') replaces the deprecated distplot():
# a density-scaled histogram with a KDE curve drawn on top.
sns.histplot(data=data, x='Item_Weight', kde=True, stat='density')
plt.show()

In [None]:
# Item_Visibility Distribution
sns.set()
plt.figure(figsize=(6,6))
# histplot(kde=True, stat='density') replaces the deprecated distplot():
# a density-scaled histogram with a KDE curve drawn on top.
sns.histplot(data=data, x='Item_Visibility', kde=True, stat='density')
plt.show()

In [None]:
# Item_MRP Distribution
sns.set()
plt.figure(figsize=(6,6))
# histplot(kde=True, stat='density') replaces the deprecated distplot():
# a density-scaled histogram with a KDE curve drawn on top.
sns.histplot(data=data, x='Item_MRP', kde=True, stat='density')
plt.show()

In [None]:
# Outlet_Establishment_Year Count
sns.set()
plt.figure(figsize=(6,6))
# Name the column through x=: newer seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Outlet_Establishment_Year', data=data)
plt.show()

In [None]:
# Item_Outlet_Sales Distribution
sns.set()
plt.figure(figsize=(6,6))
# histplot(kde=True, stat='density') replaces the deprecated distplot():
# a density-scaled histogram with a KDE curve drawn on top.
sns.histplot(data=data, x='Item_Outlet_Sales', kde=True, stat='density')
plt.show()

In [None]:
# List all column names.
data.columns

In [None]:
# Names of the object-dtype columns (the candidates for count plots below).
data.select_dtypes(include='object').columns

In [None]:
# Item_Fat_Content Count
sns.set()
plt.figure(figsize=(6,6))
# Name the column through x=: newer seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Item_Fat_Content', data=data)
plt.show()

In [None]:
# Item_Type Count
sns.set()
plt.figure(figsize=(30,6))
# Name the column through x=: newer seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Item_Type', data=data)
plt.show()

In [None]:
# Outlet_Identifier Count
sns.set()
plt.figure(figsize=(30,6))
# Name the column through x=: newer seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Outlet_Identifier', data=data)
plt.show()

In [None]:
# Outlet_Size Count
sns.set()
plt.figure(figsize=(6,6))
# Name the column through x=: newer seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Outlet_Size', data=data)
plt.show()

In [None]:
# Outlet_Location_Type Count
sns.set()
plt.figure(figsize=(6,6))
# Name the column through x=: newer seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Outlet_Location_Type', data=data)
plt.show()

In [None]:
# Outlet_Type Count
sns.set()
plt.figure(figsize=(10,6))
# Name the column through x=: newer seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Outlet_Type', data=data)
plt.show()

In [None]:
# Collapse the alternative spellings of the fat-content labels onto the two
# canonical values.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Item_Fat_Content Count
sns.set()
plt.figure(figsize=(6,6))
# Name the column through x=: newer seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Item_Fat_Content', data=data)
plt.show()

In [None]:
# Object-dtype columns still present after the Item_Fat_Content cleanup.
data.select_dtypes(include='object').columns

# Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [None]:
# Label encoder instance; the encoding cell that follows uses it to turn
# string categories into integer codes.
encoder = LabelEncoder()

In [None]:
# Label-encode every categorical column.
# A separate LabelEncoder is kept per column in `label_encoders`: refitting a
# single shared instance discards the previous column's classes, so nothing
# could later be decoded, and new raw inputs could not be encoded consistently.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Keep `encoder` bound to an encoder fitted on the last column (Outlet_Type),
# which is the state the shared instance ended up in before.
encoder = label_encoders['Outlet_Type']

In [None]:
# Preview the frame after label encoding.
data.head()

# Splitting data into train & test parts

In [None]:
# Separate the regression target from the feature columns.
target_column = 'Item_Outlet_Sales'
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Length of the target vector.
y.shape

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a gradient-boosted regressor with the library's default
# hyperparameters, then predict sales for the held-out rows.
model = XGBRegressor().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# R² of the predictions on the held-out test split.
metrics.r2_score(y_test, y_pred)

In [None]:
# Column dtypes and non-null counts of the encoded frame.
data.info()

In [None]:
# Draw 4 random rows (no seed is passed, so the rows differ between runs).
data.sample(4)

In [None]:
# One hand-encoded observation.
# NOTE(review): the 11 values are assumed to follow the column order of X
# (already label-encoded) — confirm against X.columns.
input1 = (743,18.600,0,0.039356,5,246.3802,9,1999,1,0,1)

# Wrap the row in a one-row DataFrame carrying the training column names
# instead of a bare ndarray: the model was fitted on a DataFrame, so this keeps
# feature names attached and raises immediately if the number of values does
# not match the number of features.
new_input = pd.DataFrame([input1], columns=X.columns, dtype=float)
prediction = model.predict(new_input)
print(prediction)

In [None]:
import pickle

# Persist the trained model next to the notebook. The with-block guarantees the
# file handle is flushed and closed even if pickling raises, instead of leaving
# an anonymous open() handle for the garbage collector.
# NOTE: only unpickle this file from a trusted source — pickle.load executes
# arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)