In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



In [None]:
# Read the training and test tables from the Kaggle input directory.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/big-mart-sales-prediction/Train.csv")
test = pd.read_csv(filepath_or_buffer="/kaggle/input/big-mart-sales-prediction/Test.csv")


In [None]:
# Keep the regression target aside and remove it from the feature frame.
# Note: this cell rebinds `train`, so it can only be run once per kernel session.
y = train['Item_Outlet_Sales']
train = train.drop(columns=["Item_Outlet_Sales"])

In [None]:
# Stack train on top of test so both receive identical preprocessing,
# then show the number of missing values per column.
data = pd.concat(objs=[train, test], sort=False)
data.isna().sum()

In [None]:
# The item identifier is removed before modelling.
data = data.drop(columns=["Item_Identifier"])

# Fill missing values: Item_Weight with the column mean,
# Outlet_Size with the most frequent value of the column.
item_weight_mean = data['Item_Weight'].mean()
data['Item_Weight'] = data['Item_Weight'].fillna(item_weight_mean)

outlet_size_mode = data['Outlet_Size'].mode()[0]
data['Outlet_Size'] = data['Outlet_Size'].fillna(outlet_size_mode)


In [None]:
# One-hot encode the remaining non-numeric columns; drop_first omits the first
# level of each encoded column.
data = pd.get_dummies(data=data, drop_first=True)

In [None]:
# Undo the earlier concatenation: the first len(train) rows are the training
# rows, everything after them belongs to the test set.
n_train_rows = train.shape[0]
train_df = data.iloc[:n_train_rows]
test_df = data.iloc[n_train_rows:]

In [None]:
# Sanity check: (rows, columns) of the encoded training frame.
train_df.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Standardize features: statistics are learned from the training frame and
# the same transformation is then applied to the test frame.
X_scaled = StandardScaler()
X_scaled.fit(train_df)
X_scaled_train = X_scaled.transform(train_df)
X_scaled_test = X_scaled.transform(test_df)

# Hold out 20% of the scaled training rows for validation (fixed seed).
x_train, x_valid, y_train, y_valid = train_test_split(
    X_scaled_train,
    y,
    test_size=0.2,
    random_state=23,
)


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor

# Hyper-parameter grid explored for the XGBoost regressor.
xgb_param_grid = {
    "learning_rate": (0.05, 0.10, 0.15),
    "max_depth": [3, 4, 5, 6, 8],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2],
    "colsample_bytree": [0.3, 0.4],
}

# Exhaustive 3-fold search scored by negative MSE, using all available cores.
gsc = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=xgb_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=0,
)
gsc.fit(x_train, y_train)

gsc.best_params_

In [None]:

# Train a fresh XGBoost regressor with the parameters chosen by the grid search.
gsc_xgbr = XGBRegressor(**gsc.best_params_).fit(x_train, y_train)
gsc_xgbr


In [None]:
# Score the refit XGBoost model on the held-out validation split.
gsc_xgbr.score(x_valid,y_valid)

In [None]:
# Find important features: print the importance score the refit XGBoost model
# assigns to each input column.
print(gsc_xgbr.feature_importances_)

In [None]:
import matplotlib.pyplot as plt

# Label each importance score with its column name and chart the ten largest.
feature_imp_xgb = pd.Series(data=gsc_xgbr.feature_importances_, index=train_df.columns)
top10_importances = feature_imp_xgb.nlargest(10)
top10_importances.plot.barh()

In [None]:
# Column labels of the ten features with the highest XGBoost importance.
top_feature_index = feature_imp_xgb.nlargest(10).index
best_feat_xgb = top_feature_index.to_list()

# Training frame restricted to those ten columns.
X_reduced = train_df[top_feature_index]

print(X_reduced.columns)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the reduced feature set, then hold out 30% of it for testing.
reduced_scaler = StandardScaler()
Xr_scaled = reduced_scaler.fit_transform(X_reduced)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr_scaled,
    y,
    random_state=101,
    test_size=0.30,
)

In [None]:

# Fit XGBoost (grid-search parameters) on the reduced features and predict
# the held-out rows.
xgbr = XGBRegressor(**gsc.best_params_).fit(Xr_train, yr_train)
xgb_pred = xgbr.predict(Xr_test)



In [None]:
# Score the reduced-feature XGBoost model on its held-out split.
xgbr.score(Xr_test, yr_test)

In [None]:

from sklearn import metrics
from sklearn.metrics import r2_score

# Report the regression metrics for the XGBoost predictions; the MSE is
# computed once and reused for the RMSE.
xgb_mse = metrics.mean_squared_error(yr_test, xgb_pred)

print("R2 score:", r2_score(yr_test, xgb_pred))
print('MAE:', metrics.mean_absolute_error(yr_test, xgb_pred))
print('MSE:', xgb_mse)
print('RMSE:', np.sqrt(xgb_mse))

In [None]:
# Ridge regression: grid search over the regularization strength and whether
# an intercept is fitted (default cross-validation and scoring).
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge_reg = Ridge()
params_Ridge = {
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0],
    "fit_intercept": [True, False],
}
Ridge_GS = GridSearchCV(estimator=ridge_reg, param_grid=params_Ridge)
Ridge_GS.fit(Xr_train, yr_train)

In [None]:
# Parameter combination selected by the Ridge grid search.
Ridge_GS.best_params_


In [None]:
# Ridge regression: train with the grid-search parameters and predict the
# held-out rows.
ridge_model = Ridge(random_state=3, **Ridge_GS.best_params_).fit(Xr_train, yr_train)
ridge_pred = ridge_model.predict(Xr_test)


In [None]:
# Score the Ridge model on the held-out split.
ridge_model.score(Xr_test, yr_test)

In [None]:

from sklearn import metrics
from sklearn.metrics import r2_score

# Report the regression metrics for the Ridge predictions; the MSE is
# computed once and reused for the RMSE.
ridge_mse = metrics.mean_squared_error(yr_test, ridge_pred)

print("R2 score:", r2_score(yr_test, ridge_pred))
print('MAE:', metrics.mean_absolute_error(yr_test, ridge_pred))
print('MSE:', ridge_mse)
print('RMSE:', np.sqrt(ridge_mse))

In [None]:

#ANN Deep learning
#Build a regularized NN (Dropout is applied below; the l2 regularizer is imported but not currently used)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dense, Dropout, Input 
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.regularizers import l2

In [None]:
def baseline_model():
    """Build and compile the fully connected regression network.

    The network stacks seven ReLU ``Dense`` layers (32, 32, 64, 64, 128, 128
    and 256 units), each followed by ``Dropout(0.2)``, and ends in a single
    linear output unit. The input width is read from the module-level
    ``Xr_train`` array.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer and mean
        squared error loss, with mean absolute error tracked as a metric.
    """
    hidden_units = (32, 32, 64, 64, 128, 128, 256)

    model_ann = Sequential()

    for position, units in enumerate(hidden_units):
        if position == 0:
            # The first layer also declares the input width.
            model_ann.add(Dense(units, activation='relu', kernel_initializer='uniform',
                                input_dim=Xr_train.shape[1]))
        else:
            model_ann.add(Dense(units=units, kernel_initializer='uniform', activation='relu'))
        model_ann.add(Dropout(0.2))

    # Output layer: one unit, no activation (continuous target).
    model_ann.add(Dense(units=1, kernel_initializer='uniform'))

    # 'accuracy' is a classification metric and is meaningless for a
    # continuous target; track mean absolute error alongside the MSE loss.
    model_ann.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    return model_ann

# Instantiate the network and print its layer-by-layer summary.
# NOTE(review): the name `Model` shadows the Keras `Model` class imported
# above; later cells rely on this name, so it is kept as-is here.
Model = baseline_model()

Model.summary()

In [None]:
# Train the ANN on the reduced training split (100 epochs, mini-batches of 20),
# predict the held-out rows and report the R2 score.
Model.fit(x=Xr_train, y=yr_train, epochs=100, batch_size=20)

y_pred = Model.predict(Xr_test)

print("R2 score:", r2_score(yr_test, y_pred))

In [None]:
# R2 of the ANN predictions against the held-out targets.
print ("R2 score:",r2_score(yr_test, y_pred))

In [None]:
import matplotlib.pyplot as plt

# yr_test is a pandas Series whose index was shuffled by train_test_split.
# Passing it to plt.plot directly would use those index labels as x values,
# while y_pred is plotted against 0..n-1, so the curves would not line up.
# Convert both to plain positional 1-D arrays before plotting.
real_values = np.asarray(yr_test)
predicted_values = np.asarray(y_pred).ravel()

plt.plot(real_values, color='red', label='Real data')
plt.plot(predicted_values, color='blue', label='Predicted data')
plt.title('Prediction')
plt.legend()
plt.show()

In [None]:
# Test and submit data: keep only the ten highest-importance columns of the
# test frame, in the same order used for the reduced training frame.
test_reduced = test_df[feature_imp_xgb.nlargest(10).index]


In [None]:

from sklearn.preprocessing import StandardScaler

# The model was trained on features standardized with statistics of the
# reduced training frame (X_reduced). Fitting a new scaler on the test set
# would standardize it with different means/variances and shift every feature
# relative to what the model learned, so fit on X_reduced and only transform
# the test features.
scaled = StandardScaler()
scaled.fit(X_reduced)

scaled_test = scaled.transform(test_reduced)

In [None]:
# Predict the test rows with the XGBoost model trained on the reduced features.
xgb_test_pred = xgbr.predict(scaled_test)


In [None]:
# Load the submission file shipped with the dataset (presumably the sample
# submission template — confirm against the dataset description).
sample=pd.read_csv("/kaggle/input/big-mart-sales-prediction/Submission.csv")


In [None]:
# Preview the first rows of the submission file.
sample.head()

In [None]:
# Remove the existing Item_Outlet_Sales column from the submission frame.
# Note: this mutates `sample`, so the cell can only be run once per load.
del sample['Item_Outlet_Sales']


In [None]:

# Put the predictions next to the submission columns (rows are matched by
# index) and drop the 'Unnamed: 0' column.
df = pd.DataFrame({'Item_Outlet_Sales': xgb_test_pred})
corr_ans = pd.concat([sample, df], axis=1).drop(columns=['Unnamed: 0'])
corr_ans

In [None]:
# Write the submission file without the row index.
corr_ans.to_csv('correct.csv', index=False)
