In [None]:
import pandas as pd
import numpy as np

# Read the training and test splits from CSV files in the current working directory.
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')


In [None]:
# Report per-column missing-value counts. A notebook cell only renders its last
# expression, so the summaries are printed explicitly instead of being left as
# bare expressions whose results would be discarded.
print('Missing values in train:')
print(train.isnull().sum())
print('Missing values in test:')
print(test.isnull().sum())

# Learn the imputation values from the training split only and apply the same
# values to the test split, so both splits are preprocessed identically.
item_weight_mean = train['Item_Weight'].mean()
outlet_size_mode = train['Outlet_Size'].mode()[0]

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form operates on an intermediate object and
# does not update the DataFrame when pandas Copy-on-Write is active.
for frame in (train, test):
    # impute missing values in 'Item_Weight' column with the training mean
    frame['Item_Weight'] = frame['Item_Weight'].fillna(item_weight_mean)
    # impute missing values in 'Outlet_Size' column with the training mode
    frame['Outlet_Size'] = frame['Outlet_Size'].fillna(outlet_size_mode)


In [None]:
# Alternate spellings found in 'Item_Fat_Content' mapped to their canonical label.
fat_content_aliases = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}

# Derive the same two features on both splits.
for frame in (train, test):
    # 'Item_Fat_Content_Simplified': fat-content label with the aliases collapsed.
    frame['Item_Fat_Content_Simplified'] = frame['Item_Fat_Content'].replace(fat_content_aliases)
    # 'Outlet_Years': 2013 minus the outlet's establishment year.
    frame['Outlet_Years'] = 2013 - frame['Outlet_Establishment_Year']


In [None]:
from sklearn.linear_model import LinearRegression

# Predictor columns, listed once so the train and test selections cannot drift apart.
# Note: 'Outlet_Years' is computed as 2013 - 'Outlet_Establishment_Year', so those
# two columns carry the same information.
feature_columns = ['Item_Weight', 'Item_Fat_Content_Simplified', 'Item_Visibility', 'Item_Type', 'Item_MRP',
                   'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Years']

# select relevant features for modeling
X_train = train[feature_columns]
y_train = train['Item_Outlet_Sales']
X_test = test[feature_columns]

# one-hot encode categorical variables
X_train = pd.get_dummies(X_train)
# get_dummies only creates columns for categories present in the frame it is given,
# so the test matrix is aligned to the training columns: categories absent from the
# test split become all-zero columns, categories unseen in training are dropped, and
# the column order matches what the model was fitted on.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# train linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# make predictions on test dataset
predictions = lr.predict(X_test)


In [None]:
# Show the linear-regression predictions for the test split.
print(predictions)

[1790.625  1568.125  1880.1875 ... 1832.1875 3624.875  1294.8125]


# **Extended pipeline: feature scaling and XGBoost hyperparameter tuning**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# load dataset (re-read from disk so this section starts from the raw files)
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

# impute missing values
# The fill values come from the training split only and are reused for the test
# split, so both splits receive identical preprocessing.
fill_values = {
    'Item_Weight': train['Item_Weight'].mean(),
    'Outlet_Size': train['Outlet_Size'].mode()[0],
}
# DataFrame.fillna with a dict fills each listed column with its own value. The
# result is assigned back instead of using fillna(inplace=True) on a column
# selection, which does not update the frame under pandas Copy-on-Write.
train = train.fillna(value=fill_values)
test = test.fillna(value=fill_values)

# feature engineering
# Alternate spellings of the fat-content labels and the canonical label for each.
fat_label_fixes = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}

for split in (train, test):
    # Normalised copy of 'Item_Fat_Content'.
    split['Item_Fat_Content_Simplified'] = split['Item_Fat_Content'].replace(fat_label_fixes)
    # 2013 minus the year the outlet was established.
    split['Outlet_Years'] = 2013 - split['Outlet_Establishment_Year']

# select relevant features for modeling
# The column list is defined once so both splits are guaranteed to use the same predictors.
model_features = ['Item_Weight', 'Item_Fat_Content_Simplified', 'Item_Visibility', 'Item_Type', 'Item_MRP',
                  'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Years']

X_train = train[model_features]
y_train = train['Item_Outlet_Sales']
X_test = test[model_features]

# one-hot encode categorical variables
X_train = pd.get_dummies(X_train)
# Encoding each split independently can yield different columns (a category present
# in only one split) or a different column order. Aligning the test matrix to the
# training columns keeps the downstream scaler and model inputs consistent:
# missing dummies are added as zeros and extra ones are dropped.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# split train dataset into train and validation sets
# (80/20 split with a fixed seed so the partition is repeatable)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# standard scaling
# Fitted on the training portion only, then applied unchanged to the validation
# and test matrices. The scaler gets its own name so it stays available for
# transforming new data fed to models trained on the *_std matrices.
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_valid_std = std_scaler.transform(X_valid)
X_test_std = std_scaler.transform(X_test)

# robust scaling
# Same fit-on-train / transform-everything pattern with RobustScaler.
robust_scaler = RobustScaler()
X_train_rob = robust_scaler.fit_transform(X_train)
X_valid_rob = robust_scaler.transform(X_valid)
X_test_rob = robust_scaler.transform(X_test)

# Backward-compatible alias: `scaler` previously ended this section bound to the
# fitted RobustScaler, so keep that binding for any code that still reads it.
scaler = robust_scaler

# Hyperparameter Tuning for XGBRegressor
# A fixed seed makes the row/column subsampling explored below repeatable, so
# re-running the search selects the same configuration.
xgb = XGBRegressor(random_state=42)
# 3 values for each of 5 parameters -> 243 candidate configurations.
param_grid = {'n_estimators': [50, 100, 200],
              'learning_rate': [0.01, 0.05, 0.1],
              'max_depth': [3, 5, 7],
              'subsample': [0.5, 0.7, 1],
              'colsample_bytree': [0.5, 0.7, 1]}
# 5-fold cross-validation over the grid on the standard-scaled training matrix.
# No `scoring` is given, so candidates are ranked by the estimator's own score method.
grid_xgb_std = GridSearchCV(xgb, param_grid, cv=5)
grid_xgb_std.fit(X_train_std, y_train)

# final XGBRegressor model with best hyperparameters
# GridSearchCV refits the best configuration on all of X_train_std by default
# (refit=True), so that fitted estimator is reused instead of training it again.
xgb_std = grid_xgb_std.best_estimator_

# Check the tuned model on the held-out validation split, which took no part in
# the search, before using it on the test data.
valid_predictions = xgb_std.predict(X_valid_std)
valid_rmse = np.sqrt(np.mean((np.asarray(y_valid) - valid_predictions) ** 2))
print(f'Best parameters: {grid_xgb_std.best_params_}')
print(f'Validation RMSE: {valid_rmse:.2f}')



In [None]:
# Predict on the standard-scaled test matrix with the tuned XGBoost model and show the result.
predictions = xgb_std.predict(X_test_std)
print(predictions)

[1603.1549 1371.7262  619.9247 ... 1903.0029 3426.599  1285.6807]
