## Price prediction with linear and ridge regression, Random Forest regressor, SVM regressor

This notebook presents a solution to the [price prediction](https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-guide-regression-analysis-plot-interpretations/practice-problems/machine-learning/predict-the-price-5-fe7f8735/) HackerEarth problem.

It contains two main sections:


1.   Exploratory data analysis and pre-processing
2.   Model fitting and evaluation



## Exploratory data analysis

In [None]:
import pandas as pd

# Read the training and test sets from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Report (rows, columns) for each frame as a quick sanity check.
for label, frame in (("Train data shape:", train), ("Test data shape:", test)):
    print(label, frame.shape)

Train data shape: (6368, 15)
Test data shape: (3430, 14)


### Remove non-positive selling prices

In [None]:
# Keep only the rows whose selling price is strictly greater than zero.
has_positive_price = train["Selling_Price"].gt(0)
train = train[has_positive_price]

### Converting Loyalty_customer and Product_Category to categorical dtype

In [None]:
# Cast the two label-valued columns to pandas' categorical dtype.
# `astype` with a mapping returns a new frame, so nothing is assigned into an
# existing (possibly sliced) frame; attribute-style column assignment would
# emit SettingWithCopyWarning if `train` is the result of a row filter.
categorical_dtypes = {"Loyalty_customer": "category", "Product_Category": "category"}

train = train.astype(categorical_dtypes)
test = test.astype(categorical_dtypes)

### Drop irrelevant columns

In [None]:
# These four fields are removed from the training frame before modelling.
columns = ["Product_id", "instock_date", "Stall_no", "Customer_name"]
train = train.drop(labels=columns, axis="columns")

### Look at correlation

In [None]:
# Pairwise Pearson correlation of the numeric columns only.  The frame may
# still hold non-numeric (categorical) columns here, and recent pandas
# releases raise instead of silently skipping them, so select numeric dtypes
# explicitly before calling `corr`.
train.select_dtypes(include="number").corr()

### Drop observations with NaNs

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum(axis=0)

Market_Category       0
Loyalty_customer      0
Product_Category      0
Grade                 0
Demand                0
Discount_avail       37
charges_1           197
charges_2 (%)       204
Minimum_price        38
Maximum_price       340
Selling_Price         0
dtype: int64

In [None]:
# Discard every row that has at least one missing value, then show the
# resulting (rows, columns) of the training frame.
train = train.dropna(axis=0, how="any")
train.shape

(5562, 11)

In [None]:
# One-hot encode the categorical features of the training frame; the first
# level of each feature is dropped and acts as the reference category.
train_categorical_columns = ["Grade", "Product_Category", "Loyalty_customer"]
train = pd.get_dummies(
    train,
    columns=train_categorical_columns,
    drop_first=True,
)

In [None]:
# One-hot encode the categorical features of the test frame; the first level
# of each feature is dropped and acts as the reference category.
test_categorical_columns = ["Grade", "Product_Category", "Loyalty_customer"]
test = pd.get_dummies(
    test,
    columns=test_categorical_columns,
    drop_first=True,
)

### Train validation splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation.  A fixed random_state makes the
# partition, and every metric computed from it, reproducible across runs.
train_split, valid = train_test_split(train, train_size=0.8, random_state=42)

# Separate the target column from the features in both partitions.
train_y = train_split["Selling_Price"]
train_x = train_split.drop(columns="Selling_Price")
valid_y = valid["Selling_Price"]
valid_x = valid.drop(columns="Selling_Price")

### Fit the model

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the training partition.
lm = LinearRegression()
lm.fit(train_x, train_y)

print("Intercept:", lm.intercept_)

# Pair every feature name (all columns except the target) with its fitted weight.
feature_names = train.columns[train.columns != 'Selling_Price']
lr_coefficients = [(name, weight) for name, weight in zip(feature_names, lm.coef_)]
print("\nNumber of coefficients:", len(lr_coefficients))
print("\nFeature coefficients:", *lr_coefficients, sep='\n')

Intercept: 728.1768816272529

Number of coefficients: 20

Feature coefficients:
('Market_Category', 0.015733333056450284)
('Demand', -0.5663892560760319)
('Discount_avail', -4441.840281091665)
('charges_1', 0.8920797565857775)
('charges_2 (%)', 0.0031153193009197514)
('Minimum_price', 0.4560964974234054)
('Maximum_price', 0.4196419133377333)
('Grade_1', 9.44469326010668)
('Grade_2', 63.5945968177931)
('Grade_3', 71.06532802138415)
('Product_Category_Cosmetics', -61.550566710073525)
('Product_Category_Educational', -38.002060549909004)
('Product_Category_Fashion', 24.401428654634238)
('Product_Category_Home_decor', -34.91271951611848)
('Product_Category_Hospitality', -99.31529300375846)
('Product_Category_Organic', -49.479332560811464)
('Product_Category_Pet_care', -29.511813752410774)
('Product_Category_Repair', -5.013963059246998)
('Product_Category_Technology', -1.0414526184454926)
('Loyalty_customer_Yes', -35.51567732125817)


In [None]:
# A linear model can predict negative prices.  Clamp those to zero instead of
# taking the absolute value: abs() would turn a prediction of -50 into +50,
# whereas the submission step of this notebook sets negative predictions to
# 0.0.  Zero is also a valid input for the log-based error metric.
lr_predicted_train = lm.predict(train_x).clip(min=0)
lr_predicted_valid = lm.predict(valid_x).clip(min=0)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# RMSE is the square root of the mean *squared* error.  The square root of
# the mean absolute error is not RMSE and is not expressed in price units.
rmse1 = np.sqrt(mean_squared_error(train_y, lr_predicted_train))
rmse2 = np.sqrt(mean_squared_error(valid_y, lr_predicted_valid))
print("train_rmse =", rmse1)
print("valid_rmse =", rmse2)

train_rmse = 20.193060897626864
valid_rmse = 20.1343148944008


In [None]:
from sklearn.metrics import mean_squared_log_error


# Score = max(0, 100 - RMSLE), reported for the training partition first and
# the validation partition second.
scores = []
for label, actual, predicted in (
    ("train_score =", train_y, lr_predicted_train),
    ("valid_score =", valid_y, lr_predicted_valid),
):
    RMSLE = np.sqrt(mean_squared_log_error(actual, predicted))
    scores.append(max(0, 100 - RMSLE))
    print(label, scores[-1])
score1, score2 = scores

train_score = 99.50886755823052
valid_score = 99.54045613215602


## Ridge regression

In [None]:
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import GridSearchCV


# Candidate regularisation strengths: 100 evenly spaced values in [0.1, 10].
lambdas = np.linspace(0.1, 10, 100)
params = {'alpha': lambdas}

# 10-fold cross-validated search for the alpha with the lowest mean absolute
# error; the winning estimator is refit on the full training partition.
model = Ridge(fit_intercept=True)
grid_search = GridSearchCV(model, param_grid=params, cv=10, scoring='neg_mean_absolute_error')
grid_search.fit(train_x, train_y)
model = grid_search.best_estimator_

# Clamp negative price predictions to zero rather than taking abs(): abs()
# would mirror a negative prediction into a positive one, while the
# submission step of this notebook sets negative predictions to 0.0.
rg_predicted_train = model.predict(train_x).clip(min=0)
rg_predicted_valid = model.predict(valid_x).clip(min=0)

In [None]:
# Pair every feature name (all columns except the target) with the weight
# held by the current `model`.
feature_names = train.columns[train.columns != 'Selling_Price']
ridge_coefficients = [(name, weight) for name, weight in zip(feature_names, model.coef_)]
print("\nNumber of coefficients:", len(ridge_coefficients))
print("\nFeature coefficients:", *ridge_coefficients, sep='\n')


Number of coefficients: 20

Feature coefficients:
('Market_Category', 0.014963775916232469)
('Demand', -0.5600243114703553)
('Discount_avail', -4362.7650019811945)
('charges_1', 0.9318538699448187)
('charges_2 (%)', -0.1622791690611319)
('Minimum_price', 0.45577615080532174)
('Maximum_price', 0.41975624539269113)
('Grade_1', 11.180879015513383)
('Grade_2', 61.47799878233105)
('Grade_3', 72.88784027235518)
('Product_Category_Cosmetics', -54.205106180527665)
('Product_Category_Educational', -25.071985879670486)
('Product_Category_Fashion', 28.470946886942514)
('Product_Category_Home_decor', -26.85761342821496)
('Product_Category_Hospitality', -91.47659145240402)
('Product_Category_Organic', -37.68631857503363)
('Product_Category_Pet_care', -16.850157258135)
('Product_Category_Repair', 2.0699983031733815)
('Product_Category_Technology', -3.6015182380219195)
('Loyalty_customer_Yes', -33.61573829361341)


In [None]:
from sklearn.metrics import mean_squared_error

# RMSE is the square root of the mean *squared* error.  The square root of
# the mean absolute error is not RMSE and is not expressed in price units.
rmse1 = np.sqrt(mean_squared_error(train_y, rg_predicted_train))
rmse2 = np.sqrt(mean_squared_error(valid_y, rg_predicted_valid))
print("train_rmse =", rmse1)
print("valid_rmse =", rmse2)

train_rmse = 20.183748079880285
valid_rmse = 20.114374714799364


In [None]:
# Score = max(0, 100 - RMSLE), reported for the training partition first and
# the validation partition second.
scores = []
for label, actual, predicted in (
    ("train_score =", train_y, rg_predicted_train),
    ("valid_score =", valid_y, rg_predicted_valid),
):
    RMSLE = np.sqrt(mean_squared_log_error(actual, predicted))
    scores.append(max(0, 100 - RMSLE))
    print(label, scores[-1])
score1, score2 = scores

train_score = 99.50936464919786
valid_score = 99.50281927859962


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Random forest of 45 trees; a node needs at least 5 samples to be split.
# random_state pins the bootstrap sampling and feature selection so the fitted
# forest and its metrics are reproducible.  The unused grid-search parameter
# dict and the commented-out search were dead code and have been removed.
model = RandomForestRegressor(
    n_jobs=10,
    n_estimators=45,
    min_samples_split=5,
    random_state=42,
)
model.fit(train_x, train_y)

rf_predicted_train = model.predict(train_x)
rf_predicted_valid = model.predict(valid_x)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# RMSE is the square root of the mean *squared* error.  The square root of
# the mean absolute error is not RMSE and is not expressed in price units.
rmse1 = np.sqrt(mean_squared_error(train_y, rf_predicted_train))
rmse2 = np.sqrt(mean_squared_error(valid_y, rf_predicted_valid))
print("train_rmse =", rmse1)
print("valid_rmse =", rmse2)

train_rmse = 5.548708209440894
valid_rmse = 7.987653029356898


In [None]:
# Score = max(0, 100 - RMSLE), reported for the training partition first and
# the validation partition second.
scores = []
for label, actual, predicted in (
    ("train_score =", train_y, rf_predicted_train),
    ("valid_score =", valid_y, rf_predicted_valid),
):
    RMSLE = np.sqrt(mean_squared_log_error(actual, predicted))
    scores.append(max(0, 100 - RMSLE))
    print(label, scores[-1])
score1, score2 = scores

train_score = 99.9332733334639
valid_score = 99.96906245228196


## Predictions on test set

In [None]:
# Remove the same non-feature fields that were removed from the training data.
columns = ["Stall_no", "Product_id", "instock_date", "Customer_name"]
test_x = test.drop(columns=columns)

# Train and test were one-hot encoded separately, so their dummy columns are
# not guaranteed to match.  Align the test features to the exact column set
# and order the model was fitted on: columns absent from the test set are
# added as 0 and any extra test-only columns are dropped.  Existing NaNs are
# left untouched by this step.
test_x = test_x.reindex(columns=train_x.columns, fill_value=0)


In [None]:
# Fill missing values in the test features with the mean of the same column
# in the training frame.
incomplete_features = [name for name in test_x.columns if test_x[name].isnull().any()]
for name in incomplete_features:
    missing_rows = test_x[name].isnull()
    test_x.loc[missing_rows, name] = train[name].mean()

In [None]:
# Predict with the current `model` and replace any negative prediction by 0.0.
test_predicted = model.predict(test_x)
test_predicted[test_predicted < 0] = 0.0

# Submission frame: product id next to its predicted selling price.
test_predicted_id = test["Product_id"].to_frame().assign(Selling_Price=test_predicted)

test_predicted_id.to_csv("submission.csv", index=False)