# Feature Selection and applying Algorithmns

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [3]:
data = pd.read_csv('clean_training_data.csv')

In [4]:
data

Unnamed: 0.1,Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,P00069042,0,1,10,1,2,0,3,9.0,14.0,8370.0
1,1,P00248942,0,1,10,1,2,0,1,6.0,14.0,15200.0
2,2,P00087842,0,1,10,1,2,0,12,9.0,14.0,1422.0
3,3,P00085442,0,1,10,1,2,0,12,14.0,14.0,1057.0
4,4,P00285442,1,7,16,3,4,0,8,9.0,14.0,7969.0
...,...,...,...,...,...,...,...,...,...,...,...,...
550063,550063,P00372445,1,6,13,2,1,1,20,9.0,14.0,368.0
550064,550064,P00375436,0,3,1,3,3,0,20,9.0,14.0,371.0
550065,550065,P00375436,0,3,15,2,4,1,20,9.0,14.0,137.0
550066,550066,P00375436,0,7,1,3,2,0,20,9.0,14.0,365.0


In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0,P00069042,0,1,10,1,2,0,3,9.0,14.0,8370.0
1,1,P00248942,0,1,10,1,2,0,1,6.0,14.0,15200.0
2,2,P00087842,0,1,10,1,2,0,12,9.0,14.0,1422.0
3,3,P00085442,0,1,10,1,2,0,12,14.0,14.0,1057.0
4,4,P00285442,1,7,16,3,4,0,8,9.0,14.0,7969.0


In [6]:
x = data.drop(['Unnamed: 0','Purchase','Product_ID'],axis = 1)

In [7]:
y = data['Purchase']

In [8]:
feature_select = SelectFromModel(Lasso(alpha= 0.005,random_state=0))
feature_select.fit(x,y)

SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))

In [9]:
feature_select.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True])

In [10]:
# let's print the number of total and selected features

# this is how we can make a list of the selected features
selected_feat = x.columns[(feature_select.get_support())]

# let's print some stats
print('total features: {}'.format((x.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
# print('features with coefficients shrank to zero: {}'.format(
#     np.sum(selected_feat.coef_ == 0)))

total features: 9
selected features: 9


In [11]:
for value in selected_feat:
    print(value)

Gender
Age
Occupation
City_Category
Stay_In_Current_City_Years
Marital_Status
Product_Category_1
Product_Category_2
Product_Category_3


In [12]:
# x = x[selected_feat]

In [13]:
x = x.values

In [14]:
y = y.values

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.3, random_state =0)

## Linear Regression, Decision Tree, Random Forest


In [17]:
#Linear Regression
from sklearn.linear_model import LinearRegression

#Decision Tree
from sklearn.tree import DecisionTreeRegressor

# RANDOM FOREST
from sklearn.ensemble import RandomForestRegressor

In [18]:
#Linear Regression
reg = LinearRegression()

# Decision Tree
dt = DecisionTreeRegressor(max_depth = 10)

# RANDOM FOREST
rf = RandomForestRegressor(n_estimators = 10, random_state = 0)

In [19]:
#Linear Regression
# reg.fit(xtrain,ytrain)


# Decision Tree
dt.fit(xtrain,ytrain)


# RANDOM FOREST
# rf.fit(xtrain,ytrain)

DecisionTreeRegressor(max_depth=10)

In [20]:
#Linear Regression
# ypred  = reg.predict(xtest)

# Decision Tree
ypred  = dt.predict(xtest)

# RANDOM FOREST
# ypred  = rf.predict(xtest)

In [21]:
ypred

array([11325.77394636,  1322.47232472, 16315.09550118, ...,
       10603.74722222, 10556.61502523,  6647.49879324])

In [22]:
from sklearn.metrics import r2_score

In [23]:
score = r2_score(ytest,ypred)

In [24]:
score*100

65.54511941030616

In [25]:
# Check the highest accuracy by changing max_depth value in Decision Tree
l1 =[]
for i in range(1,20):
    dt = DecisionTreeRegressor(max_depth = i)
    dt.fit(xtrain,ytrain)
    y_pred = dt.predict(xtest)
    score = r2_score(ytest,y_pred)
    score = score * 100
    l1.append([i,score])


In [26]:
# Here we can see 65.53 is the highest accuracy when max_depth = 19
print(l1)

[[1, 26.913444336948057], [2, 31.07708764086965], [3, 44.30370553413147], [4, 47.76387919045523], [5, 58.08684509284188], [6, 64.63575379954524], [7, 64.9889133767608], [8, 65.21061827614272], [9, 65.51254676954356], [10, 65.5368987397439], [11, 65.47007962026052], [12, 65.24718458665193], [13, 64.99288387212307], [14, 64.64603972508671], [15, 64.12224860998856], [16, 63.42945532375135], [17, 62.622910567952104], [18, 61.67875000150376], [19, 60.756068799915795]]


In [27]:
l2 =[]
for i in range(1,20):
    rf = RandomForestRegressor(n_estimators = i, random_state = 0)
    rf.fit(xtrain,ytrain)
    y_pred = rf.predict(xtest)
    score = r2_score(ytest,y_pred)
    score = score * 100
    l2.append([i,score])


In [28]:
print(l2)

[[1, 51.964982233816215], [2, 57.79790739034632], [3, 59.71142945305022], [4, 60.55288766730824], [5, 61.198396896082905], [6, 61.56511090083272], [7, 61.811852621942535], [8, 62.037475074281964], [9, 62.168509446690365], [10, 62.31461554141187], [11, 62.40306704063687], [12, 62.493788613682796], [13, 62.584320441524866], [14, 62.62230992654766], [15, 62.661835724294036], [16, 62.71176901933876], [17, 62.73974650777561], [18, 62.78671512156344], [19, 62.81065714527373]]


# Create a pickle file for Model deployement

In [66]:
# Import Library
import pickle


# Save trained Model
file_name = 'complete_model.sav'
pickle.dump(dt,open(file_name,'wb'))