In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor #Algo for Regression Tree Method

superstore = pd.read_csv("sample-superstore.csv",encoding = "ISO-8859-1") #Reading CSV

superstore_encoded = superstore.copy() # Making a copy of the dataframe

# Taking variables that are necessary
superstore_encoded = superstore_encoded[['Row ID','Sales','Ship Mode','Segment','City','State','Postal Code','Region','Category','Sub-Category','Product Name','Quantity','Discount','Profit']]

le = LabelEncoder() # Using a new label encoder instance

In [25]:
superstore_encoded['Ship Mode'] = le.fit_transform(superstore_encoded['Ship Mode'].values) #Transforms Ship Mode into numerical represantation
superstore_encoded['Segment'] = le.fit_transform(superstore_encoded['Segment'].values) 
superstore_encoded['City'] = le.fit_transform(superstore_encoded['City'].values) 
superstore_encoded['State'] = le.fit_transform(superstore_encoded['State'].values)
superstore_encoded['Postal Code'] = le.fit_transform(superstore_encoded['Postal Code'].values) 
superstore_encoded['Region'] = le.fit_transform(superstore_encoded['Region'].values) 
superstore_encoded['Category'] = le.fit_transform(superstore_encoded['Category'].values) 
superstore_encoded['Sub-Category'] = le.fit_transform(superstore_encoded['Sub-Category'].values) 
superstore_encoded['Product Name'] = le.fit_transform(superstore_encoded['Product Name'].values) 


# New Variable from existing variable
superstore_encoded['ProfitPercent'] = round(superstore_encoded['Profit']/superstore_encoded['Sales'],2) 


In [26]:
# Target is sales - Continuous in nature

In [27]:
superstore_encoded.columns

Index(['Row ID', 'Sales', 'Ship Mode', 'Segment', 'City', 'State',
       'Postal Code', 'Region', 'Category', 'Sub-Category', 'Product Name',
       'Quantity', 'Discount', 'Profit', 'ProfitPercent'],
      dtype='object')

In [28]:
feature_cols = ['Ship Mode', 'Segment', 'City', 'State',
       'Postal Code', 'Region', 'Category', 'Sub-Category', 'Product Name',
       'Quantity', 'Discount', 'Profit', 'ProfitPercent']

In [29]:
X = superstore_encoded[feature_cols] # All my features, input to know which is important and which is not
Y = superstore_encoded['Sales'] # My Target Variable

In [30]:
from sklearn.feature_selection import SelectKBest # Algo that selects no. of best features
from sklearn.feature_selection import mutual_info_regression # Function used for SelectKBest for continuous variables

In [31]:
feature_select_model = SelectKBest(score_func = mutual_info_regression, k=4) # Model Instance is created

In [32]:
fit = feature_select_model.fit(X, Y)

In [24]:
print(feature_cols)
print(fit.scores_) # Without ProfitPercent

['Ship Mode', 'Segment', 'City', 'State', 'Postal Code', 'Region', 'Category', 'Sub-Category', 'Product Name', 'Quantity', 'Discount', 'Profit']
[0.         0.         0.03525572 0.05578552 0.02788252 0.0265465
 0.28810122 0.67819705 0.56408743 0.45165453 0.29894168 1.97228258]


In [33]:
print(feature_cols)
print(fit.scores_) # With ProfitPercent

['Ship Mode', 'Segment', 'City', 'State', 'Postal Code', 'Region', 'Category', 'Sub-Category', 'Product Name', 'Quantity', 'Discount', 'Profit', 'ProfitPercent']
[0.00225475 0.         0.03959133 0.05829715 0.02670952 0.02491453
 0.28785739 0.679914   0.56489797 0.45288801 0.29417075 1.96857573
 0.53493825]
