In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
pd.options.display.max_columns = 200

In [None]:
dataset = pd.read_csv("C:\\Users\\Sanjay Gupta\\Desktop\\ext\\train.csv")
test = pd.read_csv("C:\\Users\\Sanjay Gupta\\Desktop\\ext\\test.csv")
submission = pd.read_csv("C:\\Users\\HP\\Documents\\ML\\Project\\Sub.csv")

dataset.head()

In [None]:
dataset.info()

In [None]:
dataset.describe()

In [None]:
idsUnique = len(set(dataset.User_ID))
idsTotal = dataset.shape[0]
idsDupli = idsTotal - idsUnique
print("There are " + str(idsDupli) + " duplicate IDs for " + str(idsTotal) + " total entries")

In [None]:
plt.figure(figsize=(13,8))
sns.distplot(dataset.Purchase, bins = 25)
plt.xlabel("Amount spent in Purchase")
plt.ylabel("Number of Buyers")
plt.title("Purchase amount Distribution")

In [None]:
print ("Skew is:", dataset.Purchase.skew())
print("Kurtosis: %f" % dataset.Purchase.kurt())

In [None]:
#To check which rows are numbers
numeric_features = dataset.select_dtypes(include=[np.number])
numeric_features.dtypes

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=dataset.Occupation)

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=dataset.Marital_Status)

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=dataset.Product_Category_1)

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=dataset.Product_Category_2)

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=dataset.Product_Category_3)

In [None]:
#To find the dependencies of Purchase of various attributes
corr = numeric_features.corr()

print (corr['Purchase'].sort_values(ascending=False)[:10],"\n")

# Analysis of columns with high correlation

In [None]:
#correlation matrix
f, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(corr, vmax=.8,annot_kws={'size': 14}, annot=True);

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=dataset.Gender)

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=dataset.Stay_In_Current_City_Years)

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=dataset.City_Category)

In [None]:
Occupation_pivot = \
dataset.pivot_table(index='Occupation', values="Purchase", aggfunc=np.mean)

Occupation_pivot.plot(kind='bar', color='darkorange',figsize=(13,8))
plt.xlabel("Occupation")
plt.ylabel("Purchase")
plt.title("Occupation vs Purchase")
plt.show()

In [None]:
Product_Category_1_pivot=\
dataset.pivot_table(index='Product_Category_1', values="Purchase", aggfunc=np.mean)

Product_Category_1_pivot.plot(kind='bar', color='darkorange',figsize=(11,8))
plt.xlabel("Product_1")
plt.ylabel("Purchase")
plt.title("Product_1 vs Purchase")
plt.show()

In [None]:
# Mean purchase amount per Product_Category_2 value
# (pivot_table aggregates with the mean when no aggfunc is given).
Product_Category_2_pivot = dataset.pivot_table(index='Product_Category_2', values="Purchase")

Product_Category_2_pivot.plot(kind='bar', color='darkgreen',figsize=(11,8))
plt.xlabel("Product_2")
plt.ylabel("Purchase")
plt.title("Product_2 vs Purchase")
plt.show()

In [None]:
Age1=\
dataset.pivot_table(index='Age', values="Purchase", aggfunc=np.mean)
Age1.plot(kind='bar', color='darkgreen',figsize=(10,8))
plt.xlabel("Age")
plt.ylabel("Purchase")
plt.title("Age vs Purchase")
plt.show()

In [None]:
Occupation1 = \
dataset.pivot_table(index='Marital_Status', values="Purchase", aggfunc=np.mean)

Occupation1.plot(kind='bar', color='darkgreen',figsize=(10,8))
plt.xlabel("Marital_Status")
plt.ylabel("Purchase")
plt.title("Marital_Status vs Purchase")
plt.show()

In [None]:
City1 = \
dataset.pivot_table(index='City_Category', values="Purchase", aggfunc=np.mean)

City1.plot(kind='bar', color='darkgreen',figsize=(10,8))
plt.xlabel("City_Category")
plt.ylabel("Purchase")
plt.title("City_Category vs Purchase")
plt.show()

### Data Cleaning

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
dataset['User_ID'] = dataset['User_ID'] - 1000000
test['User_ID'] = test['User_ID'] - 1000000

enc = LabelEncoder()
dataset['User_ID'] = enc.fit_transform(dataset['User_ID'])
test['User_ID'] = enc.transform(test['User_ID'])

In [None]:
dataset['Product_ID'] = dataset['Product_ID'].str.replace('P00', '')
test['Product_ID'] = test['Product_ID'].str.replace('P00', '')

scaler = StandardScaler()
dataset['Product_ID'] = scaler.fit_transform(dataset['Product_ID'].values.reshape(-1, 1))
test['Product_ID'] = scaler.transform(test['Product_ID'].values.reshape(-1, 1))

In [None]:
#Concatenating sets of train and test
dataset['source']='train'
test['source']='test'

data = pd.concat([dataset,test], ignore_index = True, sort = False)

print(dataset.shape, test.shape, data.shape)

In [None]:
data.isnull().sum()/data.shape[0]*100

In [None]:
data["Product_Category_2"]=\
data["Product_Category_2"].fillna(-1.0).astype("float")
data.Product_Category_2.value_counts().sort_index()

In [None]:
data["Product_Category_3"]=\
data["Product_Category_3"].fillna(-1.0).astype("float")

In [None]:
data.Product_Category_3.value_counts().sort_index()

In [None]:
# Remove training rows whose Product_Category_1 is 19 or 20.
# The `source` column is tagged 'train' / 'test' when the two frames are
# concatenated, so the mask has to compare against 'train'; comparing against
# "dataset" matches no row and silently drops nothing.
extra = data.index[(data.Product_Category_1.isin([19,20])) & (data.source == "train")]
data = data.drop(extra)

In [None]:
#Apply function len(unique()) to every data variable
data.apply(lambda x: len(x.unique()))

In [None]:
category_cols = data.select_dtypes(include=['object']).columns.drop(["source"])
#Print frequency of categories
for col in category_cols:
 #Number of times each value appears in the column
 frequency = data[col].value_counts()
 print("\nThis is the frequency distribution for " + col + ":")
 print(frequency)

In [None]:
data['Gender'],ages = pd.factorize(data['Gender'])
print(ages)
print(data['Gender'].unique())
data["Gender"].value_counts()

In [None]:
data['Age'],ages = pd.factorize(data['Age'])
print(ages)
print(data['Age'].unique())
data["Age"].value_counts()

In [None]:
data['Stay_In_Current_City_Years'],scc = pd.factorize(data['Stay_In_Current_City_Years'])
print(scc)
print(data['Stay_In_Current_City_Years'].unique())
data['Stay_In_Current_City_Years'].value_counts()

In [None]:
data['City_Category'],cc = pd.factorize(data['City_Category'])
print(cc)
print(data['City_Category'].unique())
data['City_Category'].value_counts()

In [None]:
cat_col = ['Gender', 'City_Category']
num_col = ['Age', 'Occupation', 'Stay_In_Current_City_Years', 'Product_Category_1', 
           'Product_Category_2', 'Product_Category_3']

In [None]:
data

In [None]:
encoder = LabelEncoder()
data1=data.copy()

In [None]:
for col in cat_col:
    data1[col] = encoder.fit_transform(data1[col])

In [None]:
scaler = StandardScaler()

# Standardise every numeric column independently.
# `fit` alone returns the scaler object (which would then be stored in the
# column); `fit_transform` returns the scaled values as an (n, 1) array, which
# is flattened back to one dimension before assignment.
for col in num_col:
    data1[col] = scaler.fit_transform(data1[col].values.reshape(-1, 1)).ravel()

In [None]:
data1

### Count Functions

In [None]:
def getCountVar(compute_df, count_df, var_name):
    """Count, for each row of ``compute_df``, how often its ``var_name`` value occurs in ``count_df``.

    Parameters
    ----------
    compute_df : DataFrame whose rows receive a count.
    count_df : DataFrame in which occurrences are counted.
    var_name : name of the column present in both frames.

    Returns
    -------
    list of int
        One entry per row of ``compute_df``, in row order. Values that never
        occur in ``count_df`` (including missing values) get a count of 0.
    """
    # One vectorised frequency table instead of a Python loop over every row.
    counts = count_df[var_name].value_counts()
    return compute_df[var_name].map(counts).fillna(0).astype(int).tolist()

In [None]:
# data["Age_Count"] =getCountVar(data, data, "Age")

In [None]:
# data["Occupation_Count"] =getCountVar(data, data, "Occupation")

In [None]:
# data["Product_Category_1_Count"] =getCountVar(data, data,"Product_Category_1")
# data["Product_Category_2_Count"] =getCountVar(data, data,"Product_Category_2")
# data["Product_Category_3_Count"] =getCountVar(data, data,"Product_Category_3")

In [None]:
# data["Product_ID_Count"] =getCountVar(data, data, "Product_ID")

In [None]:
#Divide into test and train:
train = data.loc[data['source']=="train"]
test = data.loc[data['source']=="test"]

In [None]:
#Drop unnecessary columns.
# Reassign instead of dropping in place: `train` and `test` are selections
# taken from `data`, and in-place edits on such selections raise pandas'
# SettingWithCopyWarning.
test = test.drop(['source'], axis=1)
train = train.drop(['source'], axis=1)

#Export files as modified versions:
train.to_csv("C:\\Users\\HP\\Documents\\ML\\Project\\train_modified.csv",index=False)
test.to_csv("C:\\Users\\HP\\Documents\\ML\\Project\\test_modified.csv",index=False)

In [None]:
# Average purchase amount per product, plus the global average that is used
# as a fallback for product ids without an average of their own.
product_id_res = data.groupby(["Product_ID"])["Purchase"].mean()
avg_cost = data["Purchase"].mean()
# Map str(product id) -> average purchase amount.
# `Series.items()` replaces `Series.iteritems()`, which no longer exists in pandas 2.x.
product_id_res_map = {str(key): value for key, value in product_id_res.items()}

In [None]:
def get_purchase_mean(product_id, product_category=None, key=None):
    """Look up an average purchase amount for a product.

    Parameters
    ----------
    product_id : product identifier; its ``str()`` form is the lookup key.
    product_category : optional category value, appended to the product key
        for the category-specific lookups.
    key : ``"1"``, ``"2"`` or ``"3"`` to try the matching
        ``product_category_<n>_res`` mapping first; any other value skips
        that step.

    Returns
    -------
    The category-specific average when ``key`` selects a mapping that contains
    the combined key, otherwise the per-product average from
    ``product_id_res_map``, otherwise the global ``avg_cost``.

    NOTE(review): ``product_category_1_res`` / ``_2_res`` / ``_3_res`` are not
    defined in the visible part of this notebook; they are only touched when
    ``key`` is "1", "2" or "3" - confirm where they are built.
    """
    key_pair = str(product_id)
    key_pair_pid = str(product_id) + str(product_category)
    if key == "1":
        if key_pair_pid in product_category_1_res:
            return product_category_1_res[key_pair_pid]
    elif key == "2":
        if key_pair_pid in product_category_2_res:
            return product_category_2_res[key_pair_pid]
    elif key == "3":
        if key_pair_pid in product_category_3_res:
            return product_category_3_res[key_pair_pid]
    # Fall back to the per-product average. The lookup table is
    # `product_id_res_map`, not the `product_id` argument itself.
    if key_pair in product_id_res_map:
        return product_id_res_map[key_pair]
    return avg_cost

In [None]:
get_purchase_mean(data.Product_ID)

### Finding models to predict purchase

In [None]:
train_df = pd.read_csv("C:\\Users\\HP\\Documents\\ML\\Project\\train_modified.csv")
test_df = pd.read_csv("C:\\Users\\HP\\Documents\\ML\\Project\\test_modified.csv")

In [None]:
# #Define target and ID columns:
# target = 'Item_Outlet_Sales'
# IDcol = ['Item_Identifier','Outlet_Identifier']

#Define target and ID columns:
target = 'Purchase'
IDcol = ['User_ID','Product_ID']

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics

In [None]:
def commonfit(alg, dtrain, dtest, predictors, target, IDcol, filename):
    """Fit ``alg``, print train and cross-validated RMSE, and save test predictions.

    Parameters
    ----------
    alg : estimator with ``fit`` / ``predict``; also passed to ``cross_val_score``.
    dtrain : frame containing the ``predictors`` columns and the ``target`` column.
    dtest : frame containing the ``predictors`` columns. A ``target`` column
        holding the predictions is written to it in place.
    predictors : column labels used as model inputs.
    target : name of the column to predict.
    IDcol : list of identifier columns copied to the output file. The list is
        left unmodified.
    filename : path of the CSV file that receives the ID columns and predictions.
    """
    #Fitting the algorithm
    alg.fit(dtrain[predictors], dtrain[target])

    dtrain_predictions = alg.predict(dtrain[predictors])

    # 20-fold CV; scores come back as negative MSE, so take abs() before the root.
    cv_score = cross_val_score(alg, dtrain[predictors],(dtrain[target]) , cv=20, scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(cv_score))

    print("\nModel Report")
    #The value to check is RMSE(parameter)
    print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((dtrain[target]).values, dtrain_predictions)))
    print("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

    dtest[target] = alg.predict(dtest[predictors])

    # Build the output column list locally: appending to IDcol would grow the
    # caller's list by one `target` entry on every call.
    output_cols = list(IDcol) + [target]
    submission = pd.DataFrame({ x: dtest[x] for x in output_cols})
    submission.to_csv(filename, index=False)

In [None]:
from sklearn.linear_model import LinearRegression
# The `normalize` argument no longer exists in current scikit-learn releases.
# For ordinary least squares it affects neither the predictions nor the
# reported coefficients, so the plain estimator is used.
LR = LinearRegression()

predictors = train_df.columns.drop(['Purchase','Product_ID','User_ID'])
commonfit(LR, train_df, test_df, predictors, target, IDcol, 'LR1.csv')

coef1 = pd.Series(LR.coef_, predictors).sort_values()
coef1.plot(kind='bar', title='Model Coefficients')

In [None]:
from sklearn.linear_model import Ridge
RR1 = Ridge(alpha=0.05,normalize=True)
commonfit(RR1, train_df, test_df, predictors, target, IDcol, 'RR.csv')

In [None]:
coef1 = pd.Series(RR1.coef_, predictors).sort_values()
coef1.plot(kind='bar', title='Model Coefficients')

In [None]:
from sklearn.linear_model import Ridge
RR1 = Ridge(alpha=3,normalize=True)
commonfit(RR1, train_df, test_df, predictors, target, IDcol, 'RR1.csv')

In [None]:
coef1 = pd.Series(RR1.coef_, predictors).sort_values()
coef1.plot(kind='bar', title='Model Coefficients')

In [None]:
from sklearn.linear_model import Ridge
RR3 = Ridge(alpha=5,normalize=True)
commonfit(RR3, train_df, test_df, predictors, target, IDcol, 'RR3.csv')

In [None]:
coef1 = pd.Series(RR3.coef_, predictors).sort_values()
coef1.plot(kind='bar', title='Model Coefficients')

In [None]:
from sklearn.tree import DecisionTreeRegressor
DT = DecisionTreeRegressor(max_depth=15, min_samples_leaf=200)
commonfit(DT, train_df, test_df, predictors, target, IDcol, 'DT.csv')
importances = DT.feature_importances_

In [None]:
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(train_df[predictors].shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

In [None]:
X=train_df[predictors]
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],color="y", align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
RF = DecisionTreeRegressor(max_depth=9, min_samples_leaf=100)
commonfit(RF, train_df, test_df, predictors, target, IDcol,'RF.csv')

In [None]:
importances = RF.feature_importances_

In [None]:
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(train_df[predictors].shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

In [None]:
X=train_df[predictors]
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],color="y", align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

### XGBoost

In [None]:
import sys
!{sys.executable} -m pip install xgboost

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Early stopping needs true targets to score against. `test_df[target]` only
# holds predictions written by the most recent commonfit() call, so a slice of
# the labelled training data is held out for validation instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    train_df[predictors], train_df[target], test_size=0.2, random_state=0)

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(X_fit, y_fit, early_stopping_rounds=5, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
train_df_predictions = my_model.predict(train_df[predictors])

predictions = my_model.predict(test_df[predictors])

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_df[target])))
print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((train_df[target]).values, train_df_predictions)))

IDcol.append(target)
submission = pd.DataFrame({ x: test_df[x] for x in IDcol})
submission.to_csv("XGBoost.csv", index=False)

In [None]:
importances = my_model.feature_importances_

In [None]:
indices = np.argsort(importances)[::-1]
print("Feature order:")
for f in range(train_df[predictors].shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

In [None]:
# xgb_reg = XGBRegressor(learning_rate=1.0, max_depth=6, min_child_weight=40, seed=0)

# xgb_reg.fit(train, y_train)
# y_pred = xgb_reg.predict(X_val)
# rmse = np.sqrt(mean_squared_error(y_pred, y_val))

# print xgb_reg, rmse

# xgb_reg.fit(X, y)
# predict = xgb_reg.predict(X_test)

# submission['Purchase'] = predict
# submission.to_csv('Sample_Submission_Tm9Lura.csv', index=False)

### More evenly distributed model

In [None]:
X=train_df[predictors]
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],color="y", align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

### Hyperparameter Tuning

### Exhaustive- Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

RF11=RandomForestRegressor(n_estimators=10, max_depth=3)
RF21=RandomForestRegressor(n_estimators=10, max_depth=10)
RF31=RandomForestRegressor(n_estimators=10, max_depth=20)
RF41=RandomForestRegressor(n_estimators=10, max_depth=40)

In [None]:
RF5=RandomForestRegressor(n_estimators=50, max_depth=3)
# RF6=RandomForestClassifier(n_estimators=50, max_depth=10)
# RF7=RandomForestClassifier(n_estimators=50, max_depth=20)
# RF8=RandomForestClassifier(n_estimators=50, max_depth=40)

In [None]:
RF9=RandomForestRegressor(n_estimators=100, max_depth=3)
# RF10=RandomForestClassifier(n_estimators=100, max_depth=10)
# RF11=RandomForestClassifier(n_estimators=100, max_depth=20)
# RF12=RandomForestClassifier(n_estimators=100, max_depth=40)

In [None]:
RF13=RandomForestRegressor(n_estimators=200, max_depth=3)
# RF14=RandomForestClassifier(n_estimators=200, max_depth=10)
# RF15=RandomForestClassifier(n_estimators=200, max_depth=20)
# RF16=RandomForestClassifier(n_estimators=200, max_depth=40)

In [None]:
RF11

In [None]:
commonfit(RF11, train_df, test_df, predictors, target, IDcol, 'RF11.csv')

In [None]:
RF21.fit(train_df[predictors], train_df[target])
train_df_predictions = RF21.predict(train_df[predictors])
predictions = RF21.predict(test_df[predictors])

In [None]:
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_df[target])))
print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((train_df[target]).values, train_df_predictions)))

In [None]:
RF31.fit(train_df[predictors], train_df[target])
train_df_predictions = RF31.predict(train_df[predictors])
predictions = RF31.predict(test_df[predictors])
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_df[target])))
print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((train_df[target]).values, train_df_predictions)))

In [None]:
importances = RF31.feature_importances_

In [None]:
indices = np.argsort(importances)[::-1]
print("Feature order:")
for f in range(train_df[predictors].shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

In [None]:
X=train_df[predictors]
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],color="y", align="center")
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
RF41.fit(train_df[predictors], train_df[target])
train_df_predictions1 = RF41.predict(train_df[predictors])
predictions = RF41.predict(test_df[predictors])
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_df[target])))
print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((train_df[target]).values, train_df_predictions1)))

In [None]:
importances = RF41.feature_importances_

In [None]:
indices = np.argsort(importances)[::-1]
print("Feature order:")
for f in range(train_df[predictors].shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

In [None]:
X=train_df[predictors]
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],color="y", align="center")
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
RF5.fit(train_df[predictors], train_df[target])
train_df_predictions = RF5.predict(train_df[predictors])
predictions = RF5.predict(test_df[predictors])
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_df[target])))
print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((train_df[target]).values, train_df_predictions)))

In [None]:
RF9.fit(train_df[predictors], train_df[target])
train_df_predictions = RF9.predict(train_df[predictors])
predictions = RF9.predict(test_df[predictors])
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_df[target])))
print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((train_df[target]).values, train_df_predictions)))

### Extra Trees (Extremely Randomized Trees)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# The cells that follow call ET.predict(), so the estimator has to be built
# and fitted here; with this cell commented out they fail with a NameError.
ET = ExtraTreesRegressor(n_estimators=1450, max_depth=8,min_samples_split=10, min_samples_leaf=10, oob_score=True, n_jobs=6, random_state=123, verbose=1, bootstrap=True)
ET.fit(train_df[predictors], train_df[target])

In [None]:
train_df_predictions = ET.predict(train_df[predictors])
predictions = ET.predict(test_df[predictors])

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_df[target])))
print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((train_df[target]).values, train_df_predictions)))

In [None]:
train_df_predictions = ET.predict(train_df[predictors])

In [None]:
from scipy.stats import expon as exp
from scipy.stats import randint as ri

In [None]:
n_estimators = exp(scale=100)
max_depth = ri(1, 40)
from sklearn.ensemble import RandomForestRegressor
RF1=RandomForestRegressor(n_estimators=20, max_depth=4)

In [None]:
RF1

In [None]:
RF1.fit(train_df[predictors], train_df[target])

In [None]:
train_df_predictions = RF1.predict(train_df[predictors])
predictions = RF1.predict(test_df[predictors])

In [None]:
from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_df[target])))
print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error((train_df[target]).values, train_df_predictions)))

### Rule Based Learning

In [None]:
import sys
!{sys.executable} -m pip install skope-rules

In [None]:
# from sklearn.metrics import precision_recall_curve
# from matplotlib import pyplot as plt
# from skrules import SkopeRules

# clf = SkopeRules(max_depth_duplication=None,
#                  n_estimators=30,
#                  precision_min=0.2,
#                  recall_min=0.01,
#                  feature_names=.feature_names)

# X, y = train_df[predictors], train_df[target] > 25
# X_train, y_train = X[:len(y)//2], y[:len(y)//2]
# X_test, y_test = X[len(y)//2:], y[len(y)//2:]
# clf.fit(X_train, y_train)
# y_score = clf.score_top_rules(X_test) # Get a risk score for each test example
# precision, recall, _ = precision_recall_curve(y_test, y_score)
# plt.plot(recall, precision)
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.title('Precision Recall curve')
# plt.show()

### Association Rule Mining

In [None]:
import sys
!{sys.executable} -m pip install mlxtend

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
basket = train_df[(train_df['Product_Category_1'] == 4)]

In [None]:
basket

In [None]:
def encode_units(x):
    """Binarise a value: 1 when ``x >= 1``, 0 when ``x <= 0``, otherwise ``None``.

    Values strictly between 0 and 1, and NaN (which fails both comparisons),
    fall through and yield ``None``.
    """
    if x >= 1:
        return 1
    return 0 if x <= 0 else None

newtrain = train_df.applymap(encode_units)

In [None]:
newtrain

In [None]:
newtrain=newtrain.dropna()

In [None]:
predictors1 = train_df.columns.drop(['Product_ID','User_ID','Marital_Status','Stay_In_Current_City_Years'])

In [None]:
frequent_itemsets = apriori(newtrain[predictors1], min_support=0.07, use_colnames=True)

In [None]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

In [None]:
rules

In [None]:
rules[ (rules['lift'] > 1.0) &
       (rules['confidence'] > 0.73)]

In [None]:
newtrain1 = basket.applymap(encode_units)

In [None]:
newtrain1

In [None]:
frequent_itemsets = apriori(newtrain1[predictors1], min_support=0.07, use_colnames=True)

In [None]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

In [None]:
rules[ (rules['lift'] > 1.02) &
       (rules['confidence'] > 0.966)]

In [None]:
from sklearn import tree
import collections

In [None]:
import sys
!{sys.executable} -m pip install graphviz

In [None]:
import pydotplus

In [None]:
X=train_df[predictors].loc[:2000,]
y=train_df[target].loc[:2000,]
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X,y)

In [None]:
clf

In [None]:
# from sklearn.tree import export_graphviz
# import graphviz
# graphviz.Source(export_graphviz(clf, out_file = None, feature_names = X.columns.tolist()))

In [None]:
import sys
!{sys.executable} -m  pip install git+git://github.com/christophM/rulefit.git

In [None]:
import sys
!{sys.executable} -m  pip install graphviz

In [None]:
# import rulefit
# from rulefit import RuleFit


# y = train_df[target]
# X = train_df[predictors]
# features = X.columns
# X = X.as_matrix()

# rf = RuleFit()
# rf.fit(X, y, feature_names=features)

In [None]:
pd.DataFrame(clf.decision_path(X).toarray()).head(5)

In [None]:
# Features side by side with the decision-path indicator columns.
# `axis` must be given by keyword: pandas 2.x accepts only `objs` positionally.
pd.concat([X.reset_index(drop=True),pd.DataFrame(clf.decision_path(X).toarray())], axis=1).head(5)

In [None]:
dTree3 = DecisionTreeRegressor(max_depth = 4)
commonfit(dTree3, train_df, test_df, predictors, target, IDcol, 'DT.csv')

# Decision-path indicators (root column dropped) appended to the raw features.
# - `axis` is passed by keyword because pandas 2.x accepts only `objs` positionally.
# - The indicator columns get a string prefix so that every column label is a
#   string; recent scikit-learn releases reject frames mixing str and int labels.
node_flags = pd.DataFrame(dTree3.decision_path(X).toarray()).iloc[:,1:].add_prefix('node_')
Xrules = pd.concat([X.reset_index(drop=True), node_flags], axis=1)


from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
linear_model.fit(Xrules, y)

In [None]:
def find_node(tree_, current_node, search_node, features):
    """Describe the split conditions leading from ``current_node`` to ``search_node``.

    Parameters
    ----------
    tree_ : object exposing ``children_left``, ``children_right``, ``feature``
        and ``threshold`` sequences indexed by node id, with ``-1`` marking a
        missing child.
    current_node : node id at which the search starts.
    search_node : node id to look for below ``current_node``.
    features : sequence of feature names, indexed by ``tree_.feature[node]``.

    Returns
    -------
    str
        The conditions along the path (``"name <= value"`` for a left branch,
        ``"name > value"`` for a right branch), joined in root-to-target
        order; an empty string when ``search_node`` is not found below
        ``current_node``.
    """
    child_left = tree_.children_left[current_node]
    child_right = tree_.children_right[current_node]

    # A node without a left child has no split to describe. Return before
    # touching `features`: the feature index stored for such a node is not
    # guaranteed to be a valid position in `features`.
    if child_left == -1:
        return ""

    split_feature = str(features[tree_.feature[current_node]])
    split_value = str(tree_.threshold[current_node])

    if child_left == search_node:
        return(str(split_feature)+" <= "+str(split_value))
    if child_right == -1:
        return ""
    if child_right == search_node:
        return(str(split_feature)+" > "+str(split_value))

    # Search the left subtree first and only descend to the right when the
    # target was not found on the left.
    left_one = find_node(tree_, child_left, search_node, features)
    if len(left_one)>0:
        return(str(split_feature)+" <= "+str(split_value)+", "+left_one)

    right_one = find_node(tree_, child_right, search_node, features)
    if len(right_one)>0:
        return(str(split_feature)+" > "+str(split_value)+","+right_one)
    return ""

In [None]:
# NOTE(review): the original cell referenced `dataset1`, which is never
# defined in this notebook; showing the raw training frame instead - confirm
# which frame was meant.
dataset.head()

In [None]:
find_node(tree_ = clf.tree_, current_node = 0, search_node = 13, features = X.columns.tolist())

In [None]:
dataset[(dataset['Purchase'] >= 10000)]

In [None]:
dfDecisionPath = pd.DataFrame(clf.decision_path(X).toarray())

In [None]:
dfDecisionPath.head()

In [None]:
dTree3 = DecisionTreeRegressor(max_depth = 4)
commonfit(dTree3, train_df, test_df, predictors, target, IDcol, 'DT.csv')

# Decision-path indicators (root column dropped) appended to the raw features.
# - `axis` is passed by keyword because pandas 2.x accepts only `objs` positionally.
# - The indicator columns get a string prefix so that every column label is a
#   string; recent scikit-learn releases reject frames mixing str and int labels.
node_flags = pd.DataFrame(dTree3.decision_path(X).toarray()).iloc[:,1:].add_prefix('node_')
Xrules = pd.concat([X.reset_index(drop=True), node_flags], axis=1)


from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
linear_model.fit(Xrules, y)

In [None]:
predictors3=['Product_Category_1','Product_Category_2','Product_Category_3']
store_data=train_df[predictors3]

In [None]:
store_data= store_data.applymap(encode_units)
store_data=store_data.dropna()

In [None]:
store_data

In [None]:
frequent_itemsets = apriori(store_data, min_support=0.07, use_colnames=True)

In [None]:
frequent_itemsets

In [None]:
data

In [None]:
dTree3 = DecisionTreeRegressor(max_depth = 6)
commonfit(dTree3, train_df, test_df, predictors, target, IDcol, 'DT.csv')

# Decision-path indicators (root column dropped) appended to the raw features.
# - `axis` is passed by keyword because pandas 2.x accepts only `objs` positionally.
# - The indicator columns get a string prefix so that every column label is a
#   string; recent scikit-learn releases reject frames mixing str and int labels.
node_flags = pd.DataFrame(dTree3.decision_path(X).toarray()).iloc[:,1:].add_prefix('node_')
Xrules = pd.concat([X.reset_index(drop=True), node_flags], axis=1)


from sklearn.linear_model import LinearRegression

tree_model = DecisionTreeRegressor()
tree_model.fit(Xrules, y)