# IMPORT DATA

In [None]:
import numpy as np 
import pandas as pd 

db = pd.read_csv('/kaggle/input/ecommerce-users-of-a-french-c2c-fashion-store/6M-0K-99K.users.dataset.public.csv')

## 1. Delete unused variables 

In [None]:
# Identifier and metadata columns that are unused or repeated elsewhere in the data.
repeat_columns = ['identifierHash', 'type', 'country', 'gender', 'civilityTitle']
db1 = db.drop(columns=repeat_columns)
db1.head()

## 2. Encode variables

In [None]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

string_columns = ['language','countryCode','hasAnyApp','hasAndroidApp','hasIosApp','hasProfilePicture']

# Encode all categorical columns in one pass. OrdinalEncoder handles every
# column independently, so the codes are the same as a column-by-column fit.
encoded_values = ordinal_encoder.fit_transform(db1[string_columns])
encoded_df = pd.DataFrame(
    encoded_values,
    columns=[var + '_encoded' for var in string_columns],
    index=db1.index,  # keep the rows aligned with db1
)

# db1 is left untouched (the previous loop merged into it), so re-running this
# cell no longer produces suffixed duplicate "*_encoded" columns.
db2 = db1.drop(columns=string_columns).join(encoded_df)
db2.info()

In [None]:
# Remove variables with no correlations: seniority, in each of its three units.
no_columns = ['seniority', 'seniorityAsMonths', 'seniorityAsYears']
db3 = db2.drop(columns=no_columns)

# Predictor Analysis

In [None]:
def models(X_train,y_train,X_test,y_test):
    """Fit one interactively chosen classifier and print its evaluation.

    Prompts on stdin for a number from 1 to 6, builds the matching classifier,
    fits it on the training split and prints: accuracy on the test split, the
    mean 10-fold cross-validation accuracy on the training split, the
    classification report and the confusion matrix for the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the printed test metrics.

    Returns
    -------
    None
        Results are printed. A selection outside 1-6 prints 'Invalid Entry'
        and returns without fitting anything.

    Raises
    ------
    ValueError
        If the typed selection cannot be parsed as an integer.

    Notes
    -----
    Warnings are silenced process-wide via ``warnings.simplefilter('ignore')``
    once a valid model has been selected.
    """
    import warnings

    print('Select 1 : Naive Bayes, 2: Support Vector Machines, 3: Logistic Regression, 4: Decision Tree, 5: RandomForestClassifier,6:Xtreme gradient boosting')
    mo = int(input())

    if mo not in (1, 2, 3, 4, 5, 6):
        # Previously execution continued to model.fit with no model bound and crashed.
        print('Invalid Entry')
        return

    # Third-party imports are deferred until a valid choice is known.
    from sklearn import svm
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix , classification_report
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier
    import xgboost as xgb

    warnings.simplefilter('ignore')

    # Menu number -> zero-argument factory for the estimator.
    builders = {
        1: GaussianNB,
        2: svm.SVC,
        3: LogisticRegression,
        4: DecisionTreeClassifier,
        5: lambda: RandomForestClassifier(random_state=15325),
        6: xgb.XGBClassifier,
    }
    model = builders[mo]()

    # Use the function's own arguments; the old body read the notebook globals
    # Y_train / Y_test and silently ignored y_train / y_test.
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    print("testing set accuracy score: ", accuracy_score(y_test, predict))

    # This is cross-validation on the training split, so it is labelled as such
    # rather than as a second "testing set" score.
    accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
    print("training set 10-fold cross-validation accuracy mean: ", accuracies.mean())

    print(classification_report(y_test, predict))
    print("confusion matrix: ")
    print(confusion_matrix(y_test, predict))
    

In [None]:
#define confusion matrix visualization function
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled 'True label', columns 'Predicted label'.
    classes : sequence
        Tick labels, used for both axes in the order given.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colour map for the heat map.

    Integer matrices are annotated as whole numbers (as before). Any other
    dtype, such as a normalised float matrix, is annotated with two decimals
    instead of failing on the integer format code.
    """
    import itertools

    cm = np.asarray(cm)  # also accepts nested lists; .max() and .shape are used below
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
#define accuracy function
def predAcc(x_train,x_test,y_train,y_test,model,classes=None):
    """Fit `model` on the training split and report train/test performance.

    Prints accuracy and the classification report for both splits, then plots
    the test-split confusion matrix with `plot_confusion_matrix`.

    Parameters
    ----------
    x_train, x_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Matching labels.
    model
        Estimator exposing `fit` and `predict`; it is (re)fitted here on the
        training split.
    classes : sequence of str, optional
        Tick labels for the confusion matrix. Defaults to the seller labels
        that used to be hard-coded; pass buyer labels when the target is
        HasBought so the plot is not mislabelled.

    Returns
    -------
    None
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix , classification_report

    if classes is None:
        classes = ['Inactive sellers','Occational sellers','Frequent sellers']

    model.fit(x_train,y_train)
    predictTR= model.predict(x_train)
    predictTT= model.predict(x_test)
    print('Accuracy on train set: {:.3f}'.format(accuracy_score(y_train,predictTR)))
    print('Accuracy on test set: {:.3f}'.format(accuracy_score(y_test,predictTT)))
    print('Model Evaluation:')
    print("classification report of train set: ")
    print(classification_report(y_train,predictTR))
    print("classification report of test set: ")
    print(classification_report(y_test,predictTT))

    print("Confusion matrix of test set: ")
    cnf_matrix = confusion_matrix(y_test, predictTT)
    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                      classes=classes,
                      title='Confusion matrix  accumulate')
    
    #Y_pred_prob=logreg.predict_proba(X_test)
    #if (func in ['logreg','logregf','rlogreg','rlogregf']):
        #print(f"Coefficient: {func.coef_} ")
        #print(f"intercept: {logreg.intercept_} ")

In [None]:
#define feature selection function
def feaSelect(x_train,y_train,func,x):
    """Rank all features with recursive feature elimination and print the ranking.

    Parameters
    ----------
    x_train, y_train
        Training features and labels that the RFE selector is fitted on.
    func
        Estimator handed to `RFE`.
    x
        Object whose `.columns` name the features, in the same order as the
        columns of `x_train`.

    Prints a list of "rank.column" strings ordered from rank 1 (the last
    feature left when eliminating down to one) upwards. Returns None.

    Notes
    -----
    Warnings are silenced process-wide via ``warnings.simplefilter('ignore')``.
    """
    import warnings
    warnings.simplefilter('ignore')
    from sklearn.feature_selection import RFE

    selector = RFE(func, n_features_to_select=1).fit(x_train, y_train)

    # ranking_[j] is the rank of column j, not a column index. The old loop used
    # each rank as an index into x.columns, which printed a scrambled ranking.
    # Pair each rank with its own column, then order by rank.
    ranked = sorted(zip(selector.ranking_, x.columns), key=lambda pair: pair[0])
    feature_ranks = [f"{int(rank)}.{name}" for rank, name in ranked]

    print(feature_ranks)

In [None]:
def org(x,y):
    """Evaluate the notebook-level `model` on the full feature set.

    Splits `x`/`y` 67/33 (random_state=42), prints the train/test evaluation
    via `predAcc`, then prints the RFE feature ranking via `feaSelect`.

    Relies on the global `model` assigned in an earlier cell, and on
    `train_test_split` having been imported before the call.
    """
    #split the dataset
    X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.33, random_state=42)
    print("Original Model Evaluation: ")
    # predAcc fits `model` on the training split itself, so the separate fit
    # that used to precede it only doubled the training time.
    predAcc(X_train, X_test, Y_train, Y_test, model)
    print("Original Model's features selection: ")
    feaSelect(X_train,Y_train,model,x)

In [None]:
def filt(xf,yf):
    """Evaluate the notebook-level `model` on the filtered feature set.

    Splits `xf`/`yf` 67/33 (random_state=42), prints the train/test evaluation
    via `predAcc`, then prints the RFE feature ranking via `feaSelect`.

    Relies on the global `model` assigned in an earlier cell, and on
    `train_test_split` having been imported before the call.
    """
    #re-split the filtered dataset
    Xf_train,Xf_test,Yf_train,Yf_test=train_test_split(xf,yf,test_size=0.33, random_state=42)
    print("Filtered Model Evaluation: ")
    # predAcc fits `model` on the training split itself, so the separate fit
    # that used to precede it only doubled the training time.
    predAcc(Xf_train, Xf_test, Yf_train, Yf_test, model)
    print("Filtered Model's Feature Importance Ranking: ")
    feaSelect(Xf_train,Yf_train,model,xf)

In [None]:
def balanced(x_ros,y_ros):
    """Evaluate the notebook-level `model` on the oversampled dataset.

    Splits `x_ros`/`y_ros` 67/33 (random_state=42), prints the train/test
    evaluation via `predAcc`, the RFE feature ranking via `feaSelect`, and the
    number of rows in each split.

    Relies on the global `model` assigned in an earlier cell, and on
    `train_test_split` having been imported before the call.

    NOTE(review): the calling cells oversample before this split, so copies of
    the same minority row can land in both train and test. The printed test
    scores may therefore be optimistic. Consider splitting first and
    oversampling only the training part.
    """
    #re-split the balanced dataset
    X_ros_train,X_ros_test,Y_ros_train,Y_ros_test=train_test_split(x_ros,y_ros,test_size=0.33, random_state=42)
    print("Balanced Model Evaluation: ")
    # predAcc fits `model` on the training split itself, so the separate fit
    # that used to precede it only doubled the training time.
    predAcc(X_ros_train, X_ros_test, Y_ros_train, Y_ros_test, model)
    print("Balanced Model's feature Selections: ")
    feaSelect(X_ros_train,Y_ros_train,model,x_ros)
    print('Resample training dataset shape', Y_ros_train.shape[0])
    print('Resample testing dataset shape', Y_ros_test.shape[0])

In [None]:
def balancedFilt(xf_ros,yf_ros):
    """Evaluate the notebook-level `model` on the oversampled, filtered dataset.

    Splits `xf_ros`/`yf_ros` 67/33 (random_state=42), prints the train/test
    evaluation via `predAcc`, then prints the RFE feature ranking via
    `feaSelect`.

    Relies on the global `model` assigned in an earlier cell, and on
    `train_test_split` having been imported before the call.

    NOTE(review): the calling cells oversample before this split, so duplicated
    rows can appear in both train and test and inflate the test scores.
    """
    #re-split the balanced filtered dataset
    Xf_ros_train,Xf_ros_test,Yf_ros_train,Yf_ros_test=train_test_split(xf_ros,yf_ros,test_size=0.33, random_state=42)
    print("Balanced Filtered Model Evaluation: ")
    # predAcc fits `model` on the training split itself, so the separate fit
    # that used to precede it only doubled the training time.
    predAcc(Xf_ros_train, Xf_ros_test, Yf_ros_train, Yf_ros_test, model)
    print("Balanced Filtered Model's Feature Importance Ranking: ")
    feaSelect(Xf_ros_train,Yf_ros_train,model,xf_ros)

# HasBought As Predictor

Define X and Y 

In [None]:
from sklearn.model_selection import train_test_split
X=db3[['socialNbFollowers','socialNbFollows','socialProductsLiked','productsListed','productsSold','productsPassRate',
          'productsWished','civilityGenderId','daysSinceLastLogin','language_encoded','countryCode_encoded','hasAnyApp_encoded',
          'hasAndroidApp_encoded','hasIosApp_encoded','hasProfilePicture_encoded']]

# Three-level target from productsBought: 0 = none, 1 = one or two, 2 = three or more.
# The labels are integers rather than the strings '0'/'1'/'2': every classifier
# used here accepts them, and recent XGBoost releases reject string class labels.
# (Binary alternative: (db3['productsBought'] > 0).astype(int).)
bought = db3['productsBought']
db3['HasBought'] = np.select([bought >= 3, bought > 0], [2, 1], default=0)
# Take Y from the new column so the series is named 'HasBought'.
Y = db3['HasBought']

Six classifiers are compared on this target: two basic classifiers (Naive Bayes, Support Vector Machine), two further models (Logistic Regression, Decision Tree — both used here as classifiers, not regressors), and two ensemble models (RandomForestClassifier, Extreme Gradient Boosting).

In [None]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.33, random_state=42)
models(X_train,Y_train,X_test,Y_test)

# Improve two models with the best performance

# Logistic Regression

Model Built

In [None]:
#define logistic regression function
import numpy as np
from sklearn.linear_model import LogisticRegression
model= LogisticRegression(multi_class='multinomial',solver='newton-cg') #other solvers not converge

Model Evaluation

In [None]:
org(X,Y)

F1-score of both train and test set is extremely low

In [None]:
#according to the correlation heatmap, pairplot and feature selection, keep top 7
 
d_columns=['productsSold', 'hasAnyApp_encoded', 'productsPassRate', 'hasProfilePicture_encoded', 
           'productsListed', 'socialNbFollows', 'socialProductsLiked', 'socialNbFollowers']

#socialNBFollows & socialProductsLiked has no relationship with a user's willingness to buy a product
Xf = X.drop(d_columns, axis = 1)
Yf = db3['HasBought']
Xy=db3[['productsBought']]
Xf_pp = pd.concat([Xf,Xy], join = 'outer', axis = 1) 

Visualize the relationship between individual variable and dependent variable

In [None]:
import seaborn as sns
sns.pairplot(Xf_pp, x_vars=Xf.columns, y_vars='productsBought', height=7, aspect=0.7, kind='reg')

There is no strong linear relationship; only a weak linear relationship between productsWished and productsBought is visible.

Model Re-evaluation with the Filtered Dataset

In [None]:
filt(Xf,Yf)

In logistic regression, a negative coefficient on the logit (log-odds) scale corresponds to an odds ratio below 1, which implies a reduction in the probability that the event happens as that variable increases. To sum up:
b) negative logit value = odds ratio < 1 = decrease in the probability of the event when there is a positive change in the independent variable

New users' prediction

In [None]:
c=Xf
c.dataframeName = "Formean"
c.describe()

In [None]:
# The filtered model was fitted on a DataFrame, so the hypothetical users are
# passed as a DataFrame with the same column names and order (the columns of Xf).
# A bare list would rely on positional order only.
feature_names = ['productsWished', 'civilityGenderId', 'daysSinceLastLogin', 'language_encoded',
                 'countryCode_encoded', 'hasAndroidApp_encoded', 'hasIosApp_encoded']

#try to predict when a user using mean
newdata=[[2,2,581,2,94,0,0]]
Y_pred1 = model.predict(pd.DataFrame(newdata, columns=feature_names))
print("prediction with mean features: ",Y_pred1)
#socialNbFollowers affect the prediction the most
# NOTE(review): socialNbFollowers is not one of the filtered features listed above; confirm this remark.

#try to predict when a user using max
newdata1=[[2635,3,709,4,198,1,1]]
Y_pred2=model.predict(pd.DataFrame(newdata1, columns=feature_names))
print("prediction with max features: ", Y_pred2)

In [None]:
print(db3['HasBought'].value_counts())
#very imbalanced sample

## Bootstrap to resample the imbalanced data

Re-split the balanced dataset

In [None]:
from collections import Counter
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)

# fit predictor and target variable
# NOTE(review): the whole dataset is oversampled here and split into train/test
# afterwards, so duplicated rows can end up on both sides of the split.
X_ros, Y_ros = ros.fit_resample(X, Y)
print('Original dataset shape', Counter(Y))
print('Resample dataset shape', Counter(Y_ros))

# Label the resampled features with X's own columns instead of a hand-copied
# list, so the names cannot drift from X.
X_ros = pd.DataFrame(X_ros, columns=X.columns)
Y_ros.to_frame()

Balanced Model Evaluation

In [None]:
balanced(X_ros,Y_ros)

The f1-scores are clearly more balanced: the f1-score for class 'HasBought'=1 increased significantly, although the score for class 0 dropped slightly.

Re-split the filtered dataset

In [None]:
#according to the correlation heatmap, pairplot and feature selection, keep top 7
d_columns=[ 'productsPassRate', 'hasAndroidApp_encoded', 'daysSinceLastLogin', 'hasProfilePicture_encoded', 
           'productsListed', 'socialProductsLiked', 'socialNbFollows', 'socialNbFollowers']
#socialNBFollows & socialProductsLiked has no relationship with a user's willingness to buy a product
Xf_ros = X_ros.drop(d_columns, axis = 1)
Yf_ros = Y_ros

Balanced Model Re-evaluation with Filtered Dataset

In [None]:
balancedFilt(Xf_ros,Yf_ros)

Updated New users’ prediction

In [None]:
c=Xf_ros
c.dataframeName = "Formean"
c.describe()

In [None]:
# The balanced filtered model was fitted on a DataFrame, so the hypothetical
# users are passed as a DataFrame with the same column names and order (the
# columns of Xf_ros). A bare list would rely on positional order only.
feature_names = ['productsSold', 'productsWished', 'civilityGenderId', 'language_encoded',
                 'countryCode_encoded', 'hasAnyApp_encoded', 'hasIosApp_encoded']

#try to predict when a user using mean
newdata=[[1,22,2,2,90,0,0]]
Y_pred1 = model.predict(pd.DataFrame(newdata, columns=feature_names))
print("prediction with mean features: ",Y_pred1)
#socialNbFollowers affect the prediction the most
# NOTE(review): socialNbFollowers is not one of the filtered features listed above; confirm this remark.

#try to predict when a user using max
newdata1=[[174,2635,3,4,198,1,1]]
Y_pred2=model.predict(pd.DataFrame(newdata1, columns=feature_names))
print("prediction with max features: ", Y_pred2)

Both models used countryCode_encoded, language_encoded, hasIosApp_encoded, productsWished and civilityGenderId.
The imbalanced model additionally used daysSinceLastLogin and hasAndroidApp_encoded.
The balanced model additionally used productsSold.

The balanced-dataset prediction has a significantly higher f1-score on HasBought=1.

# Extreme Gradient Boosting

Model Built

In [None]:
import xgboost as xgb
model= xgb.XGBClassifier()

Model Evaluation

In [None]:
org(X,Y)

F1-score of both train and test set increased significantly comparing to LR's

Re-split the filtered dataset

In [None]:
#according to the correlation heatmap, pairplot and feature selection, keep top 7

d_columns=[ 'productsSold', 'socialNbFollowers', 'language_encoded', 'hasAndroidApp_encoded', 'productsListed', 
           'hasAnyApp_encoded', 'daysSinceLastLogin', 'hasProfilePicture_encoded']

#socialNBFollows & socialProductsLiked has no relationship with a user's willingness to buy a product
Xf = X.drop(d_columns, axis = 1)
Yf = db3['HasBought']
Xy=db3[['productsBought']]
Xf_pp = pd.concat([Xf,Xy], join = 'outer', axis = 1) 

Visualize the relationship between individual variable and dependent variable

In [None]:
import seaborn as sns
sns.pairplot(Xf_pp, x_vars=Xf.columns, y_vars='productsBought', height=7, aspect=0.7, kind='reg')

Model Re-evaluation with the Filtered Dataset

In [None]:
filt(Xf,Yf)

New users' prediction

In [None]:
c=Xf
c.dataframeName = "Formean"
c.describe()

In [None]:


#try to predict when a user using mean and max
# Two hypothetical users, one per row: the first built from the mean values and
# the second from the max values. Columns follow the filtered training set.
feature_names = ['socialNbFollows','socialProductsLiked','productsPassRate','productsWished','civilityGenderId',
                 'countryCode_encoded','hasIosApp_encoded']
mean_row = [8, 4, 1, 2, 2, 94, 0]
max_row = [13764, 51671, 100, 2635, 3, 198, 1]

nUser = {name: [low, high] for name, low, high in zip(feature_names, mean_row, max_row)}
df2 = pd.DataFrame(nUser, columns=feature_names)

Y_pred2 = model.predict(df2)
Y_pred2


In [None]:
print(db3['HasBought'].value_counts())
#very imbalanced sample

## Bootstrap to resample the imbalanced data

Balanced Model Evaluation

In [None]:
balanced(X_ros,Y_ros)

In [None]:
#according to the correlation heatmap, pairplot and feature selection, keep top 7
d_columns=[ 'productsPassRate', 'socialNbFollowers', 'countryCode_encoded', 'daysSinceLastLogin', 
           'productsListed', 'hasAnyApp_encoded', 'productsWished', 'language_encoded']
Xf_ros = X_ros.drop(d_columns, axis = 1)
Yf_ros = Y_ros

Balanced Model Re-evaluation with the Filtered Dataset

In [None]:
balancedFilt(Xf_ros,Yf_ros)

New User's Prediction

In [None]:
c=Xf_ros
c.dataframeName = "Formean"
c.describe()

In [None]:
#try to predict when a user using mean and max
# Two hypothetical users, one per row: the first built from the mean values and
# the second from the max values. Columns follow the balanced filtered training set.
feature_names = ['socialNbFollows','socialProductsLiked','productsSold','civilityGenderId',
                 'hasAndroidApp_encoded','hasIosApp_encoded', 'hasProfilePicture_encoded']
mean_row = [10, 47, 1, 2, 0, 0, 1]
max_row = [13764, 51671, 174, 3, 1, 1, 1]

nUser = {name: [low, high] for name, low, high in zip(feature_names, mean_row, max_row)}
df2 = pd.DataFrame(nUser, columns=feature_names)

Y_pred2 = model.predict(df2)
Y_pred2



For the filtered dataset (both imbalanced and balanced):
They both used productsWished, civilityGenderId, countryCode_encoded, hasIosApp_encoded
Imbalanced used socialProductsLiked, socialNbFollows, productsPassRate
Balanced used productsSold, language_encoded, hasAnyApp_encoded

The balanced-dataset prediction has a significantly higher f1-score on HasBought=1

# HasSold As Predictor

Define X and Y

In [None]:
from sklearn.model_selection import train_test_split
X=db3[['socialNbFollowers','socialNbFollows','socialProductsLiked','productsListed','productsBought','productsPassRate',
          'productsWished','civilityGenderId','daysSinceLastLogin','language_encoded','countryCode_encoded','hasAnyApp_encoded',
          'hasAndroidApp_encoded','hasIosApp_encoded','hasProfilePicture_encoded']]

# Three-level target from productsSold: 0 = none, 1 = one to five, 2 = six or more.
# The labels are integers rather than the strings '0'/'1'/'2': every classifier
# used here accepts them, and recent XGBoost releases reject string class labels.
sold = db3['productsSold']
db3['HasSold'] = np.select([sold >= 6, sold > 0], [2, 1], default=0)
# Take Y from the new column so the series is named 'HasSold'.
Y = db3['HasSold']

Six classifiers are compared on this target: two basic classifiers (Naive Bayes, Support Vector Machine), two further models (Logistic Regression, Decision Tree — both used here as classifiers, not regressors), and two ensemble models (RandomForestClassifier, Extreme Gradient Boosting).

In [None]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.33, random_state=42)
models(X_train,Y_train,X_test,Y_test)

# Improve model with the best performance
since all have almost the same accuracy on the test set

Model Built

In [None]:
import xgboost as xgb
model= xgb.XGBClassifier()

Model Evaluation

In [None]:
org(X,Y)

Re-split the filtered dataset

In [None]:
#according to the correlation heatmap, pairplot and feature selection, keep top 7
d_columns=[ 'productsBought', 'productsListed', 'productsPassRate', 'civilityGenderId', 'language_encoded',
           'hasAndroidApp_encoded', 'hasProfilePicture_encoded', 'productsWished']

#socialNBFollows & socialProductsLiked has no relationship with a user's willingness to buy a product
Xf = X.drop(d_columns, axis = 1)
Yf = db3['HasSold']
Xy=db3[['productsSold']]
Xf_pp = pd.concat([Xf,Xy], join = 'outer', axis = 1)

Visualize the relationship between individual variable and dependent variable

In [None]:
import seaborn as sns
sns.pairplot(Xf_pp, x_vars=Xf.columns, y_vars='productsSold', height=7, aspect=0.7, kind='reg')

Model Re-evaluation with the Filtered Dataset

In [None]:
filt(Xf,Yf)

f1-score on HasSold=1 dropped significantly on filtered dataset

New users' prediction

In [None]:
c=Xf
c.dataframeName = "Formean"
c.describe()

In [None]:
#try to predict when a user using mean and max
# Two hypothetical users, one per row: the first built from the mean values and
# the second from the max values. Columns follow the filtered training set.
feature_names = ['socialNbFollowers','socialNbFollows','socialProductsLiked','daysSinceLastLogin',
                 'countryCode_encoded','hasAnyApp_encoded','hasIosApp_encoded']
mean_row = [3, 8, 4, 581, 94, 0, 0]
max_row = [744, 13764, 51671, 709, 198, 1, 1]

nUser = {name: [low, high] for name, low, high in zip(feature_names, mean_row, max_row)}
df2 = pd.DataFrame(nUser, columns=feature_names)

Y_pred2 = model.predict(df2)
Y_pred2


## Bootstrap to resample the imbalanced data

In [None]:
from collections import Counter
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)

# fit predictor and target variable
# NOTE(review): the whole dataset is oversampled here and split into train/test
# afterwards, so duplicated rows can end up on both sides of the split.
X_ros, Y_ros = ros.fit_resample(X, Y)
print('Original dataset shape', Counter(Y))
print('Resample dataset shape', Counter(Y_ros))

# Label the resampled features with X's own columns instead of a hand-copied
# list, so the names cannot drift from X.
X_ros = pd.DataFrame(X_ros, columns=X.columns)
Y_ros.to_frame()

Balanced Model Evaluation

In [None]:
balanced(X_ros,Y_ros)

In [None]:
#according to the correlation heatmap, pairplot and feature selection, keep top 7

d_columns=[ 'productsWished', 'productsListed', 'productsPassRate', 'daysSinceLastLogin',
           'hasIosApp_encoded', 'hasProfilePicture_encoded', 'hasAndroidApp_encoded', 'productsBought']

Xf_ros = X_ros.drop(d_columns, axis = 1)
Yf_ros = Y_ros

Balanced Model Re-evaluation with the Filtered Dataset

In [None]:
balancedFilt(Xf_ros,Yf_ros)

overall accuracy and f1-score on HasSold=1 decreased on filtered dataset for both balanced and imbalanced cases.
Especially the f1-score on HasSold=1 decreased significantly on imbalanced cases.

New User's Prediction

In [None]:
c=Xf_ros
c.dataframeName = "Formean"
c.describe()

In [None]:
#try to predict when a user using mean and max
# Two hypothetical users, one per row: the first built from the mean values and
# the second from the max values. Columns follow the balanced filtered training set.
feature_names = ['socialNbFollowers','socialNbFollows','socialProductsLiked','civilityGenderId',
                 'language_encoded','countryCode_encoded','hasAnyApp_encoded']
mean_row = [13, 31, 78, 2, 2, 86, 1]
max_row = [744, 13764, 51671, 3, 4, 198, 1]

nUser = {name: [low, high] for name, low, high in zip(feature_names, mean_row, max_row)}
df2 = pd.DataFrame(nUser, columns=feature_names)

Y_pred2 = model.predict(df2)
Y_pred2


In [None]:
For the filtered dataset (both imbalanced and balanced):
They both had variables socialNbFollowers, countryCode_encoded, socialNbFollows, socialProductsLiked, hasAnyApp_encoded
Imbalanced dataset used variables daysSinceLastLogin, hasIosApp_encoded
Balanced dataset used language_encoded, civilityGenderId

They chose almost the same variables for model building
But the balanced-dataset prediction only has a slightly lower f1-score on HasSold=1