In [92]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV

In [93]:
# Load the labelled conversion dataset; it is split into train and test sets further down.
data = pd.read_csv(filepath_or_buffer='./src/conversion_data_train.csv')
print('Set with labels (our train+test) :', data.shape)

Set with labels (our train+test) : (284580, 6)


In [5]:
# Preview the first five rows to check the column names and value formats.
data.head(n=5)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


In [6]:
# The dataset is large, so the visualizations are built from a 10,000-row random
# sample instead of the full frame.
# random_state pins the draw so the EDA figures are identical on every
# Restart & Run All.
data_sample = data.sample(10000, random_state=42)

## EDA

The target is 'converted'

In [32]:
# One histogram per explanatory variable, each with `converted` on the y axis,
# built from the sampled rows.
fig1, fig2, fig3, fig4, fig5 = (
    px.histogram(data_sample, x=column, y='converted')
    for column in ('country', 'age', 'new_user', 'source', 'total_pages_visited')
)

In [35]:
# 5x1 grid of subplots: one row per explanatory variable.
# Each title is the exact name of the column plotted in that row.
fig = make_subplots(
    rows=5,
    cols=1,
    subplot_titles=('country', 'age', 'new_user', 'source', 'total_pages_visited'),
)

In [36]:
# Copy the traces of each standalone histogram into its own row of the shared figure.
for row_number, source_fig in enumerate((fig1, fig2, fig3, fig4, fig5), start=1):
    for trace in source_fig.data:
        fig.add_trace(trace, row=row_number, col=1)

In [39]:

# Set the overall figure title; the subplot titles complete the sentence with the variable name.
fig.update_layout(title_text="Nb. of conversion per : ")

In [10]:
# Histogram of the sample with `converted` on x and `total_pages_visited` on y.
px.histogram(data_sample, x='converted', y='total_pages_visited')

In [43]:
# Count missing values per column to decide whether imputation is needed.
data.isna().sum()

country                0
age                    0
new_user               0
source                 0
total_pages_visited    0
converted              0
dtype: int64

## Training

In [41]:
# Baseline setup: a single numeric feature is used to predict the binary target.
target_variable = 'converted'
features_list = ['total_pages_visited']
numeric_indices = [0]
categorical_indices = []

# Feature frame and target series.
X = data[features_list]
y = data[target_variable]

In [42]:
# Hold out 10% of the rows for testing; stratify on the target so both splits
# keep the same conversion rate, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

The feature is numerical and has no missing values, so scaling is the only preprocessing needed. The target is categorical but already encoded as 0/1, so it needs no further preprocessing.

In [44]:
# Standardizer for the numeric feature(s); it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [45]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split (the tuple is evaluated left to right).
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

The target is categorical, so we need a classifier. I am training the baseline model with a logistic regression first.

In [46]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
lr = LogisticRegression()

In [47]:
# Train the baseline model on the scaled training split.
lr.fit(X_train, y_train)

In [48]:
# Predicted labels on both splits, reused below for the f1-score.
y_train_pred, y_test_pred = (lr.predict(split) for split in (X_train, X_test))


In [49]:
# `score` on a classifier is the mean accuracy on the given split.
print(f'The train score is : {lr.score(X_train, y_train)}')
print(f'The test score is : {lr.score(X_test, y_test)}')

The train score is : 0.9829495318637212
The test score is : 0.9830276196500105


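High accuracy and no overfitting (train and test scores are nearly identical). Note that accuracy is inflated by class imbalance: only about 3% of sessions convert, so the f1-score below is the more informative metric.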

In [52]:
# f1-score of the positive class on each split. The labels name the split so
# the two printed values cannot be confused (same wording as the accuracy cell).
print('The train f1-score is : {}'.format(f1_score(y_train, y_train_pred)))
print('The test f1-score is : {}'.format(f1_score(y_test, y_test_pred)))

The f1-score is : 0.6951908983039017
The f1-score is : 0.6944971537001897


Refining the logistic regression parameters

Now I will try to improve the score by using a decision tree and refining parameters using grid search

## Decision Tree

In [61]:
from sklearn.tree import DecisionTreeClassifier

In [62]:
# Decision tree using the Gini impurity criterion; the seed keeps the fitted tree reproducible.
classifier_g = DecisionTreeClassifier(criterion='gini', random_state=42)

In [107]:
# Hyper-parameter grid for the decision tree: candidate maximum depths.
params = dict(max_depth=[1, 2, 3, 4])

test

In [108]:
# Tune max_depth with 10-fold cross-validation on the TRAINING split only.
# The held-out test split must never be used for fitting: a model selected and
# trained on it would make the test report computed in the next cell an
# optimistic, leaked estimate instead of a measure of generalization.
grid_g = GridSearchCV(classifier_g, param_grid=params, cv=10)
grid_g.fit(X_train, y_train)

In [109]:
# Retrieve the refitted estimator with the best cross-validated parameters and display it.
best_model_g = grid_g.best_estimator_
best_model_g

In [110]:
# Per-class precision / recall / f1 of the tuned tree on the test split.
best_pred_g = best_model_g.predict(X_test)
print(classification_report(y_true=y_test, y_pred=best_pred_g))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     27540
           1       0.83      0.60      0.69       918

    accuracy                           0.98     28458
   macro avg       0.91      0.80      0.84     28458
weighted avg       0.98      0.98      0.98     28458



train

In [111]:
# 10-fold cross-validated search over `params`, fitted on the training split.
grid_g = GridSearchCV(
    estimator=classifier_g,
    param_grid=params,
    cv=10,
)
grid_g.fit(X_train, y_train)

In [112]:
# Retrieve the refitted estimator with the best cross-validated parameters and display it.
best_model_g = grid_g.best_estimator_
best_model_g

In [114]:
# Per-class precision / recall / f1 of the tuned tree on the training split.
best_pred_g = best_model_g.predict(X_train)
print(classification_report(y_true=y_train, y_pred=best_pred_g))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    247860
           1       0.82      0.60      0.70      8262

    accuracy                           0.98    256122
   macro avg       0.90      0.80      0.84    256122
weighted avg       0.98      0.98      0.98    256122



## Multi features

In [4]:
from sklearn.svm import SVC

In [6]:
# Zero-row preview: shows only the column headers as a reminder of the available features.
data.head(0)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted


In [7]:
# Two-feature setup: pages visited plus the new-user flag.
target_variable = 'converted'
features_list = ['total_pages_visited', 'new_user']

# Feature frame and target series.
X = data[features_list]
y = data[target_variable]

In [8]:
# Column groups consumed by the ColumnTransformer: scaled vs. one-hot encoded.
numerical_features, categorical_features = ['total_pages_visited'], ['new_user']

In [9]:
# One-hot encode the categorical columns, dropping the first level of each.
categorical_transformer = OneHotEncoder(drop='first')

# Standardize the numeric columns.
numeric_transformer = StandardScaler()

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [10]:
# Stratified 90/10 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [11]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformation on the test split (the tuple is evaluated left to right).
X_train, X_test = preprocessor.fit_transform(X_train), preprocessor.transform(X_test)

## Random Forest

In [114]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

## Passer au regressor pour pouvoir du lasso ensuite

In [138]:
# Random forest of 150 entropy-criterion trees, trained on all CPU cores with a fixed seed.
rfc = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    random_state=42,
    n_jobs=-1,
)


In [None]:
# TODO: run a GridSearchCV here

In [139]:

# Train the random forest on the preprocessed training split.
rfc.fit(X_train, y_train)

In [140]:
# Random-forest report on the training split, to compare against the test report below.
y_train_pred_rfc = rfc.predict(X_train)
print(classification_report(y_true=y_train, y_pred=y_train_pred_rfc))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    247860
           1       0.89      0.73      0.80      8262

    accuracy                           0.99    256122
   macro avg       0.94      0.86      0.90    256122
weighted avg       0.99      0.99      0.99    256122



In [32]:
# Random-forest report on the held-out test split.
y_test_pred_rfc = rfc.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred_rfc))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     27540
           1       0.85      0.64      0.73       918

    accuracy                           0.98     28458
   macro avg       0.92      0.82      0.86     28458
weighted avg       0.98      0.98      0.98     28458



Great scores overall; they could be improved with feature selection (lasso) and a grid search with cross-validation. That is the next step.

Note : Others had better score with optimized logistic regressions, let's see how it compares with my refined RF

## Lasso for multiple feature selection

In [143]:
# Two-row reminder of the raw columns before selecting all of them as features.
data.head(2)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0


In [229]:
# Define the full feature set for this section explicitly. Without it, a
# top-to-bottom run still sees the two-feature list from the previous section,
# and the preprocessor below cannot find the `country`, `age` and `source` columns.
features_list = ['country', 'age', 'new_user', 'source', 'total_pages_visited']
features_list

['country', 'age', 'new_user', 'source', 'total_pages_visited']

In [167]:
# Feature frame and target series for the multi-feature model.
X = data.loc[:, features_list]
y = data.loc[:, target_variable]

In [168]:
# Column groups for preprocessing: `new_user` is already 0/1, so it is treated
# as numeric here; the string columns are one-hot encoded.
categorical_features = ['country', 'source']
numerical_features = ['age', 'total_pages_visited', 'new_user']

In [169]:
# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [170]:
# Shapes before preprocessing (rows, raw feature columns).
for split in (X_train, X_test):
    print(split.shape)

(227664, 5)
(56916, 5)


In [171]:
# Note: this preprocessing turns the 5 input columns into 8 output columns,
# because one-hot encoding expands each categorical feature into several columns.

# Numeric branch: standardization only.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encoding, dropping the first level of each feature.
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(drop='first'))])

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [172]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformation on the test split (the tuple is evaluated left to right).
X_train, X_test = preprocessor.fit_transform(X_train), preprocessor.transform(X_test)

One-hot encoding (dropping the first category of each variable to avoid the dummy-variable trap, i.e. perfectly collinear columns) produces an array with 8 columns instead of 5: 3 numeric features, 3 dummies for country and 2 for source.

In [173]:
# Shapes after preprocessing (rows, encoded feature columns).
for split in (X_train, X_test):
    print(split.shape)

(227664, 8)
(56916, 8)


In [174]:
# Logistic regression with an L1 (lasso) penalty. The default 'lbfgs' solver
# does not support the l1 penalty, so an l1-capable solver is required; 'saga'
# is one of them.
# saga shuffles the data while optimizing, so random_state is pinned to make the
# fitted coefficients (and the feature ranking derived from them) reproducible.
logistic_lasso = LogisticRegression(penalty='l1', solver='saga', random_state=42)

In [175]:
# Candidate regularization strengths (C is the inverse of the penalty strength:
# smaller C means stronger regularization).
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0, 100.0]
}

# Select C on the f1-score rather than the default accuracy: converted sessions
# are a small minority, so accuracy is dominated by the negative class, and the
# notebook evaluates every model with f1.
grid_search = GridSearchCV(estimator=logistic_lasso, param_grid=param_grid, scoring='f1')

In [176]:
# Run the cross-validated search over C on the preprocessed training split.
grid_search.fit(X_train, y_train)

In [194]:
# Keep the refitted estimator with the best cross-validated C and display it.
best_lasso = grid_search.best_estimator_
best_lasso

In [222]:
# L1 logistic regression report on the training split.
y_train_pred = best_lasso.predict(X_train)
print(classification_report(y_true=y_train, y_pred=y_train_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    220320
           1       0.85      0.69      0.76      7344

    accuracy                           0.99    227664
   macro avg       0.92      0.84      0.88    227664
weighted avg       0.99      0.99      0.99    227664



In [223]:
# L1 logistic regression report on the held-out test split.
y_test_pred = best_lasso.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     55080
           1       0.87      0.69      0.77      1836

    accuracy                           0.99     56916
   macro avg       0.93      0.84      0.88     56916
weighted avg       0.99      0.99      0.99     56916



In [224]:
# f1-score of the positive class on each split. The labels name the split so
# the two printed values cannot be confused.
print('The train f1-score is : {}'.format(f1_score(y_train, y_train_pred)))
print('The test f1-score is : {}'.format(f1_score(y_test, y_test_pred)))

The f1-score is : 0.7618832050701675
The f1-score is : 0.7682520448348985


In [204]:
# Flatten the fitted coefficients (first row of `coef_`) into a plain Python list.
coef = grid_search.best_estimator_.coef_[0].tolist()
coef

[-0.5990456818288621,
 2.528376047594987,
 -0.7848328922139243,
 3.7138880374791756,
 3.5344122817328034,
 3.1907671571292635,
 -0.20433411626551062,
 -0.033890556991934793]

creer un data set lasso avec les colonnes à 0 en moins, et donc on peut utiliser nimporte quel modele, ne pas oublier d'utliser meme preprocessor

In [200]:
# Retrieve the output feature names produced by the preprocessor, since one-hot encoding created new columns
feature_names = preprocessor.get_feature_names_out(features_list).tolist()
feature_names

['num__age',
 'num__total_pages_visited',
 'num__new_user',
 'cat__country_Germany',
 'cat__country_UK',
 'cat__country_US',
 'cat__source_Direct',
 'cat__source_Seo']

In [206]:
# Pair each output feature name with its L1 logistic-regression coefficient.
coef_features = dict(Features=feature_names, Coeff_Lasso=coef)
coef_df = pd.DataFrame(data=coef_features)

In [212]:
# Display the signed coefficients per feature.
coef_df

Unnamed: 0,Features,Coeff_Lasso
0,num__age,-0.599046
1,num__total_pages_visited,2.528376
2,num__new_user,-0.784833
3,cat__country_Germany,3.713888
4,cat__country_UK,3.534412
5,cat__country_US,3.190767
6,cat__source_Direct,-0.204334
7,cat__source_Seo,-0.033891


In [218]:
# Rank features by coefficient magnitude: take absolute values (the sign is
# discarded from here on), then order from smallest to largest.
coef_df['Coeff_Lasso'] = abs(coef_df['Coeff_Lasso'])
coef_df = coef_df.sort_values(by='Coeff_Lasso', ascending=True)

In [219]:
# Plot the coefficient magnitude of each feature.
px.histogram(coef_df, x='Features', y='Coeff_Lasso')

Pretty good rank on the general leaderboard, but I could improve it by adding parameters to my grid search and by trying boosting (XGBoost, AdaBoost, etc.)!

## SVM 

not great and too long, stop 

In [11]:
# Polynomial-kernel SVC; class_weight='balanced' reweights classes inversely to their frequency.
svm = SVC(kernel = 'poly', class_weight='balanced')

In [134]:
# Disabled experiment: grid search over C and gamma for the SVC (left commented out).
# params = {'C' : [1,5,10],
#           'gamma' : [0.0001, 0.0005, 0.001, 0.005]}

# svm_gridsearch = GridSearchCV(svm, param_grid=params)
# svm_gridsearch

In [135]:
# Disabled experiment: fitting the SVC grid search (left commented out).
# svm_gridsearch.fit(X_train, y_train)

In [12]:
# Fit the polynomial-kernel SVC on the current training split.
# NOTE(review): the markdown note at the top of this section says the SVM was too
# slow and was stopped; consider skipping this cell on a full re-run — TODO confirm.
svm.fit(X_train, y_train)

Because we want to make sure no money is lost, precision would be better to maximise here

## Submission

In [227]:
# Stack the preprocessed train and test splits back together so the final model
# is trained on every labelled row.
X = np.append(arr=X_train, values=X_test, axis=0)
y = np.append(arr=y_train, values=y_test)

# Refit the selected L1 logistic regression on the full labelled set.
best_lasso.fit(X, y)

In [228]:
# Load the unlabeled test data used for the submission.
test = pd.read_csv(filepath_or_buffer='./src/conversion_data_test.csv')


In [230]:
# Select the feature columns used for training, in the same order.
features_list = ['country', 'age', 'new_user', 'source', 'total_pages_visited']
X_without_labels = test[features_list]

In [231]:
# Display the unlabeled feature frame to check its columns and size.
X_without_labels

Unnamed: 0,country,age,new_user,source,total_pages_visited
0,UK,28,0,Seo,16
1,UK,22,1,Direct,5
2,China,32,1,Seo,1
3,US,32,1,Ads,6
4,China,25,0,Seo,3
...,...,...,...,...,...
31615,Germany,25,1,Seo,3
31616,US,36,1,Ads,7
31617,UK,33,1,Seo,5
31618,UK,25,1,Seo,14


In [232]:
# Apply the already-fitted preprocessor to the unlabeled data.
# NOTE(review): the name is rebound from the raw frame to the transformed output, so
# re-running this cell alone feeds transformed data back in — presumably an error; verify.
X_without_labels = preprocessor.transform(X_without_labels)

In [234]:
# Show the preprocessor's output column names, in output order.
preprocessor.get_feature_names_out()

array(['num__age', 'num__total_pages_visited', 'num__new_user',
       'cat__country_Germany', 'cat__country_UK', 'cat__country_US',
       'cat__source_Direct', 'cat__source_Seo'], dtype=object)

In [235]:
# Predict conversions for the unlabeled rows with the refitted model.
predictions = best_lasso.predict(X_without_labels)

# Wrap the predictions in a single-column frame, ready to be written to CSV.
predictions_df = pd.DataFrame(data=predictions, columns=['Prediction'])


In [236]:
# Number of predictions produced, to compare against the row count of the test file.
len(predictions_df)

31620

In [238]:
# Write the predictions to CSV without the index column.
predictions_df.to_csv('conversion_data_test_predictions_ROMAINP-LogReg-best-L1.csv', index=False)
# Make sure to follow the CSV naming rule -> conversiondata_test_predictions{name}-{model}.csv