In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/analyticsvjobathon/train_wn75k28.csv
/kaggle/input/analyticsvjobathon/sample_submission_2zvVjBu.csv
/kaggle/input/analyticsvjobathon/test_Wf7sxXF.csv


The problem was to find the potential lead for the startup by identifying the leads which will buy product in next 3 months given the data containing various information regarding user’s past activity including whether the user bought a product in upcoming 3 months.
Since, the given dataset included whether the lead bought the product in the past or not, it was a Supervised Learning Problem.
This was a classification problem with target variable (‘buy’) taking 2 values: 0 (lead won’t buy the product) and 1(lead will buy the product).
The training set had 17 variables where 2 of them had dates and rest of them had integer type data.
Dates are non-acceptable form of data types for most models. So, Dates were changed to suitable data type.
Different suitable classification models were tried and the one with the highest F1 score was used.
Training dataset was split into 2 sets containing 75% and 25% (validation set) of the whole data. Validation dataset was used to tune the parameters of the model selected.
The approach with the highest F1 score on validation set was finalized and used to predict the target variable ‘buy’ on given test dataset.

***Preprocessing the dataset:***
There were two problems with the dataset provided:
1. Data type of ‘created_ at’ and ‘signup_date’ columns:
They could not be used as they contained ‘object’ type data and that could not be directly changed to useful integers. But, calculating how far ago leads were dropped and signed up on the website can be a useful indicator of whether they will buy the product or not. So, all dates were changed to how many days ago the lead dropped or signed up. The days could be easily converted to integer data type.
2. Missing values in ‘products_ purchased’ and ‘signup_date’ columns:
Missing values could indicate no product purchased in ‘products_purchased’ and no signup in ‘signup_date’. The natural step will be to replace missing values with 0 in both the columns.
After these steps, all the features were normalized as the range of values were very different for different features given which might affect our accuracy in predictions (as also found when tested).

In [2]:
import datetime
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier

In [3]:
#Reading data from train and test datasets
file_path = '../input/analyticsvjobathon/train_wn75k28.csv'
lead_data = pd.read_csv(file_path,index_col='id')

file_path2='../input/analyticsvjobathon/test_Wf7sxXF.csv'
X_test=pd.read_csv(file_path2,index_col='id')

In [4]:
#Seperating predictors and target values

X=lead_data.copy()
y=X.buy
X_full=X.drop(['buy'], axis=1)



In [5]:
#Understanding data
X_full.info()
X_full.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39161 entries, 1 to 39161
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   created_at            39161 non-null  object 
 1   campaign_var_1        39161 non-null  int64  
 2   campaign_var_2        39161 non-null  int64  
 3   products_purchased    18250 non-null  float64
 4   signup_date           24048 non-null  object 
 5   user_activity_var_1   39161 non-null  int64  
 6   user_activity_var_2   39161 non-null  int64  
 7   user_activity_var_3   39161 non-null  int64  
 8   user_activity_var_4   39161 non-null  int64  
 9   user_activity_var_5   39161 non-null  int64  
 10  user_activity_var_6   39161 non-null  int64  
 11  user_activity_var_7   39161 non-null  int64  
 12  user_activity_var_8   39161 non-null  int64  
 13  user_activity_var_9   39161 non-null  int64  
 14  user_activity_var_10  39161 non-null  int64  
 15  user_activity_var_1

Unnamed: 0_level_0,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,2021-01-01,1,2,2.0,2020-09-24,0,0,0,0,0,0,1,1,0,0,0,0
2,2021-01-01,2,1,2.0,2020-09-19,1,0,1,0,0,0,1,0,0,0,0,0
3,2021-01-01,9,3,3.0,2021-08-11,1,0,0,0,0,0,0,0,0,0,0,0
4,2021-01-01,6,7,2.0,2017-10-04,0,0,0,0,0,0,0,0,0,0,0,0
5,2021-01-01,4,6,,2020-06-08,0,0,0,0,0,0,1,0,0,0,1,0


In [6]:
#Chnaging dates to datetime datatype for manipulation
X_full['created_at'] = pd.to_datetime(X_full['created_at'])
X_full['signup_date']=pd.to_datetime(X_full['signup_date'])

In [7]:
#Changing dates to time elapsed from today
dt = datetime.date.today()
today = pd.DatetimeIndex([dt])[0]
X_full['signup_date']=today-X_full['signup_date']
X_full['created_at']=today-X_full['created_at']

In [8]:
X_full.info()
X_full.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 39161 entries, 1 to 39161
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype          
---  ------                --------------  -----          
 0   created_at            39161 non-null  timedelta64[ns]
 1   campaign_var_1        39161 non-null  int64          
 2   campaign_var_2        39161 non-null  int64          
 3   products_purchased    18250 non-null  float64        
 4   signup_date           24048 non-null  timedelta64[ns]
 5   user_activity_var_1   39161 non-null  int64          
 6   user_activity_var_2   39161 non-null  int64          
 7   user_activity_var_3   39161 non-null  int64          
 8   user_activity_var_4   39161 non-null  int64          
 9   user_activity_var_5   39161 non-null  int64          
 10  user_activity_var_6   39161 non-null  int64          
 11  user_activity_var_7   39161 non-null  int64          
 12  user_activity_var_8   39161 non-null  int64          
 13  u

Unnamed: 0_level_0,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,615 days,1,2,2.0,714 days,0,0,0,0,0,0,1,1,0,0,0,0
2,615 days,2,1,2.0,719 days,1,0,1,0,0,0,1,0,0,0,0,0
3,615 days,9,3,3.0,393 days,1,0,0,0,0,0,0,0,0,0,0,0
4,615 days,6,7,2.0,1800 days,0,0,0,0,0,0,0,0,0,0,0,0
5,615 days,4,6,,822 days,0,0,0,0,0,0,1,0,0,0,1,0


In [9]:
#Exctracting days and changing datatypes to integers
X_full['created_at'] = (X_full['created_at'] / np.timedelta64(1,'D')).astype(int,errors='ignore')
X_full['signup_date'] = (X_full['signup_date'] / np.timedelta64(1,'D')).astype(int,errors='ignore')

In [10]:
X_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39161 entries, 1 to 39161
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   created_at            39161 non-null  int64  
 1   campaign_var_1        39161 non-null  int64  
 2   campaign_var_2        39161 non-null  int64  
 3   products_purchased    18250 non-null  float64
 4   signup_date           24048 non-null  float64
 5   user_activity_var_1   39161 non-null  int64  
 6   user_activity_var_2   39161 non-null  int64  
 7   user_activity_var_3   39161 non-null  int64  
 8   user_activity_var_4   39161 non-null  int64  
 9   user_activity_var_5   39161 non-null  int64  
 10  user_activity_var_6   39161 non-null  int64  
 11  user_activity_var_7   39161 non-null  int64  
 12  user_activity_var_8   39161 non-null  int64  
 13  user_activity_var_9   39161 non-null  int64  
 14  user_activity_var_10  39161 non-null  int64  
 15  user_activity_var_1

In [11]:
#Splitting data in training and test sets
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, test_size=0.25, random_state=1)

***Final Model:***
The model was trained on train dataset, parameters were tested on validation set and finally fitted to test set.
After trying different models like logistic regression, random forest classifier etc., the model that gave the highest F1 score on validation set was selected. The final model was Neural Network (Multi-layer Perceptron Classifier). Different values for the different parameter like no. of hidden units, maximum iteration were tested and the ones with highest F1 score on validation set were selected.
That was the final model which was finally applied to test set given and uploaded.

In [12]:
#Preprocessing data
cols = X_train.columns

#using simpleimputer for imputing missing values and minmaxscaler for normalisation
imp=SimpleImputer(strategy='constant')
scaler = MinMaxScaler()
X_train = imp.fit_transform(X_train)
X_valid = imp.transform(X_valid)

X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

X_train= pd.DataFrame(X_train, columns=[cols])
X_valid = pd.DataFrame(X_valid, columns=[cols])
X_train.describe()
X_train.head()

Unnamed: 0,created_at,campaign_var_1,campaign_var_2,products_purchased,signup_date,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12
0,0.653846,0.333333,0.214286,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
1,0.296703,0.266667,0.214286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.5,0.0,0.0,0.0,0.0
2,0.711538,0.133333,0.0,0.5,0.118599,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.21978,0.266667,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.5,0.0,0.0,0.0,0.25,0.0
4,0.513736,0.4,0.571429,0.0,0.434111,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#After tuning the various paramters on validation set (giving highest F1 score):
#the final parameters are used to train the model.
model = MLPClassifier(random_state=1,hidden_layer_sizes=(30,20),max_iter=250)
model.fit(X_train, y_train)
preds = model.predict(X_valid)
F1=f1_score(y_valid,preds)
print(F1)



0.7193195625759417




In [14]:
ID =X_test.index
X_test['created_at'] = pd.to_datetime(X_test['created_at'])
X_test['signup_date']=pd.to_datetime(X_test['signup_date'])
X_test['signup_date']=today-X_test['signup_date']
X_test['created_at']=today-X_test['created_at']
X_test['created_at'] = (X_test['created_at'] / np.timedelta64(1,'D')).astype(int,errors='ignore')
X_test['signup_date'] = (X_test['signup_date'] / np.timedelta64(1,'D')).astype(int,errors='ignore')
X_test = imp.transform(X_test)
X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test, columns=[cols])
preds = model.predict(X_test)




In [15]:
output = pd.DataFrame({'id': ID,
                       'buy': preds})
output.to_csv('submission.csv', index=False)