#### Import Essential Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as plt
import re
from datetime import datetime, date
import time

# Sklearn Libraries
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

#### Essential Functions

In [None]:
# Arithmetic mean of a list of numbers (used for the users' ages)
def average(lst):
    """Return the sum of ``lst`` divided by its length.

    Raises ZeroDivisionError when ``lst`` is empty.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

# Reduce an email domain such as '@gmail.com' to the part before its first dot
def refineEmailDomain(domain):
    """Strip every '@' from ``domain`` and return the text before the first '.'."""
    withoutAt = re.sub('@', '', domain)
    provider, _, _ = withoutAt.partition('.')
    return provider

# Turn a "YYYY-MM-DD HH:MM:SS" timestamp string into a number of seconds
def convertTime(timestamp):
    """Parse ``timestamp`` and return it as seconds since the epoch (float).

    The conversion goes through ``time.mktime``, so the string is read as
    local time of the machine running the code.
    """
    parsed = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    localStruct = parsed.timetuple()
    return time.mktime(localStruct)

# Get the time zone suffix of a timestamp string
def getTimeZone(timestamp):
    """Return the last six characters of ``timestamp`` (e.g. '+08:00')."""
    zoneSuffix = slice(-6, None)
    return timestamp[zoneSuffix]

# Cut the time zone suffix off a timestamp string
def removeTimeZone(timestamp):
    """Return the first 19 characters of ``timestamp`` ("YYYY-MM-DD HH:MM:SS")."""
    dateTimePart = slice(0, 19)
    return timestamp[dateTimePart]

# Number of whole days between a timestamp and today's date
def getDateDifference(timestamp):
    """Return the absolute day count between ``timestamp`` and today at midnight.

    ``timestamp`` is converted with ``str`` and must then read
    "YYYY-MM-DD HH:MM:SS". The result depends on the date the code runs on.
    """
    moment = datetime.strptime(str(timestamp), "%Y-%m-%d %H:%M:%S")

    # date.today() has no time part, so "today" is today at 00:00:00
    todayMidnight = datetime.strptime(str(date.today()), "%Y-%m-%d")

    elapsed = todayMidnight - moment
    return abs(elapsed.days)

In [None]:
# Load the training set, report its size and preview the first rows
trainPath = '../input/student-shopee-code-league-marketing-analytics/train.csv'
trainDF = pd.read_csv(trainPath)
print(f"There are {len(trainDF)} data points.")
trainDF.head()

In [None]:
# Finding the unique rows of the train data
# (the distinct row_id values; as the last expression of the cell the array is displayed)
trainDF['row_id'].unique()

In [None]:
# Count the distinct user_id values in the train data (a NaN, if present, counts once)
trainDF['user_id'].nunique(dropna=False)

In [None]:
# Load the users table and report how many distinct users it holds
usersPath = '../input/student-shopee-code-league-marketing-analytics/users.csv'
usersDF = pd.read_csv(usersPath)
print(f"There are {len(usersDF['user_id'].unique())} unique users.")

### Data Preprocessing

In [None]:
# Impute missing or invalid ages with the mean of the valid (positive) ages.
# Zero-filling the NaNs before averaging would count every missing age as a
# 0-year-old and pull the mean down, so only real ages contribute to the mean.
allUsersAge = usersDF['age'].fillna(0)  # kept so any other cell that reads it still works
validAges = usersDF.loc[usersDF['age'] > 0, 'age']
averageAge = validAges.mean()

# One rounded value is used for both kinds of bad age so they are treated alike
imputedAge = round(averageAge)

# Fill all users with age that is NaN with the average age
usersDF['age'] = usersDF['age'].fillna(imputedAge)

# If the user has age that is <=0, fill it with the average age as well
usersDF.loc[usersDF['age'] <= 0, 'age'] = imputedAge

In [None]:
# Find the unique email domains of the user
# (displayed as the cell output; these raw values are what refineEmailDomain receives)
usersDF['domain'].unique()

In [None]:
# Derive a cleaned domain name per user: '@' removed, text before the first '.' kept
usersDF['domainCategory'] = usersDF['domain'].map(refineEmailDomain)

In [None]:
# Preview the first rows of the users table
usersDF.head()

In [None]:
# Keep just the user columns that are merged into the train and test data
userColumns = ['user_id', 'age', 'domainCategory']
requiredUsers = usersDF[userColumns]
requiredUsers.head()

### Combine the Data

In [None]:
# Attach each user's age and domain category to the training rows, matched on user_id
combinedDF = trainDF.merge(requiredUsers, on = 'user_id')

In [None]:
# Print the distinct values of every column that is not in the skip list below
skippedColumns = [
    'grass_date', 'user_id', 'last_open_day', 'last_checkout_day', 'last_login_day',
    'login_count_last_60_days', 'login_count_last_30_days', 'checkout_count_last_60_days',
    'row_id', 'subject_line_length', 'open_count_last_60_days', 'login_count_last_10_days',
    'checkout_count_last_30_days', 'age', 'checkout_count_last_10_days',
]
for columnName in combinedDF.columns:
    if columnName in skippedColumns:
        continue
    print(columnName)
    print(combinedDF[columnName].unique())

In [None]:
# Print the largest and smallest value of every column except identifiers, date and domain
for columnName in combinedDF.columns:
    if columnName in ('grass_date', 'user_id', 'row_id', 'domainCategory'):
        continue
    columnValues = combinedDF[columnName]
    print(columnName)
    print(max(columnValues), ',', min(columnValues))

In [None]:
# Preprocess the data: turn the timestamp into seconds, make the "last ... day"
# columns integer and encode the email domain category

# Remove the time zone from the timestamp and convert it to seconds
combinedDF['grass_date'] = combinedDF['grass_date'].apply(removeTimeZone)
combinedDF['timeInSec'] = combinedDF['grass_date'].apply(convertTime)

# No rows are removed: the 'Never login' / 'Never checkout' sentinels and any NaN
# are set to 0, then the column is converted to integer.
# NOTE(review): 0 is presumably also the value for "today", so "never" and
# "most recent" would share one code - confirm this encoding is intended, and keep
# the train and test preprocessing identical if it is changed.
combinedDF.loc[combinedDF['last_login_day'] == 'Never login', 'last_login_day'] = 0
combinedDF['last_login_day'] = combinedDF['last_login_day'].fillna(0).apply(int)

combinedDF.loc[combinedDF['last_checkout_day'] == 'Never checkout', 'last_checkout_day'] = 0
combinedDF['last_checkout_day'] = combinedDF['last_checkout_day'].fillna(0).apply(int)

# 'Never open' rows get the number of days between grass_date and today; only
# those rows are parsed. getDateDifference uses today's date, so these values
# change with the day the notebook is run.
neverOpened = combinedDF['last_open_day'] == 'Never open'
if neverOpened.any():
    combinedDF.loc[neverOpened, 'last_open_day'] = (
        combinedDF.loc[neverOpened, 'grass_date'].apply(getDateDifference)
    )
# Remaining NaN values become 0 before the integer conversion
combinedDF['last_open_day'] = combinedDF['last_open_day'].fillna(0).apply(int)

# Encode the email domain category; the fitted encoder is kept in encodeDomain
encodeDomain = LabelEncoder()
combinedDF['domainLabel'] = encodeDomain.fit_transform(combinedDF['domainCategory'])

combinedDF.head()

In [None]:
# List the column names of the combined training data
combinedDF.columns

In [None]:
# Modelling table: the combined data without the raw timestamp string
# (drop returns a new frame, so combinedDF itself is left untouched)
modelDF = combinedDF.drop(columns = ['grass_date'])
modelDF.head()

In [None]:
# Report the number of users, rows and row ids in the modelling table
print(
    f"There are {len(modelDF['user_id'].unique())} unique users.",
    f"There are {len(modelDF)} data points.",
    f"There are {len(modelDF['row_id'].unique())} unique row ids.",
    sep='\n',
)

## Model Training

In [None]:
# Data frame that collects the predictions of both base models.
# .copy() makes it an independent frame, so adding the result columns later does
# not write into a slice of modelDF (which raises pandas' SettingWithCopyWarning).
resultDF = modelDF[['row_id', 'user_id']].copy()

In [None]:
# Feature columns for the numerical model and for the categorical model
numericalAttributes = [
    'subject_line_length',
    # days since the last open / login / checkout
    'last_open_day', 'last_login_day', 'last_checkout_day',
    # activity counts over the last 10 / 30 / 60 days
    'open_count_last_10_days', 'open_count_last_30_days', 'open_count_last_60_days',
    'login_count_last_10_days', 'login_count_last_30_days', 'login_count_last_60_days',
    'checkout_count_last_10_days', 'checkout_count_last_30_days', 'checkout_count_last_60_days',
    'timeInSec',
    'age',
]
# NOTE(review): user_id is an identifier that is passed to the model as a feature - confirm intended
categoricalAttributes = [
    'country_code',
    'user_id',
    'domainLabel',
]

In [None]:
# Hold out 33% of the rows for evaluating the numerical model (fixed seed)
X, y = modelDF[numericalAttributes], modelDF['open_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [None]:
# Fit a random forest on the numerical features and print its accuracy on the training split
numericalModel = RandomForestClassifier(random_state=0, max_depth=50).fit(X_train, y_train)
print(numericalModel.score(X_train, y_train))

In [None]:
# Accuracy of the numerical model on the held-out split
numericalModel.score(X_test, y_test)

In [None]:
# Obtain the predicted probabilities that feed the stacking model.
# They are produced out-of-fold: every row is scored by a copy of the model that
# was not fitted on that row. Calling numericalModel.predict_proba(X) directly
# would score the rows the forest was trained on, and those near-perfect
# in-sample probabilities leak the label into the stacking features.
# cross_val_predict fits clones, so numericalModel keeps the fit from the cell above.
from sklearn.model_selection import cross_val_predict

y_numerical_pred = cross_val_predict(numericalModel, X, y, cv=5, method='predict_proba')
# Column 1 is the probability of the second class in sorted label order
resultDF['NumericalResult'] = y_numerical_pred[:, 1]

In [None]:
# Hold out 33% of the rows for evaluating the categorical model (same seed as before)
X, y = modelDF[categoricalAttributes], modelDF['open_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [None]:
# Fit a support vector classifier with probability estimates on the categorical
# features and print its accuracy on the training split
categoricalModel = SVC(probability=True).fit(X_train, y_train)
print(categoricalModel.score(X_train, y_train))

In [None]:
# Accuracy of the categorical model on the held-out split
categoricalModel.score(X_test, y_test)

In [None]:
# Obtain the predicted probabilities that feed the stacking model.
# They are produced out-of-fold: every row is scored by a copy of the model that
# was not fitted on that row. Calling categoricalModel.predict_proba(X) directly
# would score the rows the model was trained on and leak the label into the
# stacking features.
# cross_val_predict fits clones, so categoricalModel keeps the fit from the cell above.
# Only 3 folds are used because each SVC(probability=True) fit is already costly.
from sklearn.model_selection import cross_val_predict

y_categorical_pred = cross_val_predict(categoricalModel, X, y, cv=3, method='predict_proba')
# Column 1 is the probability of the second class in sorted label order
resultDF['CategoricalResult'] = y_categorical_pred[:, 1]

In [None]:
# Append the response variable to the data frame
# (the assignment aligns on the row index, which resultDF inherited from modelDF)
resultDF['open_flag'] = modelDF['open_flag']
resultDF.head()

In [None]:
# Hold out 33% of the rows for evaluating the stacking model (same seed as the base models)
stackingInputs = ['NumericalResult', 'CategoricalResult']
X, y = resultDF[stackingInputs], resultDF['open_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [None]:
# Fit the stacking model on the two base-model probabilities and print its
# accuracy on the training split
modelStacking = RandomForestClassifier(random_state=0, max_depth=50).fit(X_train, y_train)
print(modelStacking.score(X_train, y_train))

In [None]:
# Accuracy of the stacking model on the held-out split
modelStacking.score(X_test, y_test)

## Test Data Prediction

In [None]:
# Load the test set, report its size and preview the first rows
testPath = '../input/student-shopee-code-league-marketing-analytics/test.csv'
testDF = pd.read_csv(testPath)
print(f"There are {len(testDF)} test data points.")
testDF.head()

In [None]:
# Apply the same set of preprocessing method on test data

# Attach each user's age and domain category to the test rows, matched on user_id
combinedTestDF = pd.merge(testDF, requiredUsers, on = 'user_id')

# Remove the time zone from the timestamp and convert it to seconds
combinedTestDF['grass_date'] = combinedTestDF['grass_date'].apply(removeTimeZone)
combinedTestDF['timeInSec'] = combinedTestDF['grass_date'].apply(convertTime)

# No rows are removed: the 'Never login' / 'Never checkout' sentinels and any NaN
# are set to 0, then the column is converted to integer.
# NOTE(review): 0 is presumably also the value for "today", so "never" and
# "most recent" would share one code - confirm this encoding is intended, and keep
# the train and test preprocessing identical if it is changed.
combinedTestDF.loc[combinedTestDF['last_login_day'] == 'Never login', 'last_login_day'] = 0
combinedTestDF['last_login_day'] = combinedTestDF['last_login_day'].fillna(0).apply(int)

combinedTestDF.loc[combinedTestDF['last_checkout_day'] == 'Never checkout', 'last_checkout_day'] = 0
combinedTestDF['last_checkout_day'] = combinedTestDF['last_checkout_day'].fillna(0).apply(int)

# 'Never open' rows get the number of days between grass_date and today; only
# those rows are parsed. getDateDifference uses today's date, so these values
# change with the day the notebook is run.
neverOpenedTest = combinedTestDF['last_open_day'] == 'Never open'
if neverOpenedTest.any():
    combinedTestDF.loc[neverOpenedTest, 'last_open_day'] = (
        combinedTestDF.loc[neverOpenedTest, 'grass_date'].apply(getDateDifference)
    )
# Remaining NaN values become 0 before the integer conversion
combinedTestDF['last_open_day'] = combinedTestDF['last_open_day'].fillna(0).apply(int)

# Reuse the encoder fitted on the training data. Re-fitting it here would renumber
# the domains whenever the test set holds a different set of categories, so the
# codes would no longer mean what the models learned. transform() raises a
# ValueError for a category that was not seen during fitting.
combinedTestDF['domainLabel'] = encodeDomain.transform(combinedTestDF['domainCategory'])

combinedTestDF.head()

In [None]:
# Collect the test-set predictions of the numerical and categorical models.
# .copy() makes finalResults an independent frame, so adding the result columns
# does not write into a slice of combinedTestDF (pandas' SettingWithCopyWarning).
finalResults = combinedTestDF[['row_id', 'user_id']].copy()

# Probability from the numerical model (column 1 of predict_proba)
X1 = combinedTestDF[numericalAttributes]
y_numerical_Final = numericalModel.predict_proba(X1)
finalResults['NumericalResult'] = y_numerical_Final[:, 1]

# Probability from the categorical model (column 1 of predict_proba)
X2 = combinedTestDF[categoricalAttributes]
y_catrgorical_Final = categoricalModel.predict_proba(X2)
finalResults['CategoricalResult'] = y_catrgorical_Final[:, 1]

In [None]:
# Feed both base-model probabilities to the stacking model to get the final labels
stackingColumns = ['NumericalResult', 'CategoricalResult']
FinalX = finalResults[stackingColumns]
Final_y_pred = modelStacking.predict(FinalX)

In [None]:
# Build the submission table: one predicted open_flag per test row_id
submissionColumns = {
    'row_id': finalResults['row_id'].tolist(),
    'open_flag': Final_y_pred.tolist(),
}
submission = pd.DataFrame(submissionColumns)

In [None]:
# Preview the first rows of the submission table
submission.head()

## Submit

In [None]:
# Load and display the sample submission file
# (it is only displayed here; no later cell shown in this notebook reads it)
sampleSubmission = pd.read_csv('../input/student-shopee-code-league-marketing-analytics/sample_submission_0_1.csv')
sampleSubmission

In [None]:
# Write the submission table to a CSV file without the index column
submission.to_csv('The_OG_submission.csv', index = False)