## Cleaning users 💥 Feature engineering

Hello Kagglers 🖐🖐

Here is my approach ⏬⏬⏬

# 1. Import of training data and preliminary exploration

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import squarify
import matplotlib.pyplot as plt
plt.style.use("default")
from IPython.display import Image

In [None]:
def convert_from_ms(milliseconds):
    """Split a duration in milliseconds into days, hours, minutes and seconds.

    The sub-second remainder is folded into the seconds component, which is
    then rounded to the nearest whole second with the built-in ``round``.
    When that rounding reaches 60 the overflow is carried into the minutes
    (and onward into hours and days), so the seconds component returned is
    always in the range 0-59.

    Parameters
    ----------
    milliseconds : int
        Duration expressed in milliseconds.

    Returns
    -------
    tuple
        ``(days, hours, minutes, seconds)``.
    """
    total_seconds, remainder_ms = divmod(milliseconds, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)

    # Round first, then carry: e.g. 59.999 s must become 1 min 0 s, not 60 s.
    seconds = round(seconds + remainder_ms / 1000)
    carry, seconds = divmod(seconds, 60)
    total_minutes += carry

    total_hours, minutes = divmod(total_minutes, 60)
    days, hours = divmod(total_hours, 24)
    return days, hours, minutes, seconds

I am using the dataset compressed in this [kernel](https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets/)

Thanks to Rohan ! 😉

In [None]:
%%time

path = "../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip"
train = pd.read_pickle(path)
train = train.astype({'row_id': 'int64',
                      'timestamp': 'int64',
                      'user_id': 'int32',
                      'content_id': 'int16',
                      'content_type_id': 'int8',
                      'task_container_id': 'int16',
                      'user_answer': 'int8',
                      'answered_correctly': 'int8',
                      'prior_question_elapsed_time': 'float32',
                      'prior_question_had_explanation': 'boolean'})

train.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Number of missing values in each column.
train.isna().sum()

The training data set is substantial: more than 100 million rows 😲😲 It requires a thorough exploration and possibly a thorough cleaning. I focus the cleaning on the users.

💔 Do not remove interactions no matter how 💔

# 2. Cleaning user
## 2-1. Building the user dataframe

In [None]:
# Number of distinct users present in the training data.
n_user = train['user_id'].nunique()
print(f'There are {n_user} users')

For each user, I count the number of interactions and compute the number of days between the first and the last interaction. I also calculate the score 💯 (percentage of correct answers) for information.

In [None]:
%%time

agg = {'row_id' : 'count',
       'timestamp' : lambda x: convert_from_ms(x.max())[0],
       'answered_correctly' : lambda x: round(x.mean() * 100)}

user_info = train.groupby('user_id').agg(agg)
user_info = user_info.rename(columns={'row_id' : 'nb_interactions',
                                   'timestamp' : 'nb_jours',
                                   'answered_correctly' : 'score'})
user_info

In [None]:
# Summary statistics, including the 5th and 95th percentiles.
user_info.describe(percentiles=[.05, .25, .5, .75, 0.95])

In [None]:
%%time
fig = sns.pairplot(user_info, diag_kind="kde", plot_kws={'alpha': 0.01})
fig.savefig('./img_eda_user_pairplot.png', transparent=True)

In [None]:
# Density of the number of interactions per user, shown on a log x-axis.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
sns.kdeplot(user_info.nb_interactions)
plt.xlabel('nb_interactions')
plt.xscale('log')
plt.title('Nb interactions with log scale')
plt.savefig('./img_eda_user_nb_interactions_log_scale.png', transparent=True)

In [None]:
# Density of the number of days per user, shown on a log x-axis.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
sns.kdeplot(user_info.nb_jours)
plt.xlabel('nb_jours')
plt.xscale('log')
plt.title('Nb days with log scale')
plt.savefig('./img_eda_user_nb_days_log_scale.png', transparent=True)

## 2-2. Number of interactions per user

In [None]:
# Same boxplot twice: full range on the left, y-axis limited to 0-500 on the right.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(7, 4))

sns.boxplot(y=user_info['nb_interactions'], ax=ax0)
ax0.set_title("Number of interactions per user")

sns.boxplot(y=user_info['nb_interactions'], ax=ax1)
ax1.set_ylim(0, 500)
ax1.set_title("Focus on the box")
ax1.set_ylabel("")

fig.savefig('./img_eda_interaction_boxplot.png', transparent=True)

Here I see big differences, especially beyond the 95th percentile 💥💥 It is difficult to generalize a model from such data, so these users must be excluded!

## 2-3. Number of days between the first interaction and the last

In [None]:
# Same boxplot twice: full range on the left, y-axis limited to 0-150 on the right.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharey=False, figsize=(7, 4))

sns.boxplot(y=user_info['nb_jours'], ax=ax0)
ax0.set_title("Number of days per user")

sns.boxplot(y=user_info['nb_jours'], ax=ax1)
ax1.set_ylim(0, 150)
ax1.set_title("Focus on the box")
ax1.set_ylabel("")

fig.savefig('./img_eda_jour_boxplot.png', transparent=True)

Here, too, it is advisable to clean ✅

## 2-4. Apply filters

In [None]:
# Cleaning thresholds: tune the lower and upper quantiles here.
Q1 = 0.05
Q4 = 0.95

# Filter out users with an exceptional number of interactions.
# Bounds are strict: users sitting exactly on a quantile are dropped too.
q1_int, q4_int = (user_info.nb_interactions.quantile(q) for q in (Q1, Q4))
cond_int = user_info.nb_interactions.gt(q1_int) & user_info.nb_interactions.lt(q4_int)

# Filter out users with an exceptional number of days (same strict bounds).
q1_delay, q4_delay = (user_info.nb_jours.quantile(q) for q in (Q1, Q4))
cond_delay = user_info.nb_jours.gt(q1_delay) & user_info.nb_jours.lt(q4_delay)

# A user is kept only when both conditions hold.
reduced_user_info = user_info.loc[cond_int & cond_delay]
reduced_user_info

In [None]:
# Share of users removed by the filters, as a whole percentage.
reduced_n_user = reduced_user_info.shape[0]

reduced_user_rate = round(100 * (1 - reduced_n_user / n_user))
print(f'Users have been reduced by {reduced_user_rate} %.')

In [None]:
%%time
fig = sns.pairplot(reduced_user_info,  diag_kind="kde", plot_kws={'alpha': 0.01})
fig.savefig('./img_eda_reduced_user_pairplot.png', transparent=True)

Distributions are less compressed ❗ I apply cleansing to the train data.

In [None]:
# Density of the number of interactions for the filtered users, log x-axis.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
sns.kdeplot(reduced_user_info.nb_interactions)
plt.xlabel('nb_interactions')
plt.xscale('log')
plt.title('Nb interactions with log scale')
plt.savefig('./img_eda_reduced_user_nb_interactions_log_scale.png', transparent=True)

In [None]:
# Density of the number of days for the filtered users, log x-axis.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
sns.kdeplot(reduced_user_info.nb_jours)
plt.xlabel('nb_jours')
plt.xscale('log')
plt.title('Nb days with log scale')
plt.savefig('./img_eda_reduced_user_nb_days_log_scale.png', transparent=True)

In [None]:
# Keep only the interactions of the retained users.
# reduced_user_info is indexed by user_id, so a membership test on that index
# selects the same rows as the former inner merge, without materialising the
# three user-level columns on every row just to drop them again.
kept_rows = train['user_id'].isin(reduced_user_info.index)

# reset_index gives the fresh 0..n-1 index that the merge used to produce.
reduced_train = train[kept_rows].reset_index(drop=True)
reduced_train

In [None]:
# Share of interactions removed together with the filtered users, as a whole percentage.
reduced_train_rate = round(100 * (1 - reduced_train.shape[0] / train.shape[0]))
print(f'Interactions were reduced by {reduced_train_rate} %.')

In [None]:
# Persist the user-filtered interactions.
# NOTE(review): pandas infers compression from the file suffix, and ".gzip" is presumably
# not a recognised one (".gz" is), so this file is likely written uncompressed — confirm.
reduced_train.to_pickle("./reduced_riiid_train.pkl.gzip")

# 3. Feature engineering
## 3-1. Help usage

Some of the interactions are lectures: no response is required from the user (answered_correctly = -1). I remove them from the data set.

Before removing them, it is interesting to extract a feature describing how much each user relies on this help.

In [None]:
# Count, per user, the rows flagged answered_correctly == -1.
# Users without any such row are absent from the result.
help_usage = (
    reduced_train.loc[reduced_train['answered_correctly'] == -1]
    .groupby('user_id')['content_id']
    .count()
    .reset_index()
    .rename(columns={'content_id': 'help_usage'})
)
help_usage

In [None]:
# Summary statistics of the help-usage table.
help_usage.describe()

In [None]:
# Distribution of the per-user help-usage count.
sns.boxplot(data=help_usage, y='help_usage')
plt.title('Help usage of users')
plt.savefig('./img_eda_help_usage.png', transparent=True)

In [None]:
# Bucket the raw count into four ordered groups:
#   <= 0 -> 0, 1 -> 1, 2-3 -> 2, more than 3 -> 3
help_usage['binned_help_usage'] = pd.cut(
    help_usage['help_usage'],
    bins=[-np.inf, 0, 1, 3, np.inf],
    labels=[0, 1, 2, 3],
)
help_usage

In [None]:
# Export file for submissions: keep only the user id and its help-usage bin.
col = ['user_id', 'binned_help_usage']
help_usage = help_usage[col]
help_usage.to_csv('./help.csv', index=False)

# Drop the lectures: only rows with content_type_id == 0 are kept.
reduced_train = reduced_train.loc[reduced_train['content_type_id'] == 0]
reduced_train.shape

## 3-2. Content difficulty


The TOEIC exam is classified into 2 parts; listening and reading. Each part has difficulty groups, it is interesting to extract this information.

In [None]:
# Load the questions metadata and show its columns and dtypes.
path = "../input/riiid-test-answer-prediction/questions.csv"
questions = pd.read_csv(path)
questions.info()

In [None]:
# Keep the question id and its part; the id is renamed to content_id,
# the key name used when joining with the train data.
col = ['question_id', 'part']
questions = questions[col].rename(columns={'question_id': 'content_id'})

In [None]:
# Section of each question: part <= 4 -> 'Listening', part > 4 -> 'Reading'.
questions['L | R'] = pd.cut(
    questions['part'],
    bins=[-np.inf, 4, np.inf],
    labels=['Listening', 'Reading'],
)
questions.sample(10)

In [None]:
# Difficulty level = rank of the part inside its section:
#   Listening -> the part number itself
#   Reading   -> the part number minus 4
# Assignments go through .loc: the chained form df[col][mask] = ... writes to
# a temporary object, raises SettingWithCopyWarning and has no effect under
# pandas Copy-on-Write.
is_listening = questions['L | R'] == 'Listening'
is_reading = questions['L | R'] == 'Reading'

questions['Difficulty_level'] = 0
questions.loc[is_listening, 'Difficulty_level'] = questions.loc[is_listening, 'part']
questions.loc[is_reading, 'Difficulty_level'] = questions.loc[is_reading, 'part'] - 4

questions.sample(10)

In [None]:
# Export the per-question section and difficulty level.
questions.to_csv('./level_content.csv', index=False)

## 3-3. New features exploration

In [None]:
# Attach the user-level help bin and the question-level features to every interaction.
featured_train = (
    reduced_train
    .merge(help_usage, on='user_id', how='left')
    .merge(questions, on='content_id', how='left')
)
# Users absent from help_usage get bin 0.
featured_train['binned_help_usage'] = featured_train['binned_help_usage'].fillna(0)

### Parts

In [None]:
%%time

agg = {'row_id' : 'count',
       'answered_correctly' : lambda x: round(x.mean() * 100),
       'prior_question_had_explanation' : lambda x: round(x.mean() * 100)}

part_info = featured_train.groupby('part').agg(agg)
part_info = part_info.rename(columns={'row_id' : 'nb_interactions',
                                   'answered_correctly' : 'score'})
part_info

In [None]:
# Treemap of the parts: tile area = number of interactions,
# tile colour = score mapped onto the Blues colormap.
norm = matplotlib.colors.Normalize(vmin=part_info.score.min(), vmax=part_info.score.max())
colors = [matplotlib.cm.Blues(shade) for shade in map(norm, part_info.score)]

squarify.plot(
    sizes=part_info.nb_interactions,
    color=colors,
    label=part_info.index,
    alpha=0.8,
)
plt.title("Part of TOEIC test")
plt.axis('off')
plt.savefig('./img_part_toeic_test_repartitions.png', transparent=True)
plt.show()

The weaker the colors, the lower the score.
Part 5 is the most worked on in the data, but is the one with the lowest score.
This is also where users require the most explanation.

### Content

In [None]:
# Number of interactions in each section (Listening / Reading).
(
    featured_train.groupby("L | R")['row_id']
    .count()
    .reset_index()
    .plot.bar(x='L | R', y='row_id')
)
plt.title('Interactions of Listening or Reading')
plt.savefig('./img_part_LR_repartitions.png', transparent=True, bbox_inches="tight")

### Level usage

In [None]:
# Number of interactions in each help-usage bin.
(
    featured_train.groupby('binned_help_usage')['row_id']
    .count()
    .reset_index()
    .plot.bar(x='binned_help_usage', y='row_id')
)
plt.title('Users by help usage group ')
plt.savefig('./img_help_usage_repartitions.png', transparent=True)

# 4. Reduced train dataset export

In [None]:
# Columns kept in the exported training set (the target comes last).
col = [
    'timestamp',
    'user_id',
    'content_id',
    'task_container_id',
    'binned_help_usage',
    'L | R',
    'Difficulty_level',
    'answered_correctly',
]
featured_train = featured_train.loc[:, col]
featured_train.sample(20)

In [None]:
# Persist the final feature table.
# NOTE(review): pandas infers compression from the file suffix, and ".gzip" is presumably
# not a recognised one (".gz" is), so this file is likely written uncompressed — confirm.
featured_train.to_pickle("./featured_riiid_train.pkl.gzip")

## Baseline Coming soon

If you made it to this point, thank you for reading, and 🆙 vote if it helped you.
I will read your comments with pleasure.

Oh! And I'm sorry for my imperfect English!

Thanks 😷 Stay at home 😷