# Импорт модулей

In [149]:
import sys
sys.path.append('/content/drive/MyDrive/Stepic_ML/Stepic_contest')

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest as iso

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize': (18, 8)})

# Функции

In [150]:
def timestamp_to_date_day(data):
    """Add ``date`` and ``day`` columns derived from the ``timestamp`` column.

    The frame is modified in place: ``date`` holds ``timestamp`` parsed as
    seconds since the Unix epoch, ``day`` holds the calendar date of ``date``.

    Args:
        data: DataFrame with a ``timestamp`` column of epoch seconds.

    Returns:
        The same DataFrame object that was passed in, with both columns set.
    """
    parsed = pd.to_datetime(data['timestamp'], unit='s')
    data['date'] = parsed
    data['day'] = data['date'].dt.date
    return data

In [151]:
def quantile_emission_treatment(data, threshold=0.999):
    """Drop rows that exceed the ``threshold`` quantile in any numeric column.

    The quantile of every numeric column is computed once on the incoming
    frame; a row is kept only if each of its numeric values is less than or
    equal to the corresponding quantile. Rows with NaN in a numeric column
    are dropped as well, because the comparison is False for NaN.

    Args:
        data: DataFrame to filter. Non-numeric columns are carried through
            untouched and do not take part in the filtering.
        threshold: Quantile in [0, 1] used as the upper limit per column.

    Returns:
        A filtered DataFrame; the input frame is not modified.
    """
    # numeric_only=True restricts the computation to numeric columns;
    # without it pandas >= 2.0 raises on text/object columns.
    limits = data.quantile(threshold, numeric_only=True)
    for column, limit in limits.items():
        data = data[data[column] <= limit]
    return data

In [152]:
def n_days_selection(data, n_days=2):
    """Keep each user's rows from the first ``n_days`` after their first record.

    For every ``user_id`` the smallest ``timestamp`` is taken as the start of
    the window. Rows whose ``timestamp`` is not later than that start plus
    ``n_days`` days (in seconds) are kept.

    Args:
        data: DataFrame with ``user_id`` and ``timestamp`` (epoch seconds).
        n_days: Length of the window in days.

    Returns:
        The filtered rows with two extra columns: ``min_timestamp`` (the
        user's first timestamp) and ``timestamp_trashold`` (end of window).
    """
    first_seen = (
        data.groupby('user_id', as_index=False)
        .agg({'timestamp': 'min'})
        .rename(columns={'timestamp': 'min_timestamp'})
    )
    first_seen['timestamp_trashold'] = first_seen['min_timestamp'] + n_days*24*60*60

    merged = data.merge(first_seen, on='user_id', how='outer')
    inside_window = merged['timestamp'] <= merged['timestamp_trashold']
    data_by_n_days = merged[inside_window]

    # Sanity check: no user may disappear, since a user's first row is
    # always inside that user's own window.
    assert data['user_id'].nunique() == data_by_n_days['user_id'].nunique()

    return data_by_n_days
    
def get_pivot_table(event_data, submission_data):
    """Build per-user count tables for events and for submissions.

    Args:
        event_data: DataFrame with ``user_id``, ``step_id`` and ``action``.
        submission_data: DataFrame with ``user_id``, ``step_id`` and
            ``submission_status``.

    Returns:
        A tuple ``(event_pivot_table, submission_pivot_table)``. Each table
        has one row per user, a ``user_id`` column and one column per
        distinct action / submission status holding the number of rows with
        that value (0 where the user has none).
    """
    def count_steps_per_user(frame, category_column):
        # Count step_id occurrences for each (user, category) pair.
        counts = frame.pivot_table(values='step_id',
                                   index='user_id',
                                   columns=category_column,
                                   aggfunc='count')
        return counts.fillna(0).reset_index()

    event_pivot_table = count_steps_per_user(event_data, 'action')
    submission_pivot_table = count_steps_per_user(submission_data, 'submission_status')
    return event_pivot_table, submission_pivot_table

In [153]:
def add_unique_steps(data, pivot_table, column, feature):
    """Add ``unique_<feature>``: distinct steps per user with that category.

    Duplicate ``(step_id, user_id, column)`` rows are collapsed first, so the
    resulting count is the number of different steps on which the user has
    at least one row whose ``column`` value equals ``feature``.

    Args:
        data: DataFrame with ``step_id``, ``user_id`` and ``column``.
        pivot_table: Per-user table (must contain ``user_id``) to extend.
        column: Name of the categorical column in ``data``
            (e.g. ``'action'`` or ``'submission_status'``).
        feature: Category value to count (e.g. ``'correct'``).

    Returns:
        ``pivot_table`` outer-merged with the new ``unique_<feature>``
        column. If ``feature`` never occurs in ``data`` the column is 0 for
        every user found in ``data`` instead of raising ``KeyError``.
    """
    unique_name = f'unique_{feature}'
    unique_data = (
        data[['step_id', 'user_id', column]]
        .drop_duplicates()
        .pivot_table(index='user_id',
                     columns=column,
                     values='step_id',
                     aggfunc='count')
        .fillna(0)
        .reset_index()
    )

    if feature in unique_data.columns:
        unique_data = unique_data.rename(columns={feature: unique_name})
    else:
        # The category is absent from the data: nobody has such steps.
        unique_data[unique_name] = 0.0

    return pivot_table.merge(unique_data[['user_id', unique_name]],
                             on='user_id',
                             how='outer')

In [154]:
def get_feature(df):
    """Add the derived sum and ratio features to the per-user table.

    A ``sum(correct+wrong)`` column is added first, followed by the ratio
    columns listed in ``ratio_specs`` (in that order). Divisions by zero
    produce NaN or +/-infinity; all of these are replaced by 0 in the result,
    as are any NaN values already present in ``df``.

    Only the ratios that are actually in use are computed. To enable another
    ratio, add a ``(column_name, numerator, denominator)`` row to the table.

    Args:
        df: Per-user DataFrame containing every numerator and denominator
            column named in ``ratio_specs`` plus ``correct`` and ``wrong``.

    Returns:
        A new DataFrame with the extra columns; the input frame is left
        unmodified.
    """
    # (new column name, numerator column, denominator column).
    # Column names are kept exactly as they were (including the unbalanced
    # parentheses in some of them).
    ratio_specs = (
        ('ratio(passed/started_attempt)',
         'passed', 'started_attempt'),
        ('ratio(unique_passed/unique_started_attempt)',
         'unique_passed', 'unique_started_attempt'),
        ('ratio(unique_correct/unique_started_attempt)',
         'unique_correct', 'unique_started_attempt'),
        ('ratio(correct/started_attempt)',
         'correct', 'started_attempt'),
        ('ratio(sum(correct+wrong)/started_attempt)',
         'sum(correct+wrong)', 'started_attempt'),
        ('ratio(correct/sum(correct+wrong)',
         'correct', 'sum(correct+wrong)'),
        ('ratio(correct/sum_submission_action',
         'correct', 'sum_submission_action'),
        ('ratio(unique_correct/sum_unique_submission_action',
         'unique_correct', 'sum_unique_submission_action'),
        ('ratio(event_timestamp_diff/event_unique_periods)',
         'event_timestamp_diff', 'event_unique_periods'),
        ('ratio(submission_timestamp_diff/submission_unique_periods)',
         'submission_timestamp_diff', 'submission_unique_periods'),
        ('ratio(submission_timestamp_diff/event_timestamp_diff)',
         'submission_timestamp_diff', 'event_timestamp_diff'),
        ('ratio(correct/submission_timestamp_diff)',
         'correct', 'submission_timestamp_diff'),
        ('ratio(correct/submission_unique_periods)',
         'correct', 'submission_unique_periods'),
    )

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    df['sum(correct+wrong)'] = df['correct'] + df['wrong']
    for name, numerator, denominator in ratio_specs:
        df[name] = df[numerator] / df[denominator]

    # x/0 gives +/-inf and 0/0 gives NaN; neutralise all three.
    return df.replace([np.inf, -np.inf], 0).fillna(0)

In [155]:
def get_timestamp_diff(data, pivot_table, data_type=''):
    """Add the span between each user's last and first timestamp.

    Args:
        data: DataFrame with ``user_id`` and ``timestamp``.
        pivot_table: Per-user table (must contain ``user_id``) to extend.
        data_type: Prefix for the new column, which is named
            ``<data_type>_timestamp_diff``.

    Returns:
        ``pivot_table`` outer-merged with the new column.
    """
    per_user = data.groupby('user_id')['timestamp']
    span = per_user.max() - per_user.min()
    span = span.rename(f'{data_type}_timestamp_diff').reset_index()
    return pivot_table.merge(span, on='user_id', how='outer')

In [156]:
def get_df(event_train_pivot_table, submission_train_pivot_table):
    """Combine the event and submission per-user tables into one frame.

    The tables are outer-joined on ``user_id``, so users present in only one
    of them are kept; the resulting gaps are filled with 0.

    Returns:
        The merged DataFrame without missing values.
    """
    combined = pd.merge(event_train_pivot_table,
                        submission_train_pivot_table,
                        how='outer',
                        on='user_id')
    return combined.fillna(0)

In [157]:
def add_days_and_round_days(event_pivot_table):
    """Express ``event_timestamp_diff`` in days, exact and rounded.

    Two columns are appended by joining on the index:
    ``days`` is ``event_timestamp_diff`` converted from seconds to days, and
    ``round_days`` is that value plus one, rounded to a whole number.

    Args:
        event_pivot_table: DataFrame with an ``event_timestamp_diff`` column
            in seconds.

    Returns:
        A new DataFrame with ``days`` and ``round_days`` appended.
    """
    day_span = (event_pivot_table['event_timestamp_diff']/60/60/24).to_frame('days')
    with_days = event_pivot_table.merge(day_span, left_index=True, right_index=True)

    rounded = (with_days['event_timestamp_diff']/60/60/24 + 1).to_frame('round_days').round()
    return with_days.merge(rounded, left_index=True, right_index=True)

In [158]:
def add_action_count(data, pivot_table, data_type=''):
    """Add the number of rows (non-null ``step_id``) each user has in ``data``.

    Args:
        data: DataFrame with ``user_id`` and ``step_id``.
        pivot_table: Per-user table (must contain ``user_id``) to extend.
        data_type: Infix for the new column, named ``sum_<data_type>_action``.

    Returns:
        ``pivot_table`` outer-merged with the new column.
    """
    total_name = f'sum_{data_type}_action'
    totals = (
        data.groupby('user_id')['step_id']
        .count()
        .rename(total_name)
        .reset_index()
    )
    return pivot_table.merge(totals, on='user_id', how='outer')

def add_unique_action_count(data, pivot_table, data_type=''):
    """Add the number of distinct steps each user has in ``data``.

    Args:
        data: DataFrame with ``user_id`` and ``step_id``.
        pivot_table: Per-user table (must contain ``user_id``) to extend.
        data_type: Infix for the new column, named
            ``sum_unique_<data_type>_action``.

    Returns:
        ``pivot_table`` outer-merged with the new column.
    """
    distinct_name = f'sum_unique_{data_type}_action'
    distinct_steps = (
        data.groupby('user_id')['step_id']
        .nunique()
        .rename(distinct_name)
        .reset_index()
    )
    return pivot_table.merge(distinct_steps, on='user_id', how='outer')

In [159]:
def add_activity_score(data, pivot_table, data_type='', seconds=3600, days_in_data=2):
    """Add the number of distinct time periods in which each user was active.

    The first ``days_in_data`` days after a row's ``min_timestamp`` are cut
    into ``int(days_in_data * 86400 / seconds)`` consecutive periods of
    ``seconds`` seconds each. A row belongs to period ``k`` when
    ``min_timestamp + k*seconds <= timestamp < min_timestamp + (k+1)*seconds``.
    The score of a user is the number of different periods that contain at
    least one of the user's rows; rows outside all periods do not count.

    The periods are computed with a single floor division instead of
    materialising one column per period, which keeps memory proportional to
    the number of rows.

    Args:
        data: DataFrame with ``user_id``, ``timestamp`` and ``min_timestamp``
            (as produced by ``n_days_selection``).
        pivot_table: Per-user table (must contain ``user_id``) to extend.
        data_type: Prefix for the new column, named
            ``<data_type>_unique_periods``.
        seconds: Length of one period in seconds.
        days_in_data: Number of days covered by the periods.

    Returns:
        ``pivot_table`` outer-merged with the new column.
    """
    # NOTE(review): assumes integer timestamps and an integer `seconds`, so
    # that the floor division below is exact - confirm if floats are passed.
    sec_in_data = days_in_data * 24*60*60
    n_periods = int(sec_in_data / seconds)

    elapsed = data['timestamp'] - data['min_timestamp']
    period = elapsed // seconds
    in_range = (elapsed >= 0) & (period < n_periods)

    # Out-of-range rows become NaN, which nunique() ignores; a user whose
    # rows are all out of range therefore still appears, with a score of 0.
    periods = pd.DataFrame({'user_id': data['user_id'],
                            'period': period.where(in_range)})
    action_data = (
        periods.groupby('user_id')['period']
        .nunique()
        .rename(f'{data_type}_unique_periods')
        .reset_index()
    )
    return pivot_table.merge(action_data, on='user_id', how='outer')

In [160]:
def get_first_timestamp_diff(df, event_data, submission_data):
    """Add the delay between a user's first event and first submission.

    ``first_timestamp_diff`` is the ``min_timestamp`` found in
    ``submission_data`` minus the one found in ``event_data``. Missing values
    in the merged frame are filled with 0 before subtracting, so a user
    without submissions gets ``0 - first event timestamp``.

    Args:
        df: Per-user table (must contain ``user_id``) to extend.
        event_data: DataFrame with ``user_id`` and ``min_timestamp``.
        submission_data: DataFrame with ``user_id`` and ``min_timestamp``.

    Returns:
        ``df`` with NaN replaced by 0 and ``first_timestamp_diff`` appended;
        the two helper columns are removed again.
    """
    sources = (
        (event_data, 'first_timestamp_event'),
        (submission_data, 'first_timestamp_submission'),
    )
    for frame, first_name in sources:
        firsts = (
            frame[['user_id', 'min_timestamp']]
            .drop_duplicates()
            .rename(columns={'min_timestamp': first_name})
        )
        df = df.merge(firsts, on='user_id', how='outer')

    df = df.fillna(0)
    df['first_timestamp_diff'] = df['first_timestamp_submission'] - df['first_timestamp_event']
    return df.drop(columns=['first_timestamp_submission', 'first_timestamp_event'])

# Загрузка и просмотр данных train_data

In [161]:
# Load the training event and submission tables from the course attachment URLs.
event_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/event_data_train.zip')
submission_data = pd.read_csv('https://stepik.org/media/attachments/course/4852/submissions_data_train.zip')

In [162]:
#event_data.info()

In [163]:
#submission_data.info()

In [164]:
# Add the 'date' and 'day' columns derived from 'timestamp' to both tables.
event_data = timestamp_to_date_day(event_data)
submission_data = timestamp_to_date_day(submission_data)

In [165]:
#event_data.groupby('day').user_id.nunique().rolling(10, min_periods=1).mean().plot();

In [166]:
#submission_data.groupby('day').user_id.nunique().rolling(10, min_periods=1).mean().plot();

# Создание целевой переменной

In [167]:
# Per-user number of submissions for every submission status.
submission_target = (
    submission_data[['step_id', 'user_id', 'submission_status']]
    .pivot_table(index='user_id',
                 columns='submission_status',
                 values='step_id',
                 aggfunc='count')
    .fillna(0)
    .reset_index()
)

# Per-user number of distinct steps for every event action.
event_target = (
    event_data[['step_id', 'user_id', 'action']]
    .drop_duplicates()
    .pivot_table(index='user_id',
                 columns='action',
                 values='step_id',
                 aggfunc='count')
    .fillna(0)
    .reset_index()
)

# A user counts as having passed the course at 40 or more passed steps
# (events) or 40 or more correct submissions (submissions).
event_target['passed_course'] = event_target['passed'] >=40
submission_target['passed_course'] = submission_target['correct'] >=40

# The submission-based definition is the one used as the target;
# event_target is not referenced again in the cells visible here.
target = submission_target

In [168]:
# Display the target table (notebook cell output follows).
target

submission_status,user_id,correct,wrong,passed_course
0,2,2.0,0.0,False
1,3,29.0,23.0,False
2,5,2.0,2.0,False
3,8,9.0,21.0,False
4,14,0.0,1.0,False
...,...,...,...,...
9935,26787,3.0,0.0,False
9936,26790,1.0,0.0,False
9937,26794,33.0,9.0,False
9938,26797,2.0,0.0,False


# Отбор первых двух дней

*   event_data_train
*   submission_data_train




In [169]:
# Keep, for every user, only the rows from the first two days after that
# user's first record (n_days defaults to 2); this also adds 'min_timestamp'.
event_data_train = n_days_selection(event_data)
submission_data_train = n_days_selection(submission_data)

# Создание pivot_table

*   event_train_pivot_table
*   submission_train_pivot_table

In [170]:
# Per-user counts of each action / submission status within the selected days.
event_train_pivot_table, submission_train_pivot_table = get_pivot_table(event_data_train, submission_data_train)

# Добавление в train_pivot_table

суммы действий и уникальных действий, activity_score

In [171]:
# Event table: total number of rows per user, then number of distinct steps.
event_train_pivot_table = add_action_count(
    event_data_train, event_train_pivot_table, data_type='event')
event_train_pivot_table = add_unique_action_count(
    event_data_train, event_train_pivot_table, data_type='event')

# Submission table: the same two counters.
submission_train_pivot_table = add_action_count(
    submission_data_train, submission_train_pivot_table, data_type='submission')
submission_train_pivot_table = add_unique_action_count(
    submission_data_train, submission_train_pivot_table, data_type='submission')

In [172]:
# Number of distinct 5-minute (300 s) periods in which each user was active.
event_train_pivot_table = add_activity_score(
    event_data_train, event_train_pivot_table,
    data_type='event', seconds=300)
submission_train_pivot_table = add_activity_score(
    submission_data_train, submission_train_pivot_table,
    data_type='submission', seconds=300)

  # Remove the CWD from sys.path while we load stuff.


# Добавление значений для уникальных шагов для пользователя

In [173]:
# Add unique_correct / unique_wrong (distinct steps per status) to
# submission_train_pivot_table.
for col_name in ['correct', 'wrong']:
    submission_train_pivot_table = add_unique_steps(submission_data_train, 
                                                    submission_train_pivot_table,
                                                    'submission_status',
                                                    col_name)

In [174]:
# Add unique_<action> (distinct steps per action) to event_train_pivot_table.
for col_name in ['viewed', 'passed', 'discovered', 'started_attempt']:
    event_train_pivot_table = add_unique_steps(event_data_train, 
                                                event_train_pivot_table,
                                                'action',
                                                col_name)


# Обработка timestamp

*   event_train_pivot_table 
*   submission_train_pivot_table

добавлена разница между последним и первым timestamp



In [175]:
# Add the span between each user's last and first timestamp
# (event_timestamp_diff / submission_timestamp_diff).
event_train_pivot_table = get_timestamp_diff(event_data_train, event_train_pivot_table, data_type='event')
submission_train_pivot_table = get_timestamp_diff(submission_data_train, submission_train_pivot_table, data_type='submission')

Добавим количество дней и округленных дней

In [176]:
# Add 'days' and 'round_days', both derived from event_timestamp_diff.
event_train_pivot_table = add_days_and_round_days(event_train_pivot_table)

# Визуализация train data

In [177]:
#event_train_pivot_table.describe()

In [178]:
#event_train_pivot_table.discovered.plot()

In [179]:
#event_train_pivot_table.passed.plot()

In [180]:
#event_train_pivot_table.started_attempt.plot()

In [181]:
#event_train_pivot_table.viewed.plot()

In [182]:
#submission_train_pivot_table.describe()

In [183]:
#submission_train_pivot_table.correct.plot()

In [184]:
#submission_train_pivot_table.unique_correct.plot()

In [185]:
#submission_train_pivot_table.wrong.plot()

In [186]:
# Display the submission feature table (notebook cell output follows).
submission_train_pivot_table

Unnamed: 0,user_id,correct,wrong,sum_submission_action,sum_unique_submission_action,submission_unique_periods,unique_correct,unique_wrong,submission_timestamp_diff
0,2,2.0,0.0,2,2,1,2.0,0.0,65
1,3,4.0,4.0,8,4,3,4.0,2.0,1040
2,5,2.0,2.0,4,2,1,2.0,1.0,80
3,8,9.0,21.0,30,11,3,9.0,7.0,711
4,14,0.0,1.0,1,1,1,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...
9935,26787,3.0,0.0,3,3,1,3.0,0.0,245
9936,26790,1.0,0.0,1,1,1,1.0,0.0,0
9937,26794,24.0,7.0,31,22,21,22.0,5.0,105295
9938,26797,2.0,0.0,2,2,2,2.0,0.0,461


# Сборка train_df

In [187]:
# Outer-join the event and submission feature tables on user_id; gaps become 0.
train_df = get_df(event_train_pivot_table, submission_train_pivot_table)

# Визуализация train_df

Визуализация по passed_course

In [188]:
#sns.lmplot(data = train_df, x='discovered', y='viewed', hue='passed_course', height = 7, aspect = 1.5)

In [189]:
#sns.lmplot(data = train_df, x='discovered', y='passed', hue='passed_course', height = 7, aspect = 1.5)

In [190]:
#sns.lmplot(data = train_df, x='started_attempt', y='unique_correct', hue='passed_course', height = 7, aspect = 1.5)

In [191]:
#sns.lmplot(data = train_df, x='started_attempt', y='correct', hue='passed_course', height = 7, aspect = 1.5)

In [192]:
#sns.lmplot(data = train_df, x='submission_total', y='unique_correct', hue='passed_course', height = 7, aspect = 1.5)

In [193]:
#sns.lmplot(data = train_df, x='submission_total', y='correct', hue='passed_course', height = 7, aspect = 1.5)

In [194]:
#sns.lmplot(data = train_df, x='event_timestamp_diff', y='discovered', hue='passed_course', height = 7, aspect = 1.5)

In [195]:
#sns.lmplot(data = train_df, x='event_timestamp_diff', y='passed', hue='passed_course', height = 7, aspect = 1.5)

In [196]:
#sns.lmplot(data = train_df, x='event_timestamp_diff', y='viewed', hue='passed_course', height = 7, aspect = 1.5)

In [197]:
#sns.lmplot(data = train_df, x='correct', y='unique_correct', hue='passed_course', height = 7, aspect = 1.5)

In [198]:
#sns.lmplot(data = train_df, x='submission_timestamp_diff', y='unique_correct', hue='passed_course', height = 7, aspect = 1.5)

In [199]:
#sns.lmplot(data = train_df, x='submission_timestamp_diff', y='correct', hue='passed_course', height = 7, aspect = 1.5)

In [200]:
#sns.lmplot(data = train_df, x='submission_timestamp_diff', y='started_attempt', hue='passed_course', height = 7, aspect = 1.5)

In [201]:
#sns.lmplot(data = train_df, x='submission_timestamp_diff', y='submission_total', hue='passed_course', height = 7, aspect = 1.5)

# Создание отношений переменных для train_df

In [202]:
# Add the derived sum and ratio features.
train_df = get_feature(train_df)

# Добавление passed_course и first_timestamp

In [203]:
# Add first_timestamp_diff (first submission timestamp minus first event timestamp).
train_df = get_first_timestamp_diff(train_df, event_data_train, submission_data_train)

In [204]:
# Attach the target. Users that are absent from `target` end up with
# passed_course=False; note that fillna(False) is applied to every column
# of the merged frame, not only to passed_course.
train_df = train_df.merge(target[['user_id','passed_course']], on='user_id', how='outer').fillna(False)

# Запись train_df

In [205]:
# Display (rows, columns) of the final training table.
train_df.shape

(19234, 39)

In [206]:
# Write the training table to CSV (UTF-8 with BOM, no index column).
# The path is handed to pandas directly so that pandas opens the file itself;
# writing through a text handle that was opened without newline='' can
# produce doubled carriage returns on Windows.
path = '/content/drive/MyDrive/Stepic_ML/Stepic_contest/df/train_df.csv'
train_df.to_csv(path, index=False, encoding='utf-8-sig')

# Загрузка и обработка test_data 

In [207]:
# Load the test event and submission tables from the course attachment URLs.
event_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')
submission_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')

In [208]:
# Add the 'date' and 'day' columns derived from 'timestamp' to both tables.
event_data_test = timestamp_to_date_day(event_data_test)
submission_data_test = timestamp_to_date_day(submission_data_test)

# Добавление min_timestamp

In [209]:
# Apply the same two-day window as for the training data; this adds
# 'min_timestamp', which add_activity_score and get_first_timestamp_diff
# read later. The test tables are overwritten with the filtered rows.
event_data_test = n_days_selection(event_data_test)
submission_data_test = n_days_selection(submission_data_test)

# Создание pivot_table

*   event_test_pivot_table
*   submission_test_pivot_table



In [210]:
# Per-user counts of each action / submission status for the test data.
event_test_pivot_table, submission_test_pivot_table = get_pivot_table(event_data_test, submission_data_test)

In [211]:
# Event table: total number of rows per user, then number of distinct steps.
event_test_pivot_table = add_action_count(
    event_data_test, event_test_pivot_table, data_type='event')
event_test_pivot_table = add_unique_action_count(
    event_data_test, event_test_pivot_table, data_type='event')

# Submission table: the same two counters.
submission_test_pivot_table = add_action_count(
    submission_data_test, submission_test_pivot_table, data_type='submission')
submission_test_pivot_table = add_unique_action_count(
    submission_data_test, submission_test_pivot_table, data_type='submission')

In [212]:
# Number of distinct 5-minute (300 s) periods in which each user was active.
event_test_pivot_table = add_activity_score(
    event_data_test, event_test_pivot_table,
    data_type='event', seconds=300)
submission_test_pivot_table = add_activity_score(
    submission_data_test, submission_test_pivot_table,
    data_type='submission', seconds=300)

  # Remove the CWD from sys.path while we load stuff.


# Добавление значений для уникальных шагов для пользователя

In [213]:
# Add unique_correct / unique_wrong (distinct steps per status) to
# submission_test_pivot_table.
for col_name in ['correct', 'wrong']:
    submission_test_pivot_table = add_unique_steps(submission_data_test, 
                                                    submission_test_pivot_table,
                                                    'submission_status',
                                                    col_name)

In [214]:
# Add unique_<action> (distinct steps per action) to event_test_pivot_table.
for col_name in ['viewed', 'passed', 'discovered', 'started_attempt']:
    event_test_pivot_table = add_unique_steps(event_data_test, 
                                                    event_test_pivot_table,
                                                    'action',
                                                    col_name)


# Обработка timestamp

*   event_test_pivot_table
*   submission_test_pivot_table

добавлена разница между последним и первым timestamp



In [215]:
# Add the span between each user's last and first timestamp
# (event_timestamp_diff / submission_timestamp_diff).
event_test_pivot_table = get_timestamp_diff(event_data_test, event_test_pivot_table, data_type='event')
submission_test_pivot_table = get_timestamp_diff(submission_data_test, submission_test_pivot_table, data_type='submission')

In [216]:
# Add 'days' and 'round_days', both derived from event_timestamp_diff.
event_test_pivot_table = add_days_and_round_days(event_test_pivot_table)

# Сборка test_df

In [217]:
# Outer-join the event and submission feature tables on user_id; gaps become 0.
test_df = get_df(event_test_pivot_table, submission_test_pivot_table)

In [218]:
# Display (rows, columns) before the derived features are added.
test_df.shape

(6184, 23)

# Создание отношений переменных для test_df

In [219]:
# Add the derived sum and ratio features.
test_df = get_feature(test_df)

# Добавление first_timestamp

In [220]:
# Add first_timestamp_diff (first submission timestamp minus first event timestamp).
test_df = get_first_timestamp_diff(test_df, event_data_test, submission_data_test)

# Запись test_df

In [221]:
# Display (rows, columns) of the final test table.
test_df.shape

(6184, 38)

In [222]:
# Write the test table to CSV (UTF-8 with BOM, no index column).
# The path is handed to pandas directly so that pandas opens the file itself;
# writing through a text handle that was opened without newline='' can
# produce doubled carriage returns on Windows.
path = '/content/drive/MyDrive/Stepic_ML/Stepic_contest/df/test_df.csv'
test_df.to_csv(path, index=False, encoding='utf-8-sig')