In [1]:
import numpy as np
import pandas as pd

In [6]:
input_data = pd.DataFrame([
    [1, 5, 12, 4],
    [1, 5, 16, 5],
    [1, 5, 20, 6],
    [1, 5, 8, 3],
    [1, 5, 10, 3.5],
    [1, 5, 22, 6.5],
    [2, 8, 12, 44],
    [2, 8, 10, 33],
    [2, 8, 14, 50],
    [2, 8, 8, 15],
    [2, 8, 0, 0],
    [2, 8, 3, -5]
], columns = ['id', 'constant_feature', 'time_dependent_feature', 'target_variable'])

input_data.head()

Unnamed: 0,id,constant_feature,time_dependent_feature,target_variable
0,1,5,12,4.0
1,1,5,16,5.0
2,1,5,20,6.0
3,1,5,8,3.0
4,1,5,10,3.5


In [7]:
input_data = input_data.set_index('id')

In [8]:
unique_ids = input_data.index.unique()
unique_ids

Int64Index([1, 2], dtype='int64', name='id')

In [9]:
input_data.head(20)

Unnamed: 0_level_0,constant_feature,time_dependent_feature,target_variable
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5,12,4.0
1,5,16,5.0
1,5,20,6.0
1,5,8,3.0
1,5,10,3.5
1,5,22,6.5
2,8,12,44.0
2,8,10,33.0
2,8,14,50.0
2,8,8,15.0


In [10]:
aa = input_data.loc[1][['target_variable', 'time_dependent_feature']]
aa.columns = ['y', 'time_dependent_feature']
aa.head()

Unnamed: 0_level_0,y,time_dependent_feature
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.0,12
1,5.0,16
1,6.0,20
1,3.0,8
1,3.5,10


In [11]:
for i in range(1, 6): #3
    aa['target_lag_{}'.format(i)] = aa['y'].shift(i)

aa.head()

Unnamed: 0_level_0,y,time_dependent_feature,target_lag_1,target_lag_2,target_lag_3,target_lag_4,target_lag_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4.0,12,,,,,
1,5.0,16,4.0,,,,
1,6.0,20,5.0,4.0,,,
1,3.0,8,6.0,5.0,4.0,,
1,3.5,10,3.0,6.0,5.0,4.0,


In [12]:
for i in range(0, 6): #4
    aa['time_dependent_feature_lag_{}'.format(i)] = aa.time_dependent_feature.shift(i)
aa.head()

Unnamed: 0_level_0,y,time_dependent_feature,target_lag_1,target_lag_2,target_lag_3,target_lag_4,target_lag_5,time_dependent_feature_lag_0,time_dependent_feature_lag_1,time_dependent_feature_lag_2,time_dependent_feature_lag_3,time_dependent_feature_lag_4,time_dependent_feature_lag_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,4.0,12,,,,,,12,,,,,
1,5.0,16,4.0,,,,,16,12.0,,,,
1,6.0,20,5.0,4.0,,,,20,16.0,12.0,,,
1,3.0,8,6.0,5.0,4.0,,,8,20.0,16.0,12.0,,
1,3.5,10,3.0,6.0,5.0,4.0,,10,8.0,20.0,16.0,12.0,


In [14]:
X = []
Y = []

for identifier in unique_ids:
    single_process_data = input_data.loc[identifier] #1
    
    data = pd.DataFrame(single_process_data[['target_variable', 'time_dependent_feature']].copy()) #2
    data.columns = ['y', 'time_dependent_feature'] #2

    # last 5 values of the target variable as "lag" variables (the most recent one is the dependent feature (y))
    for i in range(1, 6): #3
        data['target_lag_{}'.format(i)] = data.y.shift(i)
        
    # last 6 values of the target variable as "time_dependent_feature" variables
    for i in range(0, 6): #4
        data['time_dependent_feature_lag_{}'.format(i)] = data.time_dependent_feature.shift(i)
    
    #rewrite constants
    data['constant_feature'] = single_process_data['constant_feature'] #5

    #the shift operations in the loops create many partial results. They are useless, and we don't want them
    data = data.dropna()
    y = data.y #6
    x = data.drop(['y', 'time_dependent_feature'], axis=1) #6
    
    X.append(np.array(x).flatten()) #7
    Y.append(y) #7

#### new exaple

In [3]:
np.random.seed(0) # ensures the same set of random numbers are generated
date = ['2019-01-01']*3 + ['2019-01-02']*3 + ['2019-01-03']*3
var1, var2 = np.random.randn(9), np.random.randn(9)*20 
group = ["group1", "group2", "group3"]*3 # to assign the groups for the multiple group case

df_manygrp = pd.DataFrame({"date": date, "group":group, "var1": var1}) # one var, many groups
df_combo = pd.DataFrame({"date": date, "group":group, "var1": var1, "var2": var2}) # many vars, many groups
df_onegrp = df_manygrp[df_manygrp["group"]=="group1"] # one var, one group

In [4]:
df_manygrp.head()

Unnamed: 0,date,group,var1
0,2019-01-01,group1,1.764052
1,2019-01-01,group2,0.400157
2,2019-01-01,group3,0.978738
3,2019-01-02,group1,2.240893
4,2019-01-02,group2,1.867558


In [5]:
df_onegrp.head()

Unnamed: 0,date,group,var1
0,2019-01-01,group1,1.764052
3,2019-01-02,group1,2.240893
6,2019-01-03,group1,0.950088


In [6]:
df_combo.head()

Unnamed: 0,date,group,var1,var2
0,2019-01-01,group1,1.764052,8.21197
1,2019-01-01,group2,0.400157,2.880871
2,2019-01-01,group3,0.978738,29.08547
3,2019-01-02,group1,2.240893,15.220755
4,2019-01-02,group2,1.867558,2.4335


In [13]:
for d in [df_onegrp, df_manygrp, df_combo]: # loop to apply the change to both dfs
    d["date"] = pd.to_datetime(d['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["date"] = pd.to_datetime(d['date'])


In [14]:
for d in [df_onegrp, df_manygrp, df_combo]: # loop to apply the change to both dfs
    values = pd.to_datetime(d['date']).copy()
    d.loc[:,"date"]=values

In [20]:
df_combo.head(20)

Unnamed: 0,date,group,var1,var2
0,2019-01-01,group1,1.764052,8.21197
1,2019-01-01,group2,0.400157,2.880871
2,2019-01-01,group3,0.978738,29.08547
3,2019-01-02,group1,2.240893,15.220755
4,2019-01-02,group2,1.867558,2.4335
5,2019-01-02,group3,-0.977278,8.877265
6,2019-01-03,group1,0.950088,6.673487
7,2019-01-03,group2,-0.151357,29.881581
8,2019-01-03,group3,-0.103219,-4.103165


In [21]:
grouped_df = df_combo.groupby(["group"])

In [24]:
grouped_df

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000202F674B940>

In [27]:
# grouped_df.group??

In [25]:
grouped_df.groups

{'group1': [0, 3, 6], 'group2': [1, 4, 7], 'group3': [2, 5, 8]}

In [26]:
grouped_df.groups.keys()

dict_keys(['group1', 'group2', 'group3'])

In [33]:
grouped_df.get_group('group1')
# it is dataframe

Unnamed: 0,date,group,var1,var2
0,2019-01-01,group1,1.764052,8.21197
3,2019-01-02,group1,2.240893,15.220755
6,2019-01-03,group1,0.950088,6.673487


In [18]:
def lag_by_group(key, value_df):
    df = value_df.assign(group = key) # this pandas method returns a copy of the df, with group columns assigned the key value
    return (df.sort_values(by=["date"], ascending=True)
        .set_index(["date"])
        .shift(1)
               ) # the parenthesis allow you to chain methods and avoid intermediate variable assignment

In [34]:
dflist = [lag_by_group(g, grouped_df.get_group(g)) for g in grouped_df.groups.keys()]
dflist

[             group      var1       var2
 date                                   
 2019-01-01     NaN       NaN        NaN
 2019-01-02  group1  1.764052   8.211970
 2019-01-03  group1  2.240893  15.220755,
              group      var1      var2
 date                                  
 2019-01-01     NaN       NaN       NaN
 2019-01-02  group2  0.400157  2.880871
 2019-01-03  group2  1.867558  2.433500,
              group      var1       var2
 date                                   
 2019-01-01     NaN       NaN        NaN
 2019-01-02  group3  0.978738  29.085470
 2019-01-03  group3 -0.977278   8.877265]

In [36]:
pd.concat(dflist, axis=0).reset_index()

Unnamed: 0,date,group,var1,var2
0,2019-01-01,,,
1,2019-01-02,group1,1.764052,8.21197
2,2019-01-03,group1,2.240893,15.220755
3,2019-01-01,,,
4,2019-01-02,group2,0.400157,2.880871
5,2019-01-03,group2,1.867558,2.4335
6,2019-01-01,,,
7,2019-01-02,group3,0.978738,29.08547
8,2019-01-03,group3,-0.977278,8.877265


In [41]:
''.join(('a','t'))

'at'

In [39]:
def lag_by_group(key, value_df):
    df = value_df.assign(group_new_name = key) # this pandas method returns a copy of the df, with group (NAAM KE) columns assigned the key value
    print(df.head())
    return

In [40]:
lag_by_group('group1', grouped_df.get_group('group1'))

        date   group      var1       var2 group_new_name
0 2019-01-01  group1  1.764052   8.211970         group1
3 2019-01-02  group1  2.240893  15.220755         group1
6 2019-01-03  group1  0.950088   6.673487         group1


In [None]:
dflist_2 = [lag_by_group(g, grouped_df.get_group(g)) for g in grouped_df.groups.keys()]
dflist_2

In [None]:
pd.concat(dflist, axis=0).reset_index()