# Notebook Prep

In [1]:
#import libraries

import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

In [2]:
#Define variable: 'path'
#Root folder of the project; the import cell joins sub-folder names onto it with os.path.join.
#NOTE(review): this is an absolute, machine-specific Windows path - edit it before running on another machine.

path = r'C:\Users\PC Planet\Desktop\Self-Education\Data Immersion\Achievement 4\Instacart Basket Analysis'

In [3]:
#Import test dataframe
#Reads 'dataframe_after_exclusion.pkl' from the '02 Data/Prepared Data' folder under 'path'.
#NOTE(review): unpickling can execute arbitrary code - only load pickle files that come from a trusted source.

df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'dataframe_after_exclusion.pkl'))

In [4]:
#Keep only the first 1000 rows of the test dataframe as a small working subset

df2 = df.head(1000)

# Test Code

In [5]:
#Order the rows by user first, then by each user's order_number (both ascending)

df2 = df2.sort_values(by = ['user_id', 'order_number'], ascending = True)

In [6]:
#Renumber the rows from 0 after sorting and discard the old index

df2 = df2.reset_index(drop = True)

In [7]:
#Add and fill income_profile column
#Every label ends with a space; rows matching no band (e.g. missing income) keep no label

income_values = df2['income']

income_bands = [
    (income_values < 32500, 'Struggling '),
    ((income_values >= 32500) & (income_values < 65000), 'Lower-Middle Class '),
    ((income_values >= 65000) & (income_values < 97500), 'Upper-Middle Class '),
    ((income_values >= 97500) & (income_values < 195000), 'High-Class '),
    (income_values >= 195000, 'Wealthy '),
]

for band_mask, band_label in income_bands:
    df2.loc[band_mask, 'income_profile'] = band_label

df2['income_profile'].value_counts()

High-Class             556
Lower-Middle Class     444
Name: income_profile, dtype: int64

In [8]:
#Add and fill age_profile column
#Every label ends with a space; rows matching no band (e.g. missing age) keep no label

age_values = df2['age']

age_bands = [
    (age_values < 30, 'Young '),
    ((age_values >= 30) & (age_values < 45), 'Adult '),
    ((age_values >= 45) & (age_values < 65), 'Middle-Aged '),
    (age_values >= 65, 'Elderly '),
]

for band_mask, band_label in age_bands:
    df2.loc[band_mask, 'age_profile'] = band_label

df2['age_profile'].value_counts()

Adult           505
Elderly         277
Middle-Aged     218
Name: age_profile, dtype: int64

In [9]:
#Create users_alcohol subset: every row whose department_id equals 5

users_alcohol = df2[df2['department_id'] == 5]

In [10]:
#Sorted array of the distinct user_ids that appear in users_alcohol

unique_alcohol = np.unique(users_alcohol['user_id'].to_numpy())

In [30]:
#Add and fill alcohol_profile column
#A user is 'Alcohol-Consuming ' when their user_id is in unique_alcohol, otherwise 'Abstinent '

buys_alcohol = df2['user_id'].isin(unique_alcohol)

df2.loc[buys_alcohol, 'alcohol_profile'] = 'Alcohol-Consuming '
df2.loc[~buys_alcohol, 'alcohol_profile'] = 'Abstinent '

df2['alcohol_profile'].value_counts()

Abstinent     1000
Name: alcohol_profile, dtype: int64

In [14]:
#Create users_diet subsets: rows from department 12 (meat), 16 (dairy) and 4 (produce)

department_ids = df2['department_id']

users_meat = df2[department_ids == 12]
users_dairy = df2[department_ids == 16]
users_produce = df2[department_ids == 4]

In [32]:
#Sorted arrays of the distinct user_ids found in each diet subset

unique_meat = np.unique(users_meat['user_id'].to_numpy())
unique_dairy = np.unique(users_dairy['user_id'].to_numpy())
unique_produce = np.unique(users_produce['user_id'].to_numpy())

In [33]:
#Add and fill diet_profile column
#Rules are mutually exclusive: meat buyers first, then dairy-only, then produce-only, then none

eats_meat = df2['user_id'].isin(unique_meat)
eats_dairy = df2['user_id'].isin(unique_dairy)
eats_produce = df2['user_id'].isin(unique_produce)

df2.loc[eats_meat, 'diet_profile'] = 'Carnivorous '
df2.loc[~eats_meat & eats_dairy, 'diet_profile'] = 'Vegetarian '
df2.loc[~eats_meat & ~eats_dairy & eats_produce, 'diet_profile'] = 'Vegan '

#Users who bought from none of the three departments get an empty label
df2.loc[~(eats_meat | eats_dairy | eats_produce), 'diet_profile'] = ''

df2['diet_profile'].value_counts()

Carnivorous     650
Vegetarian      264
                 86
Name: diet_profile, dtype: int64

In [22]:
#Add and fill marital_profile column
#Any marital_status other than 'married' or 'divorced/widowed' is labelled 'Single '

is_married = df2['marital_status'] == 'married'
is_divorced_or_widowed = df2['marital_status'] == 'divorced/widowed'

df2.loc[is_married, 'marital_profile'] = 'Married '
df2.loc[is_divorced_or_widowed, 'marital_profile'] = 'Divorced/Widowed '
df2.loc[~(is_married | is_divorced_or_widowed), 'marital_profile'] = 'Single '

df2['marital_profile'].value_counts()

Married              723
Divorced/Widowed     277
Name: marital_profile, dtype: int64

In [23]:
#Add and fill dependants_profile column
#Empty label for users without dependants, ' with Children' (leading space) otherwise

dependant_counts = df2['dependants']

dependant_labels = [
    (dependant_counts == 0, ''),
    (dependant_counts > 0, ' with Children'),
]

for label_mask, label_text in dependant_labels:
    df2.loc[label_mask, 'dependants_profile'] = label_text

df2['dependants_profile'].value_counts()

 with Children    723
                  277
Name: dependants_profile, dtype: int64

In [24]:
#Create users_pets subset: every row whose department_id equals 8

users_pets = df2[df2['department_id'] == 8]

In [36]:
#Sorted array of the distinct user_ids that appear in users_pets

unique_pets = np.unique(users_pets['user_id'].to_numpy())

In [37]:
#Add and fill pets_profile column
#The label is appended directly after the gender / dependants text when the profile
#string is assembled, so every non-empty label must start with a space.
#(The original 'with Pets' had none and produced strings such as 'Femalewith Pets'.)

has_pets = df2['user_id'].isin(unique_pets)

df2.loc[has_pets & (df2['dependants'] == 0), 'pets_profile'] = ' with Pets'

df2.loc[has_pets & (df2['dependants'] > 0), 'pets_profile'] = ' and Pets'

df2.loc[~has_pets, 'pets_profile'] = ''

df2['pets_profile'].value_counts()

    1000
Name: pets_profile, dtype: int64

In [26]:
#Concatenate the profile columns, left to right, into a single assumed_profile string

profile_columns = [
    'income_profile',
    'age_profile',
    'alcohol_profile',
    'diet_profile',
    'marital_profile',
    'gender',
    'dependants_profile',
    'pets_profile',
]

combined_profile = df2[profile_columns[0]]
for column_name in profile_columns[1:]:
    combined_profile = combined_profile + df2[column_name]

df2['assumed_profile'] = combined_profile

df2['assumed_profile'].value_counts()

Lower-Middle Class Adult Alcohol-Consuming Married Female with Children    299
High-Class Elderly Alcohol-Consuming Divorced/Widowed Male                 205
High-Class Middle-Aged Abstinent Married Female with Children              204
High-Class Adult Abstinent Married Female with Children                    147
Lower-Middle Class Elderly Abstinent Divorced/Widowed Female                72
Lower-Middle Class Adult Abstinent Married Female with Children             59
Lower-Middle Class Middle-Aged Abstinent Married Female with Children       14
Name: assumed_profile, dtype: int64

In [27]:
#Remove the helper columns now that they have been merged into assumed_profile

intermediate_columns = [
    'income_profile',
    'age_profile',
    'alcohol_profile',
    'diet_profile',
    'marital_profile',
    'dependants_profile',
    'pets_profile',
]

df2.drop(columns = intermediate_columns, inplace = True)

In [28]:
#Preview the first rows of df2
df2.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,gender,state,region,age,date_joined,dependants,marital_status,income,exclusion_flag,assumed_profile
0,2539329,1,1,2,8,,196,1,0,Soda,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,Lower-Middle Class Adult Abstinent Married Fem...
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,Lower-Middle Class Adult Abstinent Married Fem...
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,Lower-Middle Class Adult Abstinent Married Fem...
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,Lower-Middle Class Adult Abstinent Married Fem...
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,Lower-Middle Class Adult Abstinent Married Fem...


# Check Success

In [None]:
#Add same_user flag: True when a row has the same user_id as the row directly above it
#(the first row has nothing above it, so it is always False)

previous_user_id = df2['user_id'].shift()
df2['same_user'] = df2['user_id'].eq(previous_user_id)

In [34]:
#Create empty 'profile' list for collecting profile strings
profile = []

In [41]:
#Build one 'assumed_profile' string per user and write it to all of that user's rows
#Department ids are the ones used by the original conditionals:
#5 alcohol, 4 produce, 16 dairy, 12 meat, 18 babies, 8 pets


def pick_band(value, bands, top_label):
    """Return the label of the band that value falls into.

    bands is a list of (exclusive_upper_bound, label) pairs in ascending order;
    a value at or above the last bound gets top_label. Returns None when value
    cannot be ordered (for example NaN), so nothing is added to the profile.
    """
    for upper_bound, label in bands:
        if value < upper_bound:
            return label
    if value >= bands[-1][0]:
        return top_label
    return None


def count_department_rows(user_rows, department_id):
    """Return how many rows of user_rows belong to department_id.

    The values are compared, not the index (which is what `x in series` tests),
    and the column is passed through pd.to_numeric so the count is correct
    whether department_id is stored as integers or as numeric strings.
    """
    department_ids = pd.to_numeric(user_rows['department_id'], errors = 'coerce')
    return int((department_ids == department_id).sum())


def recent_order_window(user_rows, window_days = 150):
    """Return (earliest_order, number_of_orders) for a user's most recent orders.

    Starting at the last order, earlier orders are included for as long as the
    summed days_since_prior_order stays within window_days. A missing gap (the
    first order has NaN) compares False and ends the walk.
    """
    orders = (
        user_rows[['order_number', 'days_since_prior_order']]
        .drop_duplicates(subset = 'order_number')
        .sort_values('order_number')
    )
    order_numbers = orders['order_number'].tolist()
    gaps = orders['days_since_prior_order'].tolist()

    #Plain Python numbers are used so the sum is not affected by index alignment
    position = len(order_numbers) - 1
    elapsed_days = gaps[position]
    number_of_orders = 1

    while position > 0 and elapsed_days <= window_days:
        position -= 1
        elapsed_days += gaps[position]
        number_of_orders += 1

    return order_numbers[position], number_of_orders


def build_assumed_profile(user_rows):
    """Return the space-separated profile string for one user's rows.

    user_rows must hold every row of a single user. Income, age, marital
    status, gender and dependants are read from the first row; the purchase
    based parts only look at the orders returned by recent_order_window.
    """
    first_row = user_rows.iloc[0]
    labels = []

    #Income and age bands
    labels.append(pick_band(
        first_row['income'],
        [(32500, 'Struggling'), (65000, 'Lower-Middle Class'),
         (97500, 'Upper-Middle Class'), (195000, 'High-Class')],
        'Wealthy',
    ))
    labels.append(pick_band(
        first_row['age'],
        [(30, 'Young'), (45, 'Adult'), (65, 'Middle-Aged')],
        'Elderly',
    ))

    #Keep only the rows of the recent orders (boolean indexing returns a new frame)
    earliest_order, number_of_orders = recent_order_window(user_rows)
    recent_rows = user_rows[user_rows['order_number'] >= earliest_order]

    #Alcohol: mean number of alcohol items per recent order
    alcohol_purchased = count_department_rows(recent_rows, 5)
    alcohol_mean = alcohol_purchased / number_of_orders

    if alcohol_purchased == 0:
        labels.append('Abstinent')
    elif alcohol_mean < 3:
        labels.append('Casual-Drinking')
    else:
        labels.append('Heavy-Drinking')

    #Diet
    produce = count_department_rows(recent_rows, 4)
    dairy = count_department_rows(recent_rows, 16)
    meat = count_department_rows(recent_rows, 12)
    meat_mean = meat / number_of_orders

    if produce > 0 and dairy == 0 and meat == 0:
        labels.append('Vegan')
    elif produce > 0 and meat == 0:
        labels.append('Vegetarian')
    elif meat_mean > 5:
        labels.append('Carnivorous')

    #Marital status
    marital_status = first_row['marital_status']

    if marital_status == 'married':
        labels.append('Married')
    elif marital_status == 'divorced/widowed':
        labels.append('Divorced/Widowed')
    else:
        labels.append('Single')

    #Gender: compared case-insensitively because the data stores 'Male' / 'Female'
    if str(first_row['gender']).strip().lower() == 'male':
        labels.append('Male')
    else:
        labels.append('Female')

    #Dependants: 'Infant' when at least one baby item is bought per recent order
    dependants = first_row['dependants']
    babies_mean = count_department_rows(recent_rows, 18) / number_of_orders

    if dependants == 0:
        dependants_append = ''
    elif dependants > 0 and babies_mean >= 1:
        dependants_append = 'with Infant Children'
    else:
        dependants_append = 'with Children'

    labels.append(dependants_append)

    #Pets: at least two pet items in the recent orders
    if count_department_rows(recent_rows, 8) >= 2:
        if dependants_append == '':
            labels.append('with Pets')
        else:
            labels.append('and Pets')

    #Empty / missing parts are skipped so the result has no doubled spaces
    return ' '.join(label for label in labels if label)


#One profile per user, computed once per user instead of once per row
profile_by_user = {
    user_id: build_assumed_profile(user_rows)
    for user_id, user_rows in df2.groupby('user_id', sort = False)
}

#Every row of a user receives that user's profile
df2['assumed_profile'] = df2['user_id'].map(profile_by_user)

In [42]:
#Preview the first 30 rows of df2
df2.head(30)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,gender,state,region,age,date_joined,dependants,marital_status,income,exclusion_flag,same_user
0,2539329,1,1,2,8,,196,1,0,Soda,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,False
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
5,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
6,2398795,1,2,3,7,15.0,12427,3,1,Original Beef Jerky,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
7,2398795,1,2,3,7,15.0,26088,5,1,Aged White Cheddar Popcorn,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
8,2398795,1,2,3,7,15.0,10258,2,0,Pistachios,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
9,2398795,1,2,3,7,15.0,13176,4,0,Bag of Organic Bananas,...,Female,Alabama,South,31,2/17/2019,3,married,40423,False,True
