In [1]:
import pandas as pd
import re
import glob

In [2]:
#Import groups of customer data
# glob.glob returns matches in arbitrary (OS-dependent) order; sorting the paths
# keeps the concatenated row order reproducible from run to run.
# Note: pd.concat raises ValueError if no file matches 'Cust*'.
appended_data = [pd.read_csv(path) for path in sorted(glob.glob('Cust*'))]
cust_df = pd.concat(appended_data)
cust_df.head()

Unnamed: 0,owner_no,First Order Date,First Contribution Date,postal_code,state_desc,geo_area_desc,OP Prelim Capacity,LTV Tkt Value
0,2307895,2014-09-02 00:00:00,2015-03-17 00:00:00,33134,Florida,7-USA Balance,8,29997.0
1,1601821,2014-03-11 00:00:00,,19382,Pennsylvania,2-Greater Philadelphia(70 mi.),3,14400.7
2,2249854,2014-05-10 00:00:00,2014-05-13 00:00:00,19119,Pennsylvania,1-Philadelphia City (20 mi.),5,11584.0
3,2149845,2014-08-18 00:00:00,2015-12-04 00:00:00,18940,Pennsylvania,2-Greater Philadelphia(70 mi.),4,9634.0
4,2052298,2014-08-19 00:00:00,2015-12-30 00:00:00,19702,Delaware,2-Greater Philadelphia(70 mi.),4,8475.0


In [3]:
#Import groups of order data
# glob.glob returns matches in arbitrary (OS-dependent) order; sorting the paths
# keeps the concatenated row order reproducible from run to run.
# Note: pd.concat raises ValueError if no file matches 'Ord*'.
appended_data = [pd.read_csv(path) for path in sorted(glob.glob('Ord*'))]
order_df = pd.concat(appended_data)
order_df.head()

Unnamed: 0,owner_no,order_dt,Count of order_no,channel_desc,MOS_desc,delivery_desc,tot_ticket_paid_amt,tot_contribution_paid_amt,facility_desc,prod_season_desc,num_seats_pur
0,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,Academy of Music,Don Carlo,2
1,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,Academy of Music,Oscar,2
2,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,Academy of Music,The Barber of Seville,2
3,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,General Admission,40th Anniversary Voucher,2
4,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,Perelman,Ariadne auf Naxos,2


In [4]:
# Left-join the customer attributes onto every order row, keyed on owner_no.
# Orders whose owner_no is absent from cust_df keep NaN in the customer columns.
# validate='many_to_one' makes pandas raise if owner_no is duplicated in cust_df.
df = order_df.merge(
    cust_df,
    on='owner_no',
    how='left',
    validate='many_to_one',
)
df.head()

Unnamed: 0,owner_no,order_dt,Count of order_no,channel_desc,MOS_desc,delivery_desc,tot_ticket_paid_amt,tot_contribution_paid_amt,facility_desc,prod_season_desc,num_seats_pur,First Order Date,First Contribution Date,postal_code,state_desc,geo_area_desc,OP Prelim Capacity,LTV Tkt Value
0,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,Academy of Music,Don Carlo,2,,,,,,,
1,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,Academy of Music,Oscar,2,,,,,,,
2,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,Academy of Music,The Barber of Seville,2,,,,,,,
3,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,General Admission,40th Anniversary Voucher,2,,,,,,,
4,18251,2014-04-24 00:00:00,1,Phone,Ticketing,OP - US Mail,$1044,$6000,Perelman,Ariadne auf Naxos,2,,,,,,,


### Data Type Cleanup ###

In [5]:
#Create datetime data types
# errors='coerce' converts values that cannot be parsed into NaT instead of raising.
for date_col in ('order_dt', 'First Order Date', 'First Contribution Date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [8]:
#Create categorical data types
for cat_col in (
    'channel_desc',
    'MOS_desc',
    'delivery_desc',
    'postal_code',
    'state_desc',
    'OP Prelim Capacity',
    'facility_desc',
    'prod_season_desc',
):
    df[cat_col] = df[cat_col].astype('category')

# Keep only the prefix before the first '-' of geo_area_desc
# (e.g. '2-Greater Philadelphia(70 mi.)' -> '2').
# The element-wise accessor `.str[0]` is required here: a plain `[0]` on the split
# result looks up the row labelled 0 and would broadcast that single value
# (NaN for an unmatched customer) over the entire column, or raise on a list.
df['geo_area_desc'] = (
    df['geo_area_desc']
    .str.split('-', n=1)
    .str[0]
    .astype('category')
)

In [9]:
#Create numerical data types
# '$' is a regex end-of-string anchor, and the default of `regex` in
# Series.str.replace differs between pandas versions; regex=False pins the
# intended literal removal of the dollar sign.
# errors='coerce' turns anything still non-numeric into NaN instead of raising.
for amount_col in ('tot_ticket_paid_amt', 'tot_contribution_paid_amt'):
    df[amount_col] = pd.to_numeric(
        df[amount_col].str.replace('$', '', regex=False),
        errors='coerce',
    )

### Additional Data Cleanup ###

In [10]:
#Drop helper column
# Returns a new frame without the column and rebinds df to it.
df = df.drop(columns=['Count of order_no'])

In [11]:
#Clean up column headers
# Map the space-separated customer column names onto snake_case names
# matching the style of the order columns.
column_renames = {
    'First Order Date': 'first_order_dt',
    'First Contribution Date': 'first_cont_dt',
    'OP Prelim Capacity': 'prelim_capacity',
    'LTV Tkt Value': 'ltv_tkt_value',
}
df = df.rename(columns=column_renames)

In [12]:
#Incorporate flag for orders in which customer made their first contribution with their first order
# 1.0 where the first-order and first-contribution dates are equal, 0.0 otherwise.
# NaT never compares equal (not even to NaT), so rows with a missing date get 0.0.
same_first_date = df['first_order_dt'] == df['first_cont_dt']
df['first_cont_order'] = same_first_date.astype(float)

0.0    137764
1.0      1436
Name: first_cont_order, dtype: int64

In [13]:
df.head()

Unnamed: 0,owner_no,order_dt,channel_desc,MOS_desc,delivery_desc,tot_ticket_paid_amt,tot_contribution_paid_amt,facility_desc,prod_season_desc,num_seats_pur,first_order_dt,first_cont_dt,postal_code,state_desc,geo_area_desc,prelim_capacity,ltv_tkt_value,first_cont_order
0,18251,2014-04-24,Phone,Ticketing,OP - US Mail,1044.0,6000.0,Academy of Music,Don Carlo,2,NaT,NaT,,,,,,0.0
1,18251,2014-04-24,Phone,Ticketing,OP - US Mail,1044.0,6000.0,Academy of Music,Oscar,2,NaT,NaT,,,,,,0.0
2,18251,2014-04-24,Phone,Ticketing,OP - US Mail,1044.0,6000.0,Academy of Music,The Barber of Seville,2,NaT,NaT,,,,,,0.0
3,18251,2014-04-24,Phone,Ticketing,OP - US Mail,1044.0,6000.0,General Admission,40th Anniversary Voucher,2,NaT,NaT,,,,,,0.0
4,18251,2014-04-24,Phone,Ticketing,OP - US Mail,1044.0,6000.0,Perelman,Ariadne auf Naxos,2,NaT,NaT,,,,,,0.0
