In [4]:
import pandas as pd
import numpy as np

In [44]:
# Read the four input tables: the train/test production files plus the
# IHS and Harmony files that are later joined to them on API.
prod_data_train, prod_data_test, ihs_data, harmony_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'production_data_train.csv',
        'production_data_test.csv',
        'IHS_data.csv',
        'Harmony_data.csv',
    )
)

In [45]:
def preprocess(prod_data, wells_data):
    """Build the post-peak, month-indexed production table joined with well data.

    Steps:
      1. Drop duplicate (API, Month, Year) production rows, keeping the last.
      2. Normalise ``API`` in both frames to 14-character zero-padded strings.
      3. Find each well's peak ``Liquid`` row (first row in frame order on
         ties) and keep only rows from the peak month onward: rows in the
         peak year with ``Month >= Max_Month`` followed by rows in later years.
      4. Add an ``index`` column computed by ``calc_month_index``.
      5. Drop wells that have neither a ``SpudDate`` nor a ``CompletionDate``;
         where only ``CompletionDate`` is missing, set it to ``SpudDate`` plus
         170 days.
      6. Inner-join production and well rows on ``API``.
      7. Keep wells whose peak is before 2016 or in January 2016, and only
         rows with ``index <= 36``.

    Neither input frame is modified; all work happens on copies.

    Parameters
    ----------
    prod_data : pandas.DataFrame
        Production rows; must contain ``API``, ``Year``, ``Month`` and
        ``Liquid``.
    wells_data : pandas.DataFrame
        Well rows; must contain ``API``, ``SpudDate`` and ``CompletionDate``.

    Returns
    -------
    pandas.DataFrame
        The joined rows, carrying the production columns, the added
        ``Max_Year``/``Max_Month``/``Max_Liquid``/``index`` columns and the
        well columns (with ``SpudDate``/``CompletionDate`` as datetimes).
    """
    # ---- production rows -------------------------------------------------
    # .copy() so the column assignment below neither touches the caller's
    # frame nor triggers SettingWithCopyWarning.
    prod = prod_data.drop_duplicates(subset=['API', 'Month', 'Year'], keep='last').copy()
    prod['API'] = prod['API'].astype(str).str.zfill(14)

    # One peak record per well: rows whose Liquid equals the per-well maximum,
    # reduced to the first such row per API.
    is_peak = prod['Liquid'] == prod.groupby('API')['Liquid'].transform('max')
    peak = (
        prod.loc[is_peak, ['API', 'Year', 'Month', 'Liquid']]
        .drop_duplicates(subset='API', keep='first')
        .rename(columns={"Year": "Max_Year", "Month": "Max_Month", "Liquid": "Max_Liquid"})
    )

    # Attach the peak year/month/value to every production row of the well.
    merged = prod.merge(peak, on='API')

    # Remove pre-peak months. The two pieces are concatenated (peak-year rows
    # first, later-year rows second) to keep the original row order.
    peak_year_rows = merged[
        (merged['Year'] == merged['Max_Year']) & (merged['Month'] >= merged['Max_Month'])
    ]
    later_year_rows = merged[merged['Year'] > merged['Max_Year']]
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    post_peak = pd.concat([peak_year_rows, later_year_rows])

    # Month index relative to the peak month.
    post_peak['index'] = calc_month_index(
        post_peak['Max_Year'], post_peak['Max_Month'], post_peak['Year'], post_peak['Month']
    )

    # ---- well rows -------------------------------------------------------
    # Keep wells with at least one of the two dates; copy before mutating.
    has_a_date = wells_data['SpudDate'].notnull() | wells_data['CompletionDate'].notnull()
    wells = wells_data[has_a_date].copy()
    wells['API'] = wells['API'].astype(str).str.zfill(14)
    wells['SpudDate'] = pd.to_datetime(wells['SpudDate'])
    wells['CompletionDate'] = pd.to_datetime(wells['CompletionDate'])

    # Fill missing completion dates from the spud date. The lag is 170 days
    # (the earlier comment described it as "six months").
    no_completion = wells['CompletionDate'].isnull()
    wells.loc[no_completion, 'CompletionDate'] = (
        wells.loc[no_completion, 'SpudDate'] + pd.Timedelta(days=170)
    )

    # ---- join and final window -------------------------------------------
    joined = post_peak.merge(wells, on='API')

    # Wells peaking before 2016 or in January 2016, limited to index <= 36.
    # NOTE(review): presumably the cutoff guarantees 36 months of history is
    # available in the data - confirm against the data's end date.
    peaked_by_cutoff = (joined['Max_Year'] < 2016) | (
        (joined['Max_Year'] == 2016) & (joined['Max_Month'] == 1)
    )
    return joined[peaked_by_cutoff & (joined['index'] <= 36)]

In [46]:
def calc_month_index(max_year, max_month, year, month):
    """Return the 1-based count of months from (max_year, max_month) to (year, month).

    The peak month itself maps to 1, the following month to 2, and so on.
    Works element-wise on scalars or on aligned pandas Series.
    """
    whole_years_in_months = (year - max_year) * 12
    month_offset = month - max_month
    return whole_years_in_months + month_offset + 1

In [57]:
# Preprocess each split against the IHS well table.
processed_train = preprocess(prod_data_train, ihs_data)
processed_test = preprocess(prod_data_test, ihs_data)

# Give the Harmony keys the same 14-character, zero-padded string form that
# preprocess() gives API, so the joins below can match.
harmony_data['API'] = harmony_data['API'].astype(str).map(lambda api: api.zfill(14))

# Inner-join the Harmony columns onto both processed splits.
complete_prod_train = pd.merge(processed_train, harmony_data, on='API')
complete_prod_test = pd.merge(processed_test, harmony_data, on='API')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [58]:
# Allow up to 500 columns in frame displays, then show the merged train frame.
pd.options.display.max_columns = 500
complete_prod_train

Unnamed: 0,API,Year,Month,Liquid,Gas,RatioGasOil,Water,PercentWater,DaysOn,_LastUpdate,Max_Year,Max_Month,Max_Liquid,index,PermitDate,SpudDate,CompletionDate,FirstProductionDate,operatorNameIHS,formation,BasinName,StateName,CountyName,LatWGS84,LonWGS84,BottomHoleLatitude,BottomHoleLongitude,LATERAL_LENGTH_BLEND,PROP_PER_FOOT,WATER_PER_FOOT,GOR_30,GOR_60,GOR_90
0,33053063590000,2015,4,18796,21796,1.159608,7255,0.385986,30,2016-08-11 14:03:14.000,2015,4,18796,1,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
1,33053063590000,2015,5,14626,15988,1.093122,4213,0.288049,31,2016-08-11 14:03:14.000,2015,4,18796,2,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
2,33053063590000,2015,6,10421,12311,1.181365,2717,0.260724,30,2016-04-06 13:47:11.383,2015,4,18796,3,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
3,33053063590000,2015,7,9777,11254,1.151069,2399,0.245372,31,2016-04-06 13:47:11.383,2015,4,18796,4,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
4,33053063590000,2015,8,10613,12188,1.148403,2742,0.258362,31,2016-04-06 13:47:11.383,2015,4,18796,5,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
5,33053063590000,2015,9,13119,17739,1.352161,2903,0.221282,30,2016-04-06 13:47:11.383,2015,4,18796,6,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
6,33053063590000,2015,10,10318,12979,1.257899,2286,0.221555,31,2016-04-06 13:47:11.383,2015,4,18796,7,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
7,33053063590000,2015,11,8126,9642,1.186562,1795,0.220896,30,2016-04-06 13:47:11.383,2015,4,18796,8,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
8,33053063590000,2015,12,7527,7570,1.005713,1679,0.223064,31,2016-08-11 14:03:14.000,2015,4,18796,9,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
9,33053063590000,2016,1,7934,8823,1.112049,1943,0.244895,31,2016-08-11 14:03:14.000,2015,4,18796,10,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055


In [59]:
# Columns removed from both splits before the windowing step.
cols = [
    'Year', 'Month', 'Gas', 'Water',
    'Max_Year', 'Max_Liquid', 'Max_Month',
    'SpudDate', 'PermitDate', '_LastUpdate',
]
complete_prod_train = complete_prod_train.drop(columns=cols)
complete_prod_test = complete_prod_test.drop(columns=cols)

In [60]:
# Display the test frame after the column drop in the previous cell.
complete_prod_test

Unnamed: 0,API,Liquid,RatioGasOil,PercentWater,DaysOn,date,index,CompletionDate,FirstProductionDate,operatorNameIHS,formation,BasinName,StateName,CountyName,LatWGS84,LonWGS84,BottomHoleLatitude,BottomHoleLongitude,LATERAL_LENGTH_BLEND,PROP_PER_FOOT,WATER_PER_FOOT,GOR_30,GOR_60,GOR_90
0,33053052530100,19518,1.763193,0.552618,31,2014-07-01,1,2014-04-15,2014-04-01 00:00:00.000,XTO ENERGY INCORPORATED,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.924744,-103.239808,47.92351,-103.19841,4583,586.6433,455.9575,1763.192950,1747.418087,1955.398230
1,33053052530100,18632,1.730893,0.534511,31,2014-08-01,2,2014-04-15,2014-04-01 00:00:00.000,XTO ENERGY INCORPORATED,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.924744,-103.239808,47.92351,-103.19841,4583,586.6433,455.9575,1763.192950,1747.418087,1955.398230
2,33053052530100,12700,2.580157,0.592756,30,2014-09-01,3,2014-04-15,2014-04-01 00:00:00.000,XTO ENERGY INCORPORATED,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.924744,-103.239808,47.92351,-103.19841,4583,586.6433,455.9575,1763.192950,1747.418087,1955.398230
3,33053061840000,24342,1.131131,0.236299,28,2015-01-01,1,2015-01-13,2015-01-01 00:00:00.000,HESS BAKKEN INVESTMENTS II LLC,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.733459,-103.472972,47.76064,-103.47362,9170,210.5043,164.4538,1131.131378,1230.246847,1279.864034
4,33053061840000,12766,1.419239,0.324534,28,2015-02-01,2,2015-01-13,2015-01-01 00:00:00.000,HESS BAKKEN INVESTMENTS II LLC,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.733459,-103.472972,47.76064,-103.47362,9170,210.5043,164.4538,1131.131378,1230.246847,1279.864034
5,33053061840000,6138,1.579831,0.334148,19,2015-03-01,3,2015-01-13,2015-01-01 00:00:00.000,HESS BAKKEN INVESTMENTS II LLC,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.733459,-103.472972,47.76064,-103.47362,9170,210.5043,164.4538,1131.131378,1230.246847,1279.864034
6,33053061840000,8023,1.363954,0.309859,25,2015-04-01,4,2015-01-13,2015-01-01 00:00:00.000,HESS BAKKEN INVESTMENTS II LLC,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.733459,-103.472972,47.76064,-103.47362,9170,210.5043,164.4538,1131.131378,1230.246847,1279.864034
7,33053061840000,7387,1.691891,0.309056,31,2015-07-01,7,2015-01-13,2015-01-01 00:00:00.000,HESS BAKKEN INVESTMENTS II LLC,BAKKEN,WILLISTON BASIN,NORTH DAKOTA,MCKENZIE,47.733459,-103.472972,47.76064,-103.47362,9170,210.5043,164.4538,1131.131378,1230.246847,1279.864034
8,05005072030100,6357,0.000000,0.688690,31,2014-03-01,1,2014-02-14,2014-02-01 00:00:00.000,CONOCOPHILLIPS COMPANY,NIOBRARA,DENVER BASIN,COLORADO,ARAPAHOE,39.710797,-104.490761,39.71121,-104.50620,3988,1727.1980,1062.9910,746.263961,749.258435,902.225008
9,05005072030100,4431,0.000000,0.763485,30,2014-04-01,2,2014-02-14,2014-02-01 00:00:00.000,CONOCOPHILLIPS COMPANY,NIOBRARA,DENVER BASIN,COLORADO,ARAPAHOE,39.710797,-104.490761,39.71121,-104.50620,3988,1727.1980,1062.9910,746.263961,749.258435,902.225008


In [87]:
# data prep

def data_prep(prod_data, arr):
    """Run ``generate_input`` on each API group of ``prod_data``, filling ``arr``.

    Parameters
    ----------
    prod_data : pandas.DataFrame
        Frame with an ``API`` column; rows are grouped by it (groups are
        visited in sorted key order, pandas' default).
    arr : list
        Accumulator passed through to ``generate_input``, which appends to it
        in place.

    Returns
    -------
    None
        Results are delivered only through ``arr``.
    """
    # Iterate the groups explicitly instead of using groupby.apply:
    # generate_input is called purely for its side effect on ``arr``, and
    # apply does not support mutating functions. Each group arrives here
    # exactly once and always includes the API column.
    for _, well_rows in prod_data.groupby('API'):
        generate_input(well_rows, arr)
    

In [88]:
def generate_input(group, arr, window=3, min_rows=36):
    """Append every run of ``window`` consecutive rows of ``group`` to ``arr``.

    ``group`` is converted with ``np.array``. If it has fewer than
    ``min_rows`` rows nothing is appended. Otherwise one entry is appended per
    start position ``0 .. len(group) - window``; each entry is a list of
    ``window`` row arrays, in the order the rows appear in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame or array-like
        Rows to slide the window over.
    arr : list
        Accumulator, extended in place.
    window : int, optional
        Number of consecutive rows per entry (default 3).
    min_rows : int, optional
        Minimum number of rows ``group`` must have to be used (default 36).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    rows = np.array(group)
    if len(rows) < min_rows:
        return

    # range() is empty when the group is shorter than the window.
    for start in range(len(rows) - window + 1):
        arr.append(list(rows[start:start + window]))
        

In [90]:
# Accumulator that data_prep fills in place with the generated row windows.
nourhan = []
data_prep(complete_prod_train, nourhan)
# Spot-check a single generated window (the entry at position 7).
print(nourhan[7]) 

[array(['05001097720000', 1374, 1.36972343522562, 0.240902474526929, 29, 8,
       Timestamp('2014-09-01 00:00:00'), '2014-08-01 00:00:00.000',
       'GREAT WESTERN OPERATING COMPANY LLC', 'NIOBRARA', 'DENVER BASIN',
       'COLORADO', 'ADAMS', 39.970960001, -104.923990012, 39.95874,
       -104.92343, 4358, 760.4881, 934.0413, 2841.9402595140637,
       3126.395856934779, 3270.499448825574], dtype=object), array(['05001097720000', 1284, 1.69392523364486, 0.130841121495327, 26, 9,
       Timestamp('2014-09-01 00:00:00'), '2014-08-01 00:00:00.000',
       'GREAT WESTERN OPERATING COMPANY LLC', 'NIOBRARA', 'DENVER BASIN',
       'COLORADO', 'ADAMS', 39.970960001, -104.923990012, 39.95874,
       -104.92343, 4358, 760.4881, 934.0413, 2841.9402595140637,
       3126.395856934779, 3270.499448825574], dtype=object), array(['05001097720000', 936, 1.34722222222222, 0.0908119658119658, 16,
       10, Timestamp('2014-09-01 00:00:00'), '2014-08-01 00:00:00.000',
       'GREAT WESTERN OPERATING C

In [91]:
# Print the final generated window.
print(nourhan[-1])

[array(['49021226580100', 467, 3.1134903640257, 1.90578158458244, 31, 34,
       Timestamp('2015-10-20 00:00:00'), '2015-10-01 00:00:00.000',
       'KAISER-FRANCIS OIL COMPANY', 'NIOBRARA', 'DENVER BASIN',
       'WYOMING', 'LARAMIE', 41.327090035, -104.558119994, 41.32778,
       -104.57682, 4249, 538.438, nan, 355.6751467710369,
       410.450966356478, 464.871591809028], dtype=object), array(['49021226580100', 414, 3.41304347826087, 1.80193236714976, 30, 35,
       Timestamp('2015-10-20 00:00:00'), '2015-10-01 00:00:00.000',
       'KAISER-FRANCIS OIL COMPANY', 'NIOBRARA', 'DENVER BASIN',
       'WYOMING', 'LARAMIE', 41.327090035, -104.558119994, 41.32778,
       -104.57682, 4249, 538.438, nan, 355.6751467710369,
       410.450966356478, 464.871591809028], dtype=object), array(['49021226580100', 412, 3.0145631067961203, 1.87864077669903, 31,
       36, Timestamp('2015-10-20 00:00:00'), '2015-10-01 00:00:00.000',
       'KAISER-FRANCIS OIL COMPANY', 'NIOBRARA', 'DENVER BASIN',
     