In [4]:
import pandas as pd
import numpy as np

In [178]:
# Load the raw inputs from CSV files in the working directory:
# the train/test production tables and the two per-well attribute tables
# (IHS and Harmony) that are later joined to production on the API column.
prod_data_train = pd.read_csv('production_data_train.csv')
prod_data_test = pd.read_csv('production_data_test.csv')
ihs_data = pd.read_csv('IHS_data.csv')
harmony_data = pd.read_csv('Harmony_data.csv')

In [179]:
def preprocess(prod_data, wells_data):
    prod_data = prod_data.drop_duplicates(subset=['API', 'Month', 'Year'], keep='last', inplace=False)
    
    # adding zeros to API
    prod_data['API'] = (prod_data['API']).astype(str)
    prod_data['API'] = prod_data['API'].apply(lambda x: x.zfill(14))
    
    # adding zeros to API
    wells_data['API'] = (wells_data['API']).astype(str)
    wells_data['API'] = wells_data['API'].apply(lambda x: x.zfill(14))
    
    # storing peak records 
    idx_max = prod_data.groupby(['API'])['Liquid'].transform('max') == prod_data['Liquid']
    max_month_prod_data = prod_data[idx_max].drop_duplicates(subset='API', keep='first', inplace=False)
    list_indices = ['API', 'Year', 'Month', 'Liquid']
    max_month_prod_data = max_month_prod_data[list_indices]
    max_month_prod_data = max_month_prod_data.rename(columns={"Year": "Max_Year", "Month": "Max_Month", "Liquid": "Max_Liquid"})
    
    # merging the two dataframes to get max month and max year
    new_prod_data_orig = prod_data.merge(max_month_prod_data, on='API')
    
    # Remove Pre-Peak Months (clean up)
    new_prod_data = new_prod_data_orig[((new_prod_data_orig['Year'] == new_prod_data_orig['Max_Year']))]
    new_prod_data = new_prod_data[(new_prod_data['Month'] >= new_prod_data['Max_Month'])]

    new_prod_data2 = new_prod_data_orig[((new_prod_data_orig['Year'] > new_prod_data_orig['Max_Year']))]
    new_prod_data3 = new_prod_data.append(new_prod_data2)
    
    # adding month index column to post peak production data
    new_prod_data3['index'] = calc_month_index(new_prod_data3['Max_Year'], new_prod_data3['Max_Month'], new_prod_data3['Year'], new_prod_data3['Month'])
    indexed_prod_data = new_prod_data3
    
    # removed nullified SpudDates and CompletionDates
    wells_data = wells_data[~((wells_data['SpudDate'].isnull()) & (wells_data['CompletionDate'].isnull()))]
    
    # replacing null CompletionDates with SpudDates + six months
    wells_data['SpudDate'] = pd.to_datetime(wells_data['SpudDate'])
    wells_data['CompletionDate'] = pd.to_datetime(wells_data['CompletionDate'])
    wells_data.loc[wells_data['CompletionDate'].isnull(), 'CompletionDate'] = wells_data['SpudDate'] + timedelta(days=170) 
    
    # replacing StateNames with indices
    unique_state_names = wells_data.StateName.unique()
    unique_state_ids = list(range(0, len(unique_state_names)))
    dict_state_names = dict(zip( unique_state_names, unique_state_ids))
    wells_data['StateName'] = wells_data['StateName'].map(dict_state_names)
                            
    # replacing CountyNames with indices                         
    unique_county_names = wells_data.CountyName.unique()
    unique_county_ids = list(range(0, len(unique_county_names)))
    dict_county_names = dict(zip(unique_county_names, unique_county_ids))
    wells_data['CountyName'] = wells_data['CountyName'].map(dict_county_names)
    
    # replacing BasinName with indices
    unique_basin_names = wells_data.BasinName.unique()
    unique_basin_ids = list(range(0, len(unique_basin_names)))
    dict_basin_names = dict(zip(unique_basin_names, unique_basin_ids))
    wells_data['BasinName'] = wells_data['BasinName'].map(dict_basin_names)
    
    #replacing Formation with indices
    unique_formation_names = wells_data.formation.unique()
    unique_formation_ids = list(range(0, len(unique_formation_names)))
    dict_formation_names = dict(zip(unique_formation_names, unique_formation_ids))
    wells_data['formation'] = wells_data['formation'].map(dict_formation_names)
        
#     wells_data[wells_data['CompletionDate'] >= pd.Timestamp(2014, 1 , 1)]['CompletionDate'] = 1  
#     wells_data[wells_data['CompletionDate'] !=  1]['CompletionDate'] = 0 
#     print(wells_data[wells_data['CompletionDate'] < pd.Timestamp(2014, 1 , 1)])

    indexed_prod_data = indexed_prod_data.merge(wells_data, on='API')
    
    three_years_data = indexed_prod_data[(indexed_prod_data['Max_Year'] < 2016) |((indexed_prod_data['Max_Year'] == 2016) & (indexed_prod_data['Max_Month'] == 1))] 
    three_years_data = three_years_data[three_years_data['index'] <= 36] 
    
    return three_years_data

In [180]:
def calc_month_index(max_year, max_month, year, month):
    """Return the 1-based number of months from the peak month to (year, month).

    The peak month itself maps to 1, the following month to 2, and so on.
    Works element-wise on scalars or on aligned pandas Series / numpy arrays.
    """
    whole_years = year - max_year
    month_offset = month - max_month
    return whole_years * 12 + month_offset + 1

In [181]:
# Clean each production split and join it with the IHS well attributes.
processed_train = preprocess(prod_data_train, ihs_data)
processed_test = preprocess(prod_data_test, ihs_data)

# Bring the Harmony API numbers to the same zero-padded 14-character string
# form that preprocess() produces, so the merge keys match.
harmony_data['API'] = harmony_data['API'].astype(str).str.zfill(14)

# Replace missing per-foot values with 0 in those two columns only.
# The previous form, harmony_data[mask] = 0, assigned 0 to EVERY column of the
# matching rows (API included), so those wells never matched in the merges below.
per_foot_columns = ['WATER_PER_FOOT', 'PROP_PER_FOOT']
harmony_data[per_foot_columns] = harmony_data[per_foot_columns].fillna(0)

# Inner joins: only wells present in the Harmony table are kept.
complete_prod_train = processed_train.merge(harmony_data, on='API')
complete_prod_test = processed_test.merge(harmony_data, on='API')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [182]:
# Show every column of the wide merged frame, but only a preview of its rows
# instead of dumping the whole table into the notebook output.
pd.set_option('display.max_columns', 500)
complete_prod_train.head(10)

Unnamed: 0,API,Year,Month,Liquid,Gas,RatioGasOil,Water,PercentWater,DaysOn,_LastUpdate,Max_Year,Max_Month,Max_Liquid,index,PermitDate,SpudDate,CompletionDate,FirstProductionDate,operatorNameIHS,formation,BasinName,StateName,CountyName,LatWGS84,LonWGS84,BottomHoleLatitude,BottomHoleLongitude,LATERAL_LENGTH_BLEND,PROP_PER_FOOT,WATER_PER_FOOT,GOR_30,GOR_60,GOR_90
0,33053063590000,2015,4,18796,21796,1.159608,7255,0.385986,30,2016-08-11 14:03:14.000,2015,4,18796,1,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
1,33053063590000,2015,5,14626,15988,1.093122,4213,0.288049,31,2016-08-11 14:03:14.000,2015,4,18796,2,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
2,33053063590000,2015,6,10421,12311,1.181365,2717,0.260724,30,2016-04-06 13:47:11.383,2015,4,18796,3,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
3,33053063590000,2015,7,9777,11254,1.151069,2399,0.245372,31,2016-04-06 13:47:11.383,2015,4,18796,4,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
4,33053063590000,2015,8,10613,12188,1.148403,2742,0.258362,31,2016-04-06 13:47:11.383,2015,4,18796,5,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
5,33053063590000,2015,9,13119,17739,1.352161,2903,0.221282,30,2016-04-06 13:47:11.383,2015,4,18796,6,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
6,33053063590000,2015,10,10318,12979,1.257899,2286,0.221555,31,2016-04-06 13:47:11.383,2015,4,18796,7,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
7,33053063590000,2015,11,8126,9642,1.186562,1795,0.220896,30,2016-04-06 13:47:11.383,2015,4,18796,8,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
8,33053063590000,2015,12,7527,7570,1.005713,1679,0.223064,31,2016-08-11 14:03:14.000,2015,4,18796,9,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055
9,33053063590000,2016,1,7934,8823,1.112049,1943,0.244895,31,2016-08-11 14:03:14.000,2015,4,18796,10,2014-10-09,2015-01-03,2015-03-17,2015-03-01 00:00:00.000,NEWFIELD PRODUCTION COMPANY,0,0,0,2,47.702514,-102.861247,47.67482,-102.85867,9762,316.453,314.3408,1416.471590,1448.746335,1469.060055


In [183]:
# Columns removed before windowing. generate_input() slices the remaining
# columns by position, so the order of what is left matters.
cols = [
    'Year', 'Month', 'Gas', 'Water', 'Max_Year',
    'operatorNameIHS', 'CompletionDate', 'FirstProductionDate',
    'Max_Liquid', 'Max_Month', 'SpudDate', 'PermitDate', '_LastUpdate',
]
complete_prod_train = complete_prod_train.drop(columns=cols)
complete_prod_test = complete_prod_test.drop(columns=cols)

In [193]:
def data_prep(prod_data, arr_inp, arr_out):
    """Append the sliding windows of every well in ``prod_data`` to the two lists.

    The frame is grouped by the ``API`` column (groups visited in sorted key
    order) and each group is handed to ``generate_input`` exactly once.

    Groups are iterated explicitly rather than through ``groupby.apply``:
    ``generate_input`` works purely by side effect, older pandas versions call
    an applied function twice on the first group, and newer versions may hide
    the grouping column from the applied frame, which would shift the
    positional column slices used by ``generate_input``.

    Parameters
    ----------
    prod_data : pandas.DataFrame
        Frame containing an ``API`` column; each group keeps all its columns.
    arr_inp, arr_out : list
        Accumulators that are extended in place.

    Returns
    -------
    None
    """
    for _, well_rows in prod_data.groupby('API'):
        generate_input(well_rows, arr_inp, arr_out)

In [194]:
def generate_input(group, arr_inp, arr_out):
    """Append 3-row sliding windows of one group to ``arr_inp`` / ``arr_out``.

    ``group`` is converted to a 2-D array. For each window start ``s``:
      * the input window is rows ``s .. s+2``, all columns except column 0;
      * the target window is rows ``s+1 .. s+3``, columns 1-4.
    (In this notebook column 0 is presumably the API and columns 1-4 the
    production measurements -- confirm against the frame's column order.)

    Behaviour by group length:
      * fewer than 3 rows: nothing is appended;
      * 3 to 35 rows: every start is used; the last target window has only two
        rows available and is completed with a ``[0, 0, 0, 0]`` row;
      * 36 rows or more: the last start is skipped, so no padding occurs.

    Both lists are modified in place; nothing is returned.
    """
    rows = np.array(group)
    n_rows = len(rows)

    if n_rows < 3:
        return

    if n_rows >= 36:
        for start in range(n_rows - 3):
            arr_inp.append(list(rows[start:start + 3, 1:]))
            arr_out.append(list(rows[start + 1:start + 4, 1:5]))
        return

    final_start = n_rows - 3
    for start in range(final_start + 1):
        arr_inp.append(list(rows[start:start + 3, 1:]))
        # For the final start this slice runs off the end and yields two rows.
        target = list(rows[start + 1:start + 4, 1:5])
        if start == final_start:
            target.append([0, 0, 0, 0])
        arr_out.append(target)
            

In [197]:
# Accumulators filled in place by data_prep():
#   nourhan -> input windows (3 consecutive rows, all columns after the first)
#   mariem  -> target windows (the 3 rows shifted by one, columns 1-4)
nourhan = []
mariem = []
data_prep(complete_prod_train, nourhan, mariem)
# Spot-check one input window.
print(nourhan[8]) 

[array([1284, 1.69392523364486, 0.130841121495327, 26, 9, 27, 2, 3, 17,
       39.970960001, -104.923990012, 39.95874, -104.92343, 4358, 760.4881,
       934.0413, 2841.9402595140637, 3126.395856934779, 3270.499448825574],
      dtype=object), array([936, 1.34722222222222, 0.0908119658119658, 16, 10, 27, 2, 3, 17,
       39.970960001, -104.923990012, 39.95874, -104.92343, 4358, 760.4881,
       934.0413, 2841.9402595140637, 3126.395856934779, 3270.499448825574],
      dtype=object), array([2737, 1.77603215199123, 0.0968213372305444, 19, 11, 27, 2, 3, 17,
       39.970960001, -104.923990012, 39.95874, -104.92343, 4358, 760.4881,
       934.0413, 2841.9402595140637, 3126.395856934779, 3270.499448825574],
      dtype=object)]


In [196]:
# Spot-check one target window (columns 1-4 of three consecutive rows).
print(mariem[7])

[array([1284, 1.69392523364486, 0.130841121495327, 26], dtype=object), array([936, 1.34722222222222, 0.0908119658119658, 16], dtype=object), array([2737, 1.77603215199123, 0.0968213372305444, 19], dtype=object)]


#### Timestamp, mapping basin names, `WATER_PER_FOOT` NaN values
### Experiments:
- Set the timestamp to a boolean flag: completion date on or after 2014.
- Removed entries that last more than 3 years (planned: also try training on the data after three years).
- Dropped the completion date because all completions are after 1/1/2014, so there is no difference in drilling techniques.
- To do: try normalizing the feature data to the range 0 to 1.

In [92]:
# Preview the Harmony well attributes instead of dumping the whole table.
harmony_data.head(10)

Unnamed: 0,API,LATERAL_LENGTH_BLEND,PROP_PER_FOOT,WATER_PER_FOOT,GOR_30,GOR_60,GOR_90
0,05123371740000,9056,1025.86800,790.720500,2300.919963,2290.231375,2171.635752
1,05123377360000,4101,,,3786.669639,3838.096579,4012.142621
2,05123378210000,4161,916.08010,839.928000,2399.490505,2291.864862,2472.275585
3,05123378650100,4129,831.15310,,3907.549121,4122.576934,4553.377380
4,05123380130000,4539,1032.23300,755.134300,1758.110773,1323.960532,1182.969280
5,05123331840000,4012,,,3247.303236,3476.586103,3580.817730
6,05123362810000,3880,,,1780.627782,1675.569468,1713.457431
7,05123378830000,5127,531.60080,821.062700,4896.290909,4854.557155,5284.324071
8,05123377810000,7225,661.58840,477.713800,5053.053374,5212.441012,5510.159831
9,05123375140000,4104,624.54440,997.516300,2970.478326,3008.072553,3049.710564
