In [57]:
import pandas as pd
import numpy as np
from datetime import timedelta
import random
from sklearn.neural_network import MLPRegressor

In [58]:
prod_data_train = pd.read_csv('production_data_train.csv')
prod_data_test = pd.read_csv('production_data_test.csv')
ihs_data = pd.read_csv('IHS_data.csv')
harmony_data = pd.read_csv('Harmony_data.csv')
test_apis = pd.read_csv('test_APIs.csv', header=None)
test_apis.columns = ['API']
test_apis['API'] = (test_apis['API']).astype(str)
test_apis['API'] = test_apis['API'].apply(lambda x: x.zfill(14))
sample_file = pd.read_csv('sample_file.csv')    

In [59]:
def preprocess(prod_data, wells_data, train):
    prod_data = prod_data.drop_duplicates(subset=['API', 'Month', 'Year'], keep='last', inplace=False)
    
    # adding zeros to API
    prod_data['API'] = (prod_data['API']).astype(str)
    prod_data['API'] = prod_data['API'].apply(lambda x: x.zfill(14))
    
    # adding zeros to API
    wells_data['API'] = (wells_data['API']).astype(str)
    wells_data['API'] = wells_data['API'].apply(lambda x: x.zfill(14))
    
    # storing peak records 
    idx_max = prod_data.groupby(['API'])['Liquid'].transform('max') == prod_data['Liquid']
    max_month_prod_data = prod_data[idx_max].drop_duplicates(subset='API', keep='first', inplace=False)
    list_indices = ['API', 'Year', 'Month', 'Liquid']
    max_month_prod_data = max_month_prod_data[list_indices]
    max_month_prod_data = max_month_prod_data.rename(columns={"Year": "Max_Year", "Month": "Max_Month", "Liquid": "Max_Liquid"})
    
    # merging the two dataframes to get max month and max year
    new_prod_data_orig = prod_data.merge(max_month_prod_data, on='API')
#     print(new_prod_data_orig['API'].drop_duplicates())
    # Remove Pre-Peak Months (clean up)
    new_prod_data = new_prod_data_orig[((new_prod_data_orig['Year'] == new_prod_data_orig['Max_Year']))]
    new_prod_data = new_prod_data[(new_prod_data['Month'] >= new_prod_data['Max_Month'])]

    new_prod_data2 = new_prod_data_orig[((new_prod_data_orig['Year'] > new_prod_data_orig['Max_Year']))]
    new_prod_data3 = new_prod_data.append(new_prod_data2)
    # adding month index column to post peak production data
    new_prod_data3['index'] = calc_month_index(new_prod_data3['Max_Year'], new_prod_data3['Max_Month'], new_prod_data3['Year'], new_prod_data3['Month'])
    indexed_prod_data = new_prod_data3
    
    # removed nullified SpudDates and CompletionDates
    wells_data = wells_data[~((wells_data['SpudDate'].isnull()) & (wells_data['CompletionDate'].isnull()))]
    
    # replacing null CompletionDates with SpudDates + six months
    wells_data['SpudDate'] = pd.to_datetime(wells_data['SpudDate'])
    wells_data['CompletionDate'] = pd.to_datetime(wells_data['CompletionDate'])
    wells_data.loc[wells_data['CompletionDate'].isnull(), 'CompletionDate'] = wells_data['SpudDate'] + timedelta(days=170) 
    
    # replacing StateNames with indices
    unique_state_names = wells_data.StateName.unique()
    unique_state_ids = list(range(0, len(unique_state_names)))
    dict_state_names = dict(zip( unique_state_names, unique_state_ids))
    wells_data['StateName'] = wells_data['StateName'].map(dict_state_names)
                            
    # replacing CountyNames with indices                         
    unique_county_names = wells_data.CountyName.unique()
    unique_county_ids = list(range(0, len(unique_county_names)))
    dict_county_names = dict(zip(unique_county_names, unique_county_ids))
    wells_data['CountyName'] = wells_data['CountyName'].map(dict_county_names)
    
    # replacing BasinName with indices
    unique_basin_names = wells_data.BasinName.unique()
    unique_basin_ids = list(range(0, len(unique_basin_names)))
    dict_basin_names = dict(zip(unique_basin_names, unique_basin_ids))
    wells_data['BasinName'] = wells_data['BasinName'].map(dict_basin_names)
    
    #replacing Formation with indices
    unique_formation_names = wells_data.formation.unique()
    unique_formation_ids = list(range(0, len(unique_formation_names)))
    dict_formation_names = dict(zip(unique_formation_names, unique_formation_ids))
    wells_data['formation'] = wells_data['formation'].map(dict_formation_names)
        
#     wells_data[wells_data['CompletionDate'] >= pd.Timestamp(2014, 1 , 1)]['CompletionDate'] = 1  
#     wells_data[wells_data['CompletionDate'] !=  1]['CompletionDate'] = 0 
#     print(wells_data[wells_data['CompletionDate'] < pd.Timestamp(2014, 1 , 1)])

    indexed_prod_data = indexed_prod_data.merge(wells_data, on='API')
    three_years_data = indexed_prod_data
    if(train):
        three_years_data = indexed_prod_data[(indexed_prod_data['Max_Year'] < 2016) |((indexed_prod_data['Max_Year'] == 2016) & (indexed_prod_data['Max_Month'] == 1))] 
        three_years_data = three_years_data[three_years_data['index'] <= 36] 
    
    return three_years_data

In [60]:
# calculates month index
def calc_month_index(max_year, max_month, year, month):
    return (12 - max_month + (year - max_year - 1)*12 + month) * (year != max_year) + (year == max_year) * (month - max_month) + 1 

In [89]:
processed_train = preprocess(prod_data_train, ihs_data, True)
processed_test = preprocess(prod_data_test, ihs_data, False)

harmony_data['API'] = (harmony_data['API']).astype(str)
harmony_data['API'] = harmony_data['API'].apply(lambda x: x.zfill(14))

harmony_data[harmony_data['WATER_PER_FOOT'].isnull()] = 0
harmony_data[harmony_data['PROP_PER_FOOT'].isnull()] = 0



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [90]:
complete_prod_train = processed_train.merge(harmony_data, on='API')
complete_prod_test = processed_test.merge(harmony_data, on='API', how='outer')
complete_prod_test.fillna(0, inplace=True)
complete_prod_test = complete_prod_test.merge(test_apis, on='API')
# removing production of month index greater that 3
# complete_prod_test = complete_prod_test[complete_prod_test['index'] <= 3]

# replacing gor nans with zeros
complete_prod_train.fillna(0, inplace=True)

In [91]:
cols_train = ['Gas', 'Water', 'Max_Year','operatorNameIHS', 'CompletionDate', 'FirstProductionDate', 'Max_Liquid', 
              'Max_Month', 'SpudDate', 'PermitDate', '_LastUpdate','BasinName', 'StateName', 'CountyName', 'DaysOn']
cols_test = ['Gas', 'date', 'Water', 'Max_Year','operatorNameIHS', 'CompletionDate', 'FirstProductionDate', 'Max_Liquid',
             'Max_Month', 'SpudDate', 'PermitDate', '_LastUpdate', 'BasinName', 'StateName','CountyName', 'DaysOn']
complete_prod_train = complete_prod_train.drop(cols_train, axis=1)
complete_prod_test = complete_prod_test.drop(cols_test, axis=1)

In [92]:
len(complete_prod_test['API'].unique())

3331

In [93]:
# input first three months and y label is cumulative
# predict missing entries in test data by averaging
# in train disregard qthings with less than 36
testing_apis = []
def get_three_months(prod_data, feat_arr, y_label, test):
    prod_data = (prod_data.groupby('API').apply(generate_input, feat_arr, y_label, test))
    
def generate_input(group, feat_arr, y_label, test):
    global testing_apis
    y_label.append(group['Liquid'].sum())
    testing_apis.append(group['API'].max())
    if len(group) < 2:
        print("API", group['API'])
        
    group = group.drop(['API'], axis=1)
    records = group[:3]
        
    if len(records) == 3 and not test:
        feat_arr.append(list(np.array(records).flatten()))
        
    elif test and len(records) < 3:
        idx_1 = records[records['index'] == 1]
        idx_2 = records[records['index'] == 2]
        idx_3 = records[records['index'] == 3]
                
#         if idx_1['Liquid'].isnan():
#             # case 1, idx 1 missing
#             missing_rec = idx_2
#             missing_rec['Liquid'] = idx_2['Liquid'] + (idx_2['Liquid'] - idx_3['Liquid']).abs()
#             missing_rec['index'] = 1
#             arr = np.vstack((missing_rec, idx_2, idx_3))
#             feat_arr.append(list(arr.flatten()))
        
#         if idx_2['Liquid'].isnan():
#             # case 2, idx 2 missing
        missing_rec = idx_1
        missing_rec['Liquid'] = 4 #(idx_1['Liquid'] + idx_3['Liquid'])/2
        missing_rec['index'] = 2
        arr = np.vstack((idx_1, missing_rec, idx_3))
        feat_arr.append(list(arr.flatten()))

#         if idx_3['Liquid'].isnan():
#             # case 3, idx 3 missing
#             missing_rec = idx_1
#             missing_rec['Liquid'] = (idx_1['Liquid'] - idx_2['Liquid']).abs() + idx_2
#             missing_rec['index'] = 3
#             arr = np.vstack((idx_1, idx_2, missing_rec))
#             feat_arr.append(list(arr.flatten()))
        
    else:
        feat_arr.append(list(np.array(records).flatten()))
            

In [94]:
input_feats = []
y_labels = []
get_three_months(complete_prod_train, input_feats, y_labels, False)
valid_dataset = []


In [95]:
np.array(input_feats).shape

(7235, 51)

In [96]:
# prepare testing data
x_test = []
testing_apis = []
get_three_months(complete_prod_test, x_test, [], True)
testing_apis = testing_apis[1:]

In [108]:
regressor = MLPRegressor(hidden_layer_sizes=(50,50), activation='relu', solver='adam', 
                         alpha=0.0001, batch_size='auto',
                         learning_rate_init=0.001, power_t=0.5, max_iter=200, 
                         shuffle=True, random_state=None, tol=0.0001, verbose=False, 
                         warm_start=False, momentum=0.9, nesterovs_momentum=True, 
                         early_stopping=True, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, 
                         epsilon=1e-08)
regressor.fit(input_feats, y_labels)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [109]:
predictions = regressor.predict(x_test)

In [110]:
predictions = np.array(predictions)
predictions = predictions[1:]

In [111]:
data = []
for i in range(len(testing_apis)):
    data.append([testing_apis[i], predictions[i]])
output_df = pd.DataFrame(data, columns = ['Id', 'Predicted'])
output_df.to_csv('three_yrs_cum.csv',index=False)

In [112]:
len(test_apis)

3331