In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

In [2]:
import requests
import random
import xlrd
import csv
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
import matplotlib as mpl
import matplotlib.style
import seaborn as sns  
import matplotlib.pyplot as plt

In [4]:
from scipy.optimize import curve_fit
from sklearn.metrics import mean_squared_error
from math import sqrt
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARIMA

In [5]:
# start from matplotlib's stock rcParams so earlier tweaks do not leak in
plt.rcParams.update(plt.rcParamsDefault)

# render floats with two decimal places in pandas output
pd.options.display.float_format = "{:.2f}".format


In [6]:

# matplotlib settings: reset to the library defaults, then apply the
# notebook-wide overrides (large figures, no background grid) in one call
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams.update({
    "figure.figsize": (12, 8),
    "axes.grid": False,
})

In [7]:
# one seed shared by every random number source used in the notebook
seed_value = 42

# seed NumPy's legacy global generator and the stdlib generator
np.random.seed(seed_value)
random.seed(seed_value)

# NOTE(review): exported for hash-seed reproducibility; this presumably only
# takes effect in interpreters started after this point — confirm
os.environ["PYTHONHASHSEED"] = f"{seed_value}"

In [8]:
# setting the destination for the data folder
# The original local folder remains the default, so existing runs are
# unchanged; set the NDIC_DATA_DIR environment variable to relocate the data
# without editing the notebook.
# os.getcwd() is no longer joined in front: the default is already an absolute
# Windows path, so the join either discarded the working directory (Windows)
# or produced a bogus nested path (other platforms).
path = os.environ.get(
    "NDIC_DATA_DIR",
    "C:/Users/ASUS/Petroleum-Production-Engineering/src/data",
)
# same location with separators normalised for the current platform
norm_path = os.path.normpath(path)

In [9]:
# defining a function to scrape NDIC data
# https://www.dmr.nd.gov/oilgas/
# data from May 2015 to December 2018 will be used as a training dataset
# data from 2019 will be used as a test dataset

In [None]:
# function to scrape data from NDIC
def scrape_ndic(months_list):
    '''Download monthly NDIC production workbooks and combine them.

    For every period in ``months_list`` the workbook
    ``<website><period>.xlsx`` is downloaded, the first column of each data
    row is converted from an Excel serial date to an ``"MM/YYYY"`` string,
    the sheet is written to ``<path>/<period>.csv`` (``path`` is the
    module-level data folder) and that CSV is read back with pandas.

    Parameters
    ----------
    months_list : iterable of str
        Period identifiers used to build the download URL, e.g. ``"2015_05"``.

    Returns
    -------
    pandas.DataFrame
        Rows of all requested months stacked in request order with a fresh
        0..n-1 index; an empty DataFrame when ``months_list`` is empty.

    Raises
    ------
    requests.HTTPError
        If a download answers with an error status.

    NOTE(review): xlrd 2.x reportedly reads only legacy ``.xls`` files, so
    this variant presumably needs xlrd < 2.0 for ``.xlsx`` — confirm.
    '''
    # link to website with production data
    website = "https://www.dmr.nd.gov/oilgas/mpr/"
    monthly_frames = []
    # loop through all of the dates in the list
    for period in months_list:
        url = website + period + ".xlsx"
        # bounded wait + explicit status check so a bad download fails loudly
        req = requests.get(url, timeout=60)
        req.raise_for_status()
        book = xlrd.open_workbook(file_contents=req.content)
        sheet = book.sheet_by_index(0)
        # work on our own copy of the rows instead of patching sheet internals
        rows = [list(sheet.row_values(rownum)) for rownum in range(sheet.nrows)]
        # row 0 is the header; every other row starts with an Excel serial date
        for row in rows[1:]:
            year, month = xlrd.xldate_as_tuple(row[0], book.datemode)[:2]
            row[0] = datetime(year, month, 1).strftime("%m/%Y")
        new_file = os.path.join(path, period + ".csv")
        # context manager closes the file even if writing fails
        with open(new_file, "w", newline="") as csv_file:
            csv.writer(csv_file).writerows(rows)
        # keep every month; the frames are stacked once after the loop
        monthly_frames.append(pd.read_csv(new_file))
    if not monthly_frames:
        return pd.DataFrame()
    # dataframe with entire monthly production
    return pd.concat(monthly_frames, ignore_index=True)


In [10]:
from openpyxl import load_workbook
from io import BytesIO

In [11]:
def scrape_ndic(months_list, path="."):
    """
    Scrape monthly NDIC production data and return a combined DataFrame.

    For every period in ``months_list`` the workbook
    ``<website><period>.xlsx`` is downloaded and its active sheet is read
    with the first row as column headers. Date cells in the first column are
    rewritten as ``"MM/YYYY"`` strings, and the month is saved as
    ``<path>/<period>.csv``.

    Parameters
    ----------
    months_list : iterable of str
        Period identifiers used to build the download URL, e.g. ``"2015_05"``.
    path : str, default "."
        Folder that receives one CSV per period; created if it is missing.

    Returns
    -------
    pandas.DataFrame
        All months stacked in request order with a fresh 0..n-1 index; an
        empty DataFrame when ``months_list`` is empty.

    Raises
    ------
    requests.HTTPError
        If a download answers with an error status.
    ValueError
        If a downloaded worksheet has no rows at all (not even a header).
    """
    website = "https://www.dmr.nd.gov/oilgas/mpr/"
    os.makedirs(path, exist_ok=True)
    all_dfs = []

    for period in months_list:
        # Download XLSX file; the timeout keeps a stalled server from hanging the cell
        url = website + period + ".xlsx"
        req = requests.get(url, timeout=60)
        req.raise_for_status()

        # Load workbook from memory; always release it once the rows are copied
        book = load_workbook(filename=BytesIO(req.content), read_only=True)
        try:
            rows = [list(row) for row in book.active.iter_rows(values_only=True)]
        finally:
            book.close()

        if not rows:
            raise ValueError(f"NDIC workbook for period {period!r} contains no rows")

        # first row = headers, the rest are data records
        header, records = rows[0], rows[1:]

        # Ensure first column is Month/Year (if it is a date in Excel).
        # Converting the raw cells before building the frame avoids writing
        # strings into an already datetime-typed column, and leaves blank or
        # non-date cells untouched.
        for record in records:
            if record and isinstance(record[0], datetime):
                record[0] = record[0].strftime("%m/%Y")

        df = pd.DataFrame(records, columns=header)

        # Save to CSV in the folder requested by the caller
        new_file = os.path.join(path, f"{period}.csv")
        df.to_csv(new_file, index=False)

        all_dfs.append(df)

    # Nothing requested: pd.concat would raise on an empty list
    if not all_dfs:
        return pd.DataFrame()

    # Concatenate all months into one DataFrame
    final_df = pd.concat(all_dfs, ignore_index=True)

    return final_df

In [12]:
# Periods to download for the training set; each entry is passed to
# scrape_ndic, which appends ".xlsx" to it to form the download URL.
# NOTE(review): the earlier notebook comment says the training data spans
# May 2015 to December 2018, but only May 2015 is listed here — confirm
# whether the remaining months were left out intentionally.
train_list = ["2015_05"]

In [13]:
# download the training months, then parse the report date column so it can
# be used as a real datetime downstream
train_prod_data = scrape_ndic(train_list).assign(
    ReportDate=lambda frame: pd.to_datetime(frame["ReportDate"])
)