In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [2]:
def load_data(file_location):
    """
    Read a CSV dataset and split its columns into a feature matrix and a target.

    Parameters:
    -----------
    file_location : str
        Location of the CSV file; passed straight to `pd.read_csv`.

    Returns:
    --------
    features : pandas.DataFrame
        Every column between the first and the last (both excluded).

    target : pandas.Series
        The last column of the file.

    Notes:
    ------
    - Columns are selected by position, not by name: the first column is
      dropped and the last column becomes the target.
    - The file needs at least two columns for the result to be meaningful.
    - Relies on pandas being available as `pd`.
    """
    dataset = pd.read_csv(file_location)
    # Positional slicing: skip column 0, stop before the final (target) column.
    return dataset.iloc[:, 1:-1], dataset.iloc[:, -1]

In [3]:
def split_data(features, target, state, size=0.25):
    """
    Partition features and target into a training part and a validation part.

    Parameters:
    -----------
    features : pandas.DataFrame
        Independent variables.

    target : pandas.Series
        Dependent variable, aligned row-for-row with `features`.

    state : int or numpy.random.RandomState
        Forwarded unchanged as `random_state` to `train_test_split`, which
        controls the shuffling and makes the split repeatable.

    size : float, optional (default=0.25)
        Forwarded unchanged as `test_size`: the share of rows placed in the
        validation part.

    Returns:
    --------
    features_train : pandas.DataFrame
        Features of the training part.

    features_valid : pandas.DataFrame
        Features of the validation part.

    target_train : pandas.Series
        Target values of the training part.

    target_valid : pandas.Series
        Target values of the validation part.

    Notes:
    ------
    - Thin wrapper around `sklearn.model_selection.train_test_split`, which
      must already be imported.
    - The four pieces are returned as a tuple, in the order listed above.
    """
    pieces = train_test_split(
        features,
        target,
        test_size=size,
        random_state=state,
    )
    # train_test_split hands back a list; callers of this function get a tuple.
    return tuple(pieces)

In [4]:
def model(features_train, features_valid, target_train, target_valid):
    """
    Fit a Linear Regression on the training split and score it on the validation split.

    Parameters:
    -----------
    features_train : pandas.DataFrame
        Features used to fit the regression.

    features_valid : pandas.DataFrame
        Features for which predictions are produced.

    target_train : pandas.Series
        Target values used to fit the regression.

    target_valid : pandas.Series
        True target values the predictions are compared against.

    Returns:
    --------
    predictions : numpy.ndarray
        Predicted target values for `features_valid`.

    Prints:
    -------
    - Mean of the predicted values ("Mean reserve volume predicted").
    - Root Mean Squared Error (RMSE) between `target_valid` and the predictions.

    Notes:
    ------
    - Needs `LinearRegression` (sklearn.linear_model) and
      `root_mean_squared_error` (sklearn.metrics) to be imported.
    - The fitted estimator is not returned; only its predictions are.
    """
    # Named `regressor` so the local does not shadow this function's own name.
    regressor = LinearRegression()
    regressor.fit(features_train, target_train)
    predictions = regressor.predict(features_valid)
    print(f"Mean reserve volume predicted: {predictions.mean()}")
    rmse = root_mean_squared_error(target_valid, predictions)
    # Label fixed: the metric is RMSE (the original printed the typo "RSME").
    print(f"RMSE: {rmse}")
    return predictions

In [5]:
def process(file_location):
    """
    Run the whole pipeline for one dataset: load, split, fit, predict.

    Parameters:
    -----------
    file_location : str
        Location of the CSV file, handed to `load_data()`.

    Returns:
    --------
    predictions : numpy.ndarray
        Whatever `model()` returns: the predictions for the validation split.

    target_valid : pandas.Series
        The true target values of the validation split.

    Notes:
    ------
    - Steps, in order: `load_data()` -> `split_data()` -> `model()`; all three
      must be defined before this function is called.
    - A fresh `np.random.RandomState(42)` is created on every call and used as
      the split's random state, so repeated calls split the same way.
    - `split_data()` is called with its default validation size.
    - `model()` prints its metrics as a side effect.
    - Needs `numpy` imported as `np`.
    """
    features, target = load_data(file_location)
    rng = np.random.RandomState(42)
    train_x, valid_x, train_y, valid_y = split_data(features, target, rng)
    return model(train_x, valid_x, train_y, valid_y), valid_y

In [6]:
# Region 0: run the full pipeline and keep the validation predictions and true targets.
predictions_0, target_valid_0 = process("datasets/geo_data_0.csv")

Mean reserve volume predicted: 92.39879990657768
RSME: 37.75660035026169


In [7]:
# Region 1: run the full pipeline and keep the validation predictions and true targets.
predictions_1, target_valid_1 = process("datasets/geo_data_1.csv")

Mean reserve volume predicted: 68.71287803913762
RSME: 0.8902801001028854


In [8]:
# Region 2: run the full pipeline and keep the validation predictions and true targets.
predictions_2, target_valid_2 = process("datasets/geo_data_2.csv")

Mean reserve volume predicted: 94.77102387765939
RSME: 40.145872311342174


In [9]:
# Inputs of the break-even calculation below.
investment = 100_000_000  # total amount invested, spread over all selected wells
gain_per_unit = 4_500     # gain obtained from one unit of oil volume
n_oil_wells = 200         # number of wells the investment is divided among

In [11]:
# Break-even volume per well: investment per well divided by the gain per unit.
print(
    "Average volume of oil per oil well to avoid losses: "
    f"{investment / n_oil_wells / gain_per_unit:.2f}"
)

Average volume of oil per oil well to avoid losses: 111.11


- La región 0 tiene en promedio unas 19 unidades menos de las necesarias para evitar pérdidas (92.40 predichas frente a 111.11 requeridas). Sin embargo, el RMSE es de casi 38 unidades y el promedio considera todos los pozos y no sólo los mejores 200, por lo que hay una alta posibilidad de que sea una buena región.
- La región 1 tiene en promedio 42 unidades menos de las necesarias para evitar pérdidas. Aunque se tomen los mejores 200 pozos, el error es mínimo en esta región (RMSE ≈ 0.89) y el déficit es bastante grande, por lo que tal vez no sea tan buena idea invertir aquí.
- La región 2 es muy similar a la 0, con un poco más de unidades en promedio pero con un poco más de error. Ambas regiones, 0 y 2, son potencialmente buenas.