In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import sklearn
from sklearn import svm, preprocessing
from sklearn.impute import SimpleImputer
import h2o
from h2o.automl import H2OAutoML

### Objective 

Because we consider the revenue a continuous variable, we call this a _regression_ problem. It is common to evaluate regression problems according to some deviation measure of the error (difference) between the predictions and the ground truth values. Typical choices are Mean Squared Error (MSE) and its square root, the Root Mean Squared Error (RMSE).

However, both of these measures are quite sensitive to extreme values and work best if the typical scale of prediction errors is consistent across the dataset. This is not likely to be the case here because the revenue variable varies a lot. This means that a prediction error of, say, 10% would matter a lot more if it is for one of the higher-earning stores than for one of the lower-earning ones. Consequently, we will use a variation that takes a log transform of the target variable before computing prediction errors.

**TL;DR**: submissions for this problem will be evaluated according to the `Root Mean Squared Log Error` (RMSLE):

- $\text{RMSLE}(y, \hat{y}) = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (\log(1 + \hat{y}_i) - \log(1 + y_i))^2}$

In the equation above, $y_i$ corresponds to the ground truth value for datapoint $i$, $\hat{y}_i$ corresponds to the predicted value for datapoint $i$, and $n$ denotes the total number of datapoints (dimensionality of $y$, $\hat{y}$). See the cell below for an implementation.



In [23]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error 
    
    Args:
        y_true (array-like): n-dimensional vector of ground-truth values 
        y_pred (array-like): n-dimensional vector of predicted values 
    
    Returns:
        A scalar float with the rmsle value 
    
    Raises:
        AssertionError: if either input contains negative values or the
            two inputs have different shapes 
    
    Note: You can alternatively use sklearn and just do: 
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    # Coerce to plain ndarrays so that lists/tuples are accepted and pandas
    # Series are compared position by position instead of being aligned on
    # their index labels (label alignment would silently introduce NaNs).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    # log1p(x) == log(1 + x); the +1 keeps zero-valued entries finite
    log_error = np.log1p(y_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(np.square(log_error)))

In [39]:
# Single place to change when the data lives somewhere else on another machine.
# NOTE: the test CSV is read from this same `train_data` directory.
DATA_DIR = 'C:/Users/aminp/OneDrive/Dokumenter/NTNU/4år/Machinelearning/machine_learning/data/train_data'

# Two alternative training sets
df1 = pd.read_csv(f'{DATA_DIR}/dataset_train_1.csv')
df2 = pd.read_csv(f'{DATA_DIR}/dataset_train_2.csv')

# Train / test sets from the "without_rev_features" files
df_train = pd.read_csv(f'{DATA_DIR}/dataset_train_without_rev_features.csv')
df_test = pd.read_csv(f'{DATA_DIR}/dataset_test_without_rev_features.csv')



In [40]:
# Remove the "Unnamed: 0" column from every loaded frame
# (presumably a row index written out when the CSVs were saved — TODO confirm).
x_train1 = df1.drop(columns=["Unnamed: 0"])
x_train2 = df2.drop(columns=["Unnamed: 0"])
x_train = df_train.drop(columns=["Unnamed: 0"])
x_test = df_test.drop(columns=["Unnamed: 0"])

# Test frame keyed by store id, so the ids travel with the rows.
x_test1 = x_test.set_index("store_id")

# One-column frame listing the test store ids, displayed as the cell output.
new_df = pd.DataFrame({"id": x_test1.index})
new_df

Unnamed: 0,id
0,914206820-914239427-717245
1,916789157-916823770-824309
2,913341082-977479363-2948
3,889682582-889697172-28720
4,997991699-998006945-417222
...,...
8572,917323003-917383529-844309
8573,917353379-917411824-845904
8574,917072302-917089248-833647
8575,916960557-916993161-829908


# Exploratory data analysis (EDA)

In [35]:
# Disabled EDA cell: the plotting code below is wrapped in a string literal, so it
# is never executed; the cell only evaluates to (and echoes) that string.
# The wrapped code would plot the per-column NaN fraction of `x_train2` and a
# histogram of its `revenue` column.
""" 
fig, (ax1, ax2) = plt.subplots(figsize=(12, 3), ncols=2)
x_train2.isna().mean().plot.bar(ax=ax1)
ax1.set_title('Fraction of rows with NaN values')
x_train2.revenue.plot.hist(bins=100, ax=ax2)
ax2.set_title('Distribution of Revenues');
"""

" \nfig, (ax1, ax2) = plt.subplots(figsize=(12, 3), ncols=2)\nx_train2.isna().mean().plot.bar(ax=ax1)\nax1.set_title('Fraction of rows with NaN values')\nx_train2.revenue.plot.hist(bins=100, ax=ax2)\nax2.set_title('Distribution of Revenues');\n"

In [34]:
def svm_model(X, test):
    """
    Trains a linear SVR on `X` and writes revenue predictions for `test`.

    Args:
        X (pd.DataFrame): training data holding the feature columns plus the
            `store_id` and `revenue` columns
        test (pd.DataFrame): test data whose index holds the ids to report;
            it must contain every feature column present in `X`

    Returns:
        pd.DataFrame with the columns `id` (taken from `test.index`) and
        `predicted`

    Raises:
        KeyError: if `test` lacks one of the training feature columns

    Side effects:
        Writes the submission to `sample_submission.csv` in the working directory.
    """
    y_train = X.revenue
    train_features = X.drop(["store_id", "revenue"], axis=1)
    # Select the test columns by name so they match the training features in
    # both content and order. Test rows keep their original order so that the
    # predictions line up with `test.index` below.
    test_features = test[train_features.columns]

    # Fit the preprocessing on the training data only, then apply the very same
    # transformation to the test data. Scaling the two sets independently would
    # put them on different scales and make the fitted model meaningless on test.
    scaler = preprocessing.StandardScaler().fit(train_features)
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train_imp = imputer.fit_transform(scaler.transform(train_features))
    X_test_imp = imputer.transform(scaler.transform(test_features))

    clf = svm.SVR(kernel='linear')
    clf.fit(X_train_imp, y_train)

    # A linear model can predict below zero; clip so the predictions stay valid
    # for a log-based metric such as RMSLE, which requires non-negative values.
    y_test_pred = np.clip(clf.predict(X_test_imp), 0, None)

    # Generate submission dataframe 
    # NOTE: It is important that the ID and predicted values match
    submission = pd.DataFrame()
    submission['id'] = test.index
    submission['predicted'] = np.asarray(y_test_pred)

    # Save it to disk (`index=False` means don't save the index in the csv)
    submission.to_csv('sample_submission.csv', index=False)
    return submission 
    
    
    


    







In [None]:
# Fit the SVR model on the training frame, predict for the store-id-indexed
# test frame, and show the resulting submission table as the cell output.
submission = svm_model(X=x_train, test=x_test1)
submission

In [3]:
# Connect to an H2O instance at localhost:54323; per the recorded output, h2o
# attempts to start a local server when none is found at that address.
# NOTE(review): the recorded run ended in `H2OServerError` (server did not start
# within ~60 s) — check the local Java runtime / port before relying on this cell.
h2o.init(ip = "localhost", port = 54323) 

Checking whether there is an H2O instance running at http://localhost:54323 ..... not found.
Attempting to start a local H2O server...
; Eclipse OpenJ9 VM AdoptOpenJDK (build openj9-0.17.0, JRE 13 Windows 10 amd64-64-Bit 20191031_96 (JIT enabled, AOT enabled); OpenJ9   - 77c1cf708; OMR      - 20db4fbc; JCL      - c973c65658 based on jdk-13.0.1+9)
  Starting server from C:\Users\aminp\AppData\Local\Programs\Python\Python39\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\aminp\AppData\Local\Temp\tmpgiao1jmx
  JVM stdout: C:\Users\aminp\AppData\Local\Temp\tmpgiao1jmx\h2o_aminp_started_from_python.out
  JVM stderr: C:\Users\aminp\AppData\Local\Temp\tmpgiao1jmx\h2o_aminp_started_from_python.err


H2OServerError: Server wasn't able to start in 60.045552 seconds.