## Import necessary tools

In [1]:
# !pip install --upgrade lightgbm
# !pip install pycodestyle flake8 pycodestyle_magic
# !conda install py-xgboost
%load_ext pycodestyle_magic

In [2]:
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import warnings
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import pickle

%matplotlib inline
warnings.filterwarnings("ignore")


The first thing I did was to read the data into a pandas dataframe

In [3]:
%%pycodestyle
data = pd.read_csv('../data_root/raw/wine_dataset.csv')


3:1: W391 blank line at end of file


In [4]:
%%pycodestyle
data.info()


3:1: W391 blank line at end of file


## My analysis will be done based on five aspects:

1. Exploratory Data Analysis (EDA)
2. Data Visualization
3. Feature Engineering
4. Model Building and Evaluation
5. Results and Conclusion

### Exploratory Data Analysis

Let's get a first look at our data

In [6]:
%%pycodestyle
data.head()


3:1: W391 blank line at end of file


We can see from the sample of our data above. We note that our data is mostly text, contains quite a number of missing values, has only 2 numeric columns and we are to predict one of them (points)

There are usually two kinds of bad data: duplicate and misssing.
I decided to check for duplicate data and delete them if they exist.
My assumption: If the data contains the same name, title and description, then it is the same data

In [7]:
%%pycodestyle
print("The total number of data samples: ", data.shape[0])
print("Duplicate data ", data[data.duplicated([
    'taster_name', 'title', 'description']
)].shape[0])


6:1: W391 blank line at end of file


we can see that there are 58 duplicate samples in our dataset so we have to drop them

In [8]:
%%pycodestyle
data = data.drop_duplicates(['taster_name', 'title', 'description'])


3:1: W391 blank line at end of file


Now we would need to take care of missing data, let's first check how many they are

In [9]:
%%pycodestyle
data.isnull().sum()


3:1: W391 blank line at end of file


That's quite a lot of missing data and we are going to have to take care of them

In [10]:
%%pycodestyle
# if there are any infinity values in the data,
# replace with NaN
data = data.replace([np.inf, -np.inf], np.nan)


5:1: W391 blank line at end of file


In [None]:
data.isnull().sum()


### Data Visualization

In [None]:
sns.countplot(data['points'])


We can see that all wine points are between 80 - 100 and most wine points are 88. Now lets see how price affects the points

In [12]:
%%pycodestyle
plt.figure(figsize=(10, 4))

g = sns.regplot(x='points', y='price', data=data, fit_reg=True)
g.set_title("Points x Price Distribuition", fontsize=20)
g.set_xlabel("Points", fontsize=15)
g.set_ylabel("Price", fontsize=15)

plt.show()


10:1: W391 blank line at end of file


As we can see from the above, the higher the price, the higher the probability of getting a high point is, and this seems quite logical

Now let us see how country affects price.

In [None]:
data = data[['price', 'points', 'country', 'province']].copy()


In [None]:
data.head()


**We have to be careful while generating the features in order to avoid data leakage**

In [21]:
%%pycodestyle
def handle_missing_values(data):
    data = data.fillna(data.mean())  # fill missing values with the mean
    # the rows which have country and province empty
    # can be done away with since they are only 6
    data = data.dropna()
    return data


def data_trans(df, place, obj, stat):
    return df.groupby(place)[obj].transform(stat)


def data_diff(df, col1, col2):
    return df[col1] - df[col2]


def generate_price_features(df):
    country_group = df.groupby('country')
    province_group = df.groupby('province')
#     df['price_per_country_mean'] = country_group['price'].transform('mean')
    df['price_per_country_mean'] = data_trans(
                                    df, 'country', 'price', 'mean'
                                    )
    df['price_per_country_mean_diff'] = data_diff(
                                            df, 'price', 'price_per_country_mean'
                                        )
#     df['price_per_country_mean_diff'] = df['price'] - df['price_per_country_mean']
    df['price_per_country_median'] = country_group['price'].transform('median')
    df['price_per_country_median_diff'] = df['price'] - df['price_per_country_median']
    df['price_per_province_mean'] = province_group['price'].transform('mean')
    df['price_per_province_mean_diff'] = df['price'] - df['price_per_province_mean']
    df['price_per_province_median'] = country_group['price'].transform('median')
    df['price_per_province_median_diff'] = df['price'] - df['price_per_province_median']
    
    return df


def generate_point_features(df):
    country_group = df.groupby('country')
    province_group = df.groupby('province')
    df['points_per_country_mean'] = country_group['points'].transform('mean')
    df['points_per_country_mean_diff'] = df['points'] - df['points_per_country_mean']
    df['points_per_country_median'] = country_group['points'].transform('median')
    df['points_per_country_median_diff'] = df['points'] - df['points_per_country_median']
    df['points_per_province_mean'] = province_group['points'].transform('mean')
    df['points_per_province_mean_diff'] = df['points'] - df['points_per_province_mean']
    df['points_per_province_median'] = country_group['points'].transform('median')
    df['points_per_province_median_diff'] = df['points'] - df['points_per_province_median']
    return df


13:1: E302 expected 2 blank lines, found 1
16:1: E302 expected 2 blank lines, found 1
24:80: E501 line too long (81 > 79 characters)
26:80: E501 line too long (84 > 79 characters)
28:80: E501 line too long (86 > 79 characters)
30:80: E501 line too long (84 > 79 characters)
31:80: E501 line too long (80 > 79 characters)
32:80: E501 line too long (88 > 79 characters)
33:1: W293 blank line contains whitespace
41:80: E501 line too long (85 > 79 characters)
42:80: E501 line too long (81 > 79 characters)
43:80: E501 line too long (89 > 79 characters)
45:80: E501 line too long (87 > 79 characters)
46:80: E501 line too long (82 > 79 characters)
47:80: E501 line too long (91 > 79 characters)
49:1: W391 blank line at end of file


In [16]:
%%pycodestyle
data = handle_missing_values(data)
data = generate_price_features(data)
data = generate_point_features(data)
data.isnull().sum()


6:1: W391 blank line at end of file


In [18]:
%%pycodestyle
data.head()


3:1: W391 blank line at end of file


#### Building a model to select the most important features; It is not the main model for prediction

select the generated features

In [19]:
%%pycodestyle
train_features = [
    'price',
    'price_per_country_mean',
    'price_per_country_mean_diff',
    'price_per_country_median',
    'price_per_country_median_diff',
    'price_per_province_mean',
    'price_per_province_mean_diff',
    'price_per_province_median',
    'price_per_province_median_diff',
    'points_per_country_mean',
    'points_per_country_median',
    'points_per_province_mean',
    'points_per_province_median',
]

target_feature = 'points'


19:1: W391 blank line at end of file


In [20]:
%%pycodestyle
df_train = data[train_features].copy().values
target = data[target_feature].copy().values


4:1: W391 blank line at end of file


### Split the dataset into train and test data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train, target, test_size=0.3)


## Time to build the model

I first build an xgb model for the dataset and test it on validation part using KFold

In [None]:
kf = KFold(n_splits=10, shuffle=True)
for train_index, test_index in kf.split(X_train):
    X_, X_valid = X_train[train_index], X_train[test_index]
    y_, y_valid = y_train[train_index], y_train[test_index]
    sc = StandardScaler()
    X_ = sc.fit_transform(X_)
    X_valid = sc.transform(X_valid)
    std_mean = sc.mean_ 
    std_var = sc.var_
    xgb_model = xgb.XGBRegressor(
                    n_estimators=1000,
                    max_depth=20,
                    importance_type="gain",
                    learning_rate=0.01,
                    n_jobs=4
                )
    xgb_model.fit(X_, y_,
                  early_stopping_rounds=5,
                  eval_set=[(X_valid, y_valid)],
                  eval_metric="rmse",
                  verbose=True)


Now we save the model to a pickle file

In [None]:
pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(xgb_model, file)
print('saved')


### Evaluating the model

we first read the model file then begin evaluation

In [None]:
with open("pickle_model.pkl", 'rb') as file:
    recovered_lgb_model = pickle.load(file)


#### Predicting the values of the test set

In [None]:
X_test_std = (X_test - std_mean) / (std_var ** 0.5)
predictions = recovered_lgb_model.predict(X_test)


In [None]:
print(predictions)

In [None]:
error_ = mse(predictions, y_test)
print(error_)


### our mean square error gives us an error of 11.16

We check for feature importance to know which parameters were very important in creating the model

In [None]:
feature_importance = xgb_model.feature_importances_


In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x=feature_importance, y=train_features)


#### We can see that important features that are used to generate the predictions so we select them

In [None]:
important_features = [
    'price',
    'price_per_country_mean',
    'price_per_country_mean_diff',
    'price_per_country_median',
    'price_per_country_median_diff',
    'price_per_province_mean',
    'price_per_province_mean_diff',
    'points_per_country_mean',
    'points_per_country_median',
    'points_per_province_mean'
]


**we would save these features to ensure persistence
so for every new data we get, we would use these features to generate the features**