In [166]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import sys

In [None]:
# Load the wine reviews dataset from the CSV file kept under data/.
df_wine = pd.read_csv(
    "data/winemag-data-130k-v2.csv",
    encoding='utf8',
)

In [None]:
# Preview the first three rows of the loaded frame.
df_wine.head(n=3)

In [None]:
# Left disabled: 'points' is selected as the regression target (df_Y) further
# down in this notebook, so the column has to stay in the frame.
#df_wine = df_wine.drop('points', 1)

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
print(df_wine.dtypes)

In [None]:
# Count the missing (NaN/None) entries in every column.
df_wine.isna().sum()

In [None]:
# Drop the region_2 column: most of its records are NULL and region_1 already
# carries the regional information.
# The axis is named through `columns=` because the positional form
# `drop(label, 1)` was deprecated in pandas 1.3 and raises a TypeError from
# pandas 2.0 onwards.
df_wine = df_wine.drop(columns='region_2')

In [None]:
# Discard every row that still contains at least one missing value, then
# display the remaining (rows, columns) count.
df_wine = df_wine.dropna(axis=0, how='any')
df_wine.shape

In [None]:
# Summary statistics of the price column.
df_wine.price.describe()

# Convert categorical COUNTRY column to Numeric

In [None]:
# List the distinct country values that are about to be one-hot encoded.
df_wine.country.unique()

In [None]:
# One-hot encode the country attribute: one 0/1 indicator column per country
# value, each named 'from_<country>'.
df_country = df_wine['country'].str.get_dummies().add_prefix('from_')
# Append the indicator columns to the main frame and drop the original text
# column they replace.
df_wine = pd.concat([df_wine, df_country], axis=1).drop(columns='country')
df_wine.head(3)

In [None]:
# Print the dtype of every column currently in df_wine.
print(df_wine.dtypes)

In [None]:
# DEFINE X and Y VARIABLES
# Features: price followed by every country indicator column.
# The indicator columns are picked by their 'from_' prefix rather than by the
# label slice 'from_Argentina':'from_US'. That slice raises a KeyError when
# either end is absent after cleaning and silently leaves out any country
# whose name sorts after "US".
country_columns = [col for col in df_wine.columns if str(col).startswith('from_')]
df_X = df_wine[['price'] + country_columns]
# Target: the review score, kept as a one-column DataFrame.
df_Y = df_wine[['points']]
print(df_X.shape)


In [None]:
# Divide the data into a training part (80%) and a testing part (20%).
# random_state is fixed so that the split, and every metric computed from it
# below, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_Y, test_size=0.20, random_state=42
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


<h3>Clean outliers from Test Data</h3>

In [None]:
# Clean test data
def reject_outliers(data, m=2):
    """Return the entries of `data` that lie close to its mean.

    An entry is kept when its absolute distance from the mean of `data` is
    strictly smaller than `m` times the standard deviation of `data`.

    Parameters
    ----------
    data : array-like supporting arithmetic and boolean-mask indexing
        The values to filter (the notebook passes NumPy arrays).
    m : number, default 2
        Width of the accepted band, in standard deviations.

    Returns
    -------
    The subset of `data` selected by the boolean mask.
    """
    centre = np.mean(data)
    spread = np.std(data)
    keep = abs(data - centre) < m * spread
    return data[keep]

# ############################ REMOVE OUTLIERS FROM TESTING ##########################
print("test data with outliers", X_test.shape, y_test.shape)
# Derive the accepted price range from the price column only. Passing the
# whole feature matrix (X_test.values) would mix the 0/1 country indicators
# into the mean/std and distort the bounds; the training-set cleaning below
# also works on the price column alone.
test_prices_wo_outliers = reject_outliers(X_test['price'].values)
min_price = test_prices_wo_outliers.min()
max_price = test_prices_wo_outliers.max()

# Put features and target side by side so both lose exactly the same rows,
# then keep only the rows whose price lies inside [min_price, max_price].
test_data = pd.concat([X_test, y_test], axis=1)
test_data_clean = test_data[test_data.price.between(min_price, max_price)]

print(test_data_clean.shape)

# Split the cleaned frame back into features and a one-column target frame.
X_test = test_data_clean.drop(columns='points')
y_test = test_data_clean[['points']]
print("test data without outliers", X_test.shape, y_test.shape)
#######################################################################################
print(type(y_test))


<h3>Regression WITH outliers in Training data (outlier-clean Test)</h3>

In [None]:
# Fit a linear regression on the training split as it currently stands and
# predict the scores of the cleaned test split.
lm = LinearRegression()
model = lm.fit(X_train, y_train)
y_predictions = model.predict(X_test)

# Learned parameters: feature weights and intercept.
w, b = model.coef_, model.intercept_

print("coeficient: ", w)
print("intercept: ", b)

In [None]:
from copy import copy, deepcopy

# Keep independent snapshots of the current test targets and predictions under
# the ax1_* names, so later reassignment of y_test / y_predictions cannot
# change them.
ax1_y_test, ax1_y_predictions = (
    deepcopy(snapshot) for snapshot in (y_test, y_predictions)
)


In [None]:
# Model score on the test split. For a scikit-learn regressor, score() is the
# R^2 coefficient of determination; it is printed under the label 'Accuracy'.
print('Accuracy:', model.score(X_test, y_test))
# Root mean squared error (RMSE)
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_predictions)))
# Mean absolute error (MAE), kept in a variable for the run with outliers in
# the training data.
mae_with_outliers = metrics.mean_absolute_error(y_test, y_predictions)
print('MAE:', mae_with_outliers)


<h3>Regression W/O outliers in Training data (outlier-clean Test)</h3>

In [None]:
print(type(X_train))
# Remove price outliers from the training data.
print("training data with outliers", X_train.shape, y_train.shape)

# Price range spanned by the training prices that survive the outlier filter.
training_prices_wo_outliers = reject_outliers(X_train.price.values)
min_price = training_prices_wo_outliers.min()
max_price = training_prices_wo_outliers.max()

# Join features and target so both lose the same rows, then keep only the
# training rows whose price lies inside [min_price, max_price].
training_data = pd.concat([X_train, y_train], axis=1)
price_in_range = (training_data.price >= min_price) & (training_data.price <= max_price)
training_clean = training_data[price_in_range]

# 'points' was appended last: every column before it is a feature.
feature_columns = list(training_clean.columns[:-1])
X_train = training_clean[feature_columns]
y_train = training_clean['points'].values.reshape(-1, 1)
print("training data without outliers", X_train.shape, y_train.shape)


# Fit the linear regression again, now on the cleaned training data.
lm = LinearRegression()
model = lm.fit(X_train, y_train)
y_predictions = model.predict(X_test)

# Learned parameters: feature weights and intercept.
w, b = model.coef_, model.intercept_

print("coeficient: ", w)
print("intercept: ", b)

In [None]:
from copy import copy, deepcopy

# Keep independent snapshots of the current test targets and predictions under
# the ax2_* names, so later reassignment of y_test / y_predictions cannot
# change them.
ax2_y_test, ax2_y_predictions = (
    deepcopy(snapshot) for snapshot in (y_test, y_predictions)
)


In [None]:
# Model score on the test split. For a scikit-learn regressor, score() is the
# R^2 coefficient of determination; it is printed under the label 'Accuracy'.
print('Accuracy:', model.score(X_test, y_test))
# Root mean squared error (RMSE)
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_predictions)))
# Mean absolute error (MAE), kept in a variable for the run without outliers
# in the training data.
mae_without_outliers = metrics.mean_absolute_error(y_test, y_predictions)
print('MAE:', mae_without_outliers)


<h3>SUMMARY</h3>

In [None]:
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

# One entry per experiment: (axes, stored true values, stored predictions,
# console heading, panel title). Driving both panels from this table keeps
# each panel tied to its own data; the right-hand panel previously re-plotted
# the ax1_* results, so both panels showed the same points.
panels = [
    (ax1, ax1_y_test, ax1_y_predictions,
     "--> With outliers in Training Set: ", 'With outliers in Training Set'),
    (ax2, ax2_y_test, ax2_y_predictions,
     "--> Without outliers in Training Set:", 'Without outliers in Training Set'),
]

for ax, y_true, y_pred, heading, title in panels:
    ax.scatter(y_true, y_pred)
    ax.set(xlabel='True Values', ylabel='Predictions')

    # np.max / np.min reduce the whole prediction array to a single scalar.
    # The builtin max()/min() iterate over rows and yield one-element arrays,
    # whose conversion with int() is deprecated since NumPy 1.25.
    max_y_predictions = int(np.max(y_pred))
    min_y_predictions = int(np.min(y_pred))

    print(heading)
    print('max_y_predictions: ', max_y_predictions)
    print('min_y_predictions: ', min_y_predictions)
    if ax is ax1:
        print()  # blank line separating the two console reports

    max_y_true = int(y_true.points.max())
    min_y_true = int(y_true.points.min())

    # Horizontal reference line at the highest true score.
    ax.axhline(max_y_true, label='max true value', color="red")

    ax.legend()
    ax.grid()
    ax.set_xlim([80, 100])
    ax.set_ylim([80, 100])
    ax.set_title(title)

plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=None)



<h1>Feature selection (random-forest feature importance)</h1>

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Rank the features with a random forest and keep those whose importance
# exceeds 1.25 times the median importance.
# random_state is fixed so the selected feature set is the same on every run.
embeded_rf_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='1.25*median',
)
# The target is flattened to 1-D: y_train is held as a single column here and
# scikit-learn classifiers warn when given a column vector instead of a 1-D
# target.
embeded_rf_selector.fit(X_train, np.ravel(y_train))

# Boolean mask over the columns of X_train marking the selected features.
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X_train.loc[:, embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
print(embeded_rf_feature)

<h3>Regression with only best features</h3>

In [None]:
print(type(X_train))
print(type(y_train))
# Restrict both splits to the same hand-picked feature subset.
selected_columns = ['price', 'from_Italy', 'from_US']
selected_features_X_train = X_train[selected_columns]
selected_features_X_test = X_test[selected_columns]
print(type(selected_features_X_train))


# apply linear regression again, using only the selected features
lm = linear_model.LinearRegression()
model = lm.fit(selected_features_X_train, y_train)
y_predictions = lm.predict(selected_features_X_test)

w = model.coef_ # parameters of model
b = model.intercept_ #intercept of model

print("coeficient: ", w)
print("intercept: ", b)

# Plot the results
plt.scatter(y_test, y_predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')
# np.max / np.min reduce the whole prediction array to a single scalar. The
# builtin max()/min() iterate over rows and yield one-element arrays, whose
# conversion with int() is deprecated since NumPy 1.25.
max_y_predictions = int(np.max(y_predictions))
min_y_predictions = int(np.min(y_predictions))
max_y_true = int(y_test.points.max())
min_y_true = int(y_test.points.min())
# Horizontal reference line at the highest true score.
plt.axhline(max_y_true,label='max true value', color="red")
plt.legend()
plt.grid()
plt.ylim([80,150])

# Model score on the test split. For a scikit-learn regressor, score() is the
# R^2 coefficient of determination; it is printed under the label 'Accuracy'.
print ('Accuracy:', model.score(selected_features_X_test, y_test))
# Root mean squared error (RMSE)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_predictions)))
# Mean absolute error (MAE)
print('MAE:', metrics.mean_absolute_error(y_test, y_predictions))
