In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt
from plotly.subplots import make_subplots
from plotly.offline import iplot, plot, init_notebook_mode
import plotly.express as px
import copy

init_notebook_mode(connected=True)
repo = ('C:/Users/Public/Public Datasets/UCI/Wine Quality/winequality-red.csv')
columns = ('fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides',
           'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality')

df = pd.read_csv(repo, sep=';', names=columns, skipinitialspace=False)
repeat_set_values = copy.copy(df)
df_validate = df.copy()

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1599 non-null   float64
 1   volatile_acidity      1599 non-null   float64
 2   citric_acid           1599 non-null   float64
 3   residual_sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free_sulfur_dioxide   1599 non-null   float64
 6   total_sulfur_dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [3]:
GBs = [(df.groupby(i).agg(
    {df.columns[-1]: [np.min, np.max, np.mean, np.var, 'count']})) for i in df.columns[0:-1]]
Allcolumns = [i for i in df.columns]
Ifnulls = df.isnull().sum().any()
Nulls = [df[str(i)].isnull().sum() for i in df.columns]
Allnulls = df.isnull().sum()

In [4]:
df_stats = df.describe().T

In [5]:
def NumAssignment():
    np.random.seed(seed=42)
    for i in range(0, 100):
        k = np.random.randint(0, len(df))
        df.loc[k, Allcolumns[-1]] = np.nan

In [6]:
def StatsCounter(x, y):

    r = np.sum((x - np.mean(x)) * (y - np.mean(y))) / \
        (sqrt((np.sum((x - np.mean(x))**2))) * sqrt((np.sum((y - np.mean(y))**2))))

    SSx = (sum(x * x) - (((sum(x))**2) / len(x)))
    SSy = (sum(y * y) - (((sum(y))**2) / len(y)))
    SSxy = sum(x * y) - (1 / len(x)*(sum(x))*(sum(y)))

    r2 = (((sum((x-np.mean(x))*(y-np.mean(y)))) /
           sqrt((sum(x * x) - (((sum(x))**2) / len(x)))*(sum(y * y) - (((sum(y))**2) / len(y))))) ** 2)

    return SSx, SSy, r, SSxy, r2

In [7]:
def FourScatterPlotly():
    r = 'scatter'
    t = 2, 32
    a = df[Allcolumns[-1]]
    rows, cols = [t[1] for i in range(2)]
    scatter_specs = [[[{'type': r}, {'type': r}], [
        {'type': r}, {'type': r}]] for i in r]
    scatter_cols = [df[Allcolumns[1]], df[Allcolumns[4]], df[Allcolumns[7]], df[Allcolumns[9]]]
    fig01 = make_subplots(rows=2, cols=2, specs=scatter_specs[0], shared_yaxes=True)

    f1 = 'markers'
    f2 = 'lines+markers'
    g = ['r = -0.194886 \n', 'r = -0.209264 \n', 'r = -0.305694 \n', 'r = 0.432296 \n']

    palette = ['#BB86FC', 'rgb(255, 161, 90)', '#03DAC5', '#FF0266']
    grayscale = ['#121212', '#332940', 'whitesmoke']
    s_all = [i for i in [scatter_cols]][0]

    fig01.add_scatter(x=s_all[0], y=a, mode=f1, name=g[0],
                      marker={'color': palette[0]}, row=1, col=1)
    fig01.add_scatter(x=s_all[1], y=a, mode=f1, name=g[1],
                      marker={'color': palette[1]}, row=1, col=2)
    fig01.add_scatter(x=s_all[2], y=a, mode=f1, name=g[2],
                      marker={'color': palette[2]}, row=2, col=1)
    fig01.add_scatter(x=s_all[3], y=a, mode=f1, name=g[3],
                      marker={'color': palette[3]}, row=2, col=2)

    fig01.update_layout({'title': {'text': 'Impact on quality', 'font': {'size': t[1]}},
                         'showlegend': True, 'font': {'color': grayscale[2]},
                         'paper_bgcolor': grayscale[0], 'plot_bgcolor': grayscale[1]})

    gridcolor = ['#121212', '#1F1b24']

    fig01.update_yaxes({'title': {'text': 'Quality of Wine'},
                        'gridcolor': gridcolor[1], 'fixedrange': True})
    fig01.update_xaxes({'gridcolor': gridcolor[1], 'zeroline': False,
                        'fixedrange': True})
    fig01.update_xaxes({'title': {'text': 'Volatile Acidity'}, 'tickmode': 'auto'}, row=1, col=1)
    fig01.update_xaxes({'title': {'text': 'Chlorides'}, 'tickmode': 'auto'}, row=1, col=2)
    fig01.update_xaxes({'title': {'text': 'Density'}, 'tickmode': 'auto'}, row=2, col=1)
    fig01.update_xaxes({'title': {'text': 'Sulphates'}, 'tickmode': 'auto'}, row=2, col=2)

    iplot(fig01)

In [8]:
NumAssignment()

In [9]:
CustomGB_11 = df_validate.groupby('quality').agg({'density': [np.mean, np.min, np.max, np.std]}, {
    'volatile_acidity': [np.mean, np.min, np.max, np.std]})

In [10]:
FourScatterPlotly()

In [11]:
df_iterative = copy.copy(df)
df_interpolation = copy.copy(df)
df_rfsampling = copy.copy(df)

df_rfsampling.index.name = 'sample'
df_rfsampling = df_rfsampling.reset_index()

In [12]:
def TrainTestSplit():

    train = df_rfsampling[df_rfsampling.quality.isnull() == False]
    test = df_rfsampling[df_rfsampling.quality.isnull() == True]

    X_train = train.drop(['quality'], 1)
    Y_train = np.ravel(train[['quality']])

    X_test = test.drop(['quality'], 1)

    return df_rfsampling, train, test, X_train, Y_train, X_test

In [13]:
df_rfsampling, train, test, X_train, Y_train, X_test = TrainTestSplit()

In [14]:
columns_T = np.ravel(df[df.quality.isnull() == True].T.columns)
df_validate.index.name = 'sample'
df_interpolation.index.name = 'sample'
df_iterative.index.name = 'sample'
df_iterative = df_iterative.reset_index()

In [15]:
def SamplingMethods():

    iterative = IterativeImputer(random_state=42)

    df_iterative_predictions = pd.DataFrame(
        iterative.fit_transform(df_iterative), columns=df_iterative.columns)

    df_interpolation.quality = df_interpolation.quality.fillna(
        df_interpolation.quality.interpolate(method='linear'))

    rf = RandomForestRegressor(n_estimators=200, random_state=42)
    rf.fit(X_train, Y_train)

    rfpreds = rf.predict(X_test)
    X_test['quality'] = rfpreds
    df_rfpredictions = train.append(X_test)
    df_rfpredictions = df_rfpredictions.sort_values(by=['sample'], ascending=True)

    return df_iterative_predictions, df_interpolation, df_rfpredictions

In [16]:
df_iterative, df_interpolation, df_rfpredictions = SamplingMethods()

In [17]:
def TransposeValidation():

    df_validate_T = df_validate.reset_index()
    missingvalues = columns_T
    df_validate_T = df_validate_T.T[missingvalues]

    df_iterative_T = df_iterative.reset_index()
    df_iterative_T = df_iterative_T.T[missingvalues]
    df_interpolation_T = df_interpolation.reset_index()
    df_interpolation_T = df_interpolation_T.T[missingvalues]
    df_rfpredictions_T = df_rfpredictions.T[missingvalues]
    return df_validate_T, df_iterative_T, df_interpolation_T, df_rfpredictions_T, missingvalues

In [18]:
df_validate_T, df_iterative_T, df_interpolation_T, df_rfpredictions_T, missingvalues = TransposeValidation()

In [19]:
def SimpleScatterPlotly():

    t = 2, 32
    rows, cols = [1 for i in range(2)]
    scatter_cols = [df_validate_T.loc['sample', ::], df_validate_T.loc['quality', ::], df_iterative_T.loc['sample', ::],
                    df_iterative_T.loc['quality', ::], df_interpolation_T.loc['sample', ::], df_interpolation_T.loc['quality', ::], df_rfpredictions_T.loc['sample', ::], df_rfpredictions_T.loc['quality', ::]]

    f1 = 'markers'
    f2 = 'lines+markers'
    g = ['Validation set', 'Iterative', 'Interpolation', 'Rf sampling']

    palette = ['#BB86FC', 'rgb(255, 161, 90)', '#6200EE', '#03DAC5']
    grayscale = ['#332940', '#121212', 'whitesmoke']
    s_all = [i for i in [scatter_cols]][0]

    fig02 = make_subplots(rows=rows, cols=cols, specs=[[{'type': 'scatter'}]], shared_yaxes=True)

    fig02.add_scatter(x=s_all[-8],
                      y=s_all[1],
                      mode=f1,
                      name=g[0],
                      marker={'color': palette[0]},
                      row=rows, col=cols)

    fig02.update_layout({'title': {'text': 'Quality of wine, FAF',
                                   'font': {'size': t[1]}}, 'showlegend': True,
                         'font': {'color': grayscale[-1]},
                         'paper_bgcolor': grayscale[1],
                         'plot_bgcolor': grayscale[0]
                         })

    gridcolor = ['#121212', '#1F1b24']

    fig02.update_xaxes({'title': {'text': 'Sample'},
                        'gridcolor': gridcolor[1], 'zeroline': False
                        })
    fig02.update_yaxes({'title': {'text': 'Quality'},
                        'gridcolor': gridcolor[1], 'zeroline': False
                        })

    fig03 = make_subplots(rows=rows, cols=cols, specs=[[{'type': 'scatter'}]])

    fig03.add_scatter(x=s_all[-8],
                      y=s_all[1],
                      mode=f1,
                      name=g[0],
                      marker={'color': palette[0]},
                      row=1, col=cols)

    fig03.add_scatter(x=s_all[2],
                      y=s_all[3],
                      mode=f1,
                      name=g[1],
                      marker={'color': palette[1]},
                      row=1, col=cols)

    fig03.add_scatter(x=s_all[4],
                      y=s_all[5],
                      mode=f1,
                      name=g[2],
                      marker={'color': palette[2]},
                      row=1, col=cols)

    fig03.add_scatter(x=s_all[6],
                      y=s_all[7],
                      mode=f1,
                      name=g[3],
                      marker={'color': palette[3]},
                      row=1, col=1)

    fig03.update_layout({'title': {'text': 'Predicting quality of wine, multiple methods',
                                   'font': {'size': t[1]}}, 'showlegend': True,
                         'font': {'color': grayscale[-1]},
                         'paper_bgcolor': grayscale[1],
                         'plot_bgcolor': grayscale[0]
                         })

    fig03.update_xaxes({'title': {'text': 'Sample'},
                        'gridcolor': gridcolor[1], 'zeroline': False
                        })
    fig03.update_yaxes({'title': {'text': 'Quality'},
                        'gridcolor': gridcolor[1], 'zeroline': False
                        })

    iplot(fig02)
    iplot(fig03)

In [20]:
SimpleScatterPlotly()

In [21]:
baseline = []

In [22]:
def MethodScores():

    baseline_scores_of_probability = list(abs(np.random.choice(repeat_set_values.quality) - np.random.randint(3, 9))
                                          for i in range(0, len(df[df.quality.isnull() == True])))

    baseline.append(np.mean(baseline_scores_of_probability))

    iterative_scores = [
        abs(df_validate.loc[i, 'quality'] - df_iterative.loc[i, 'quality']) for i in missingvalues]
    interpolate_scores = [
        abs(df_validate.loc[i, 'quality'] - df_interpolation.loc[i, 'quality']) for i in missingvalues]
    rfsampling_scores = [
        abs(df_validate.loc[i, 'quality'] - df_rfpredictions.loc[i, 'quality']) for i in missingvalues]
    return baseline_scores_of_probability, iterative_scores, interpolate_scores, rfsampling_scores

In [23]:
baseline_scores, iterative_scores, interpolate_scores, rfsampling_scores = MethodScores()

In [24]:
baseline = [np.sum(baseline_scores)]
df_validate.isnull().sum().any()

iterative_scores_agg = np.sum(iterative_scores)
interpolate_scores_agg = np.sum(interpolate_scores)
rfsampling_scores_agg = np.sum(rfsampling_scores)

baseline_frame = pd.DataFrame({'A': baseline_scores})
iterative_frame = pd.DataFrame({'B': iterative_scores})
interpolate_frame = pd.DataFrame({'C': interpolate_scores})
rfsampling_frame = pd.DataFrame({'D': rfsampling_scores})


iterative_scores_agg

48.52118797478751

In [25]:
scores_frame = pd.concat([baseline_frame, iterative_frame, interpolate_frame, rfsampling_frame], 1)

scores_frame[5::80]

rollingmean_df = pd.DataFrame({})

rollingmean_df['baseline_rollingmean'] = baseline_frame.A.rolling(
    window=15, min_periods=1).mean()
rollingmean_df['iterative_rollingmean'] = iterative_frame.B.rolling(
    window=15, min_periods=1).mean()
rollingmean_df['interpolate_rollingmean'] = interpolate_frame.C.rolling(
    window=15, min_periods=1).mean()
rollingmean_df['rfsampling_rollingmean'] = rfsampling_frame.D.rolling(
    window=15, min_periods=1).mean()

rollingmean_df[80::2]

Unnamed: 0,baseline_rollingmean,iterative_rollingmean,interpolate_rollingmean,rfsampling_rollingmean
80,1.533333,0.49036,0.733333,0.380333
82,1.533333,0.512024,0.7,0.416
84,1.466667,0.516904,0.733333,0.468667
86,1.466667,0.65795,0.933333,0.661
88,1.8,0.661908,0.855556,0.673667
90,1.866667,0.623962,0.844444,0.623333
92,1.866667,0.569868,0.644444,0.501667
94,1.733333,0.634877,0.722222,0.517667
96,1.8,0.602069,0.744444,0.541667


In [31]:
def MasterScatterPlotly():

    t = 2, 32
    rows, cols = [1 for i in range(2)]
    bars_cols = [iterative_scores_agg, interpolate_scores_agg, rfsampling_scores_agg]
    bars_labels = ['iterative', 'interpolation', 'rfsampling', 'baseline']

    fig04 = make_subplots(rows=rows, cols=cols, specs=[[{'type': 'bar'}]])

    fig04.add_bar(x=[bars_labels[0]],
                  y=[100 - bars_cols[-3]+abs(baseline[0] + 10)],
                  name=bars_labels[0],
                  row=rows, col=cols)

    fig04.add_bar(x=[bars_labels[1]],
                  y=[100 - bars_cols[-2]+abs(baseline[0] + 10)],
                  name=bars_labels[1],
                  row=rows, col=cols)

    fig04.add_bar(x=[bars_labels[2]],
                  y=[100 - bars_cols[-1]+abs(baseline[0] + 10)],
                  name=bars_labels[2],
                  row=rows, col=cols)

    fig04.add_bar(x=[bars_labels[3]],
                  y=[100 - +10],
                  name=bars_labels[3],
                  row=rows, col=cols)

    line_xs = [np.arange(0, 50, 2), np.arange(0, 100, 1)]
    line_ys = [baseline_frame.A, iterative_frame.B, interpolate_frame.C, rfsampling_frame.D]
    line_labels = ['Baseline deviation', 'Iterative', 'Interpolation', 'Rfsampling']

    f1 = 'markers'
    f2 = 'lines+markers'
    f3 = 'lines'
    g = ['Validation set', 'Iterative', 'Interpolation', 'Rf sampling']

    palette = ['#BB86FC', 'rgb(255, 161, 90)', '#6200EE', '#03DAC5']
    grayscale = ['#332940', '#121212', 'whitesmoke']

    fig05 = make_subplots(rows=rows, cols=cols, specs=[[{'type': 'scatter'}]])

    fig05.add_scatter(x=line_xs[1],
                      y=line_ys[0],
                      name=line_labels[0],
                      mode=f1,
                      marker={'color': palette[1]}
                      )

    fig05.add_scatter(x=line_xs[1],
                      y=line_ys[1],
                      name=line_labels[1],
                      mode=f3,
                      marker={'color': palette[0]}
                      )

    fig05.add_scatter(x=line_xs[1],
                      y=line_ys[2],
                      name=line_labels[2],
                      mode=f3,
                      marker={'color': palette[2]}
                      )

    fig05.add_scatter(x=line_xs[1],
                      y=line_ys[3],
                      name=line_labels[3],
                      mode=f3,
                      marker={'color': palette[3]}
                      )

    fig05.update_layout({'title': {'text': 'Deviation from validation frame',
                                   'font': {'size': t[1]}}, 'showlegend': True,
                         'font': {'color': grayscale[2]},
                         'paper_bgcolor': grayscale[1],
                         'plot_bgcolor': grayscale[0]
                         })

    gridcolor = ['#121212', '#1F1b24']

    fig05.update_xaxes({'title': {'text': 'Location in transposed set'},
                        'gridcolor': gridcolor[1], 'zeroline': False
                        })
    fig05.update_yaxes({'title': {'text': 'Deviation from value'},
                        'gridcolor': gridcolor[1], 'zeroline': False
                        })

    fig04.update_layout({'title': {'text': 'Scores of each method',
                                   'font': {'size': t[1]}}, 'showlegend': False, })
    fig04.update_yaxes({'title': {'text': 'Baseline differential'}, })
    fig04.update_xaxes({'title': {'text': 'Prediction models'}, })

    iplot(fig04)

In [32]:
MasterScatterPlotly()

In [28]:
pd.set_option('display.max_columns', None)
# print(df[:5])
# print(GBs)
# print(df.quality.isnull().sum().any())
# print(df[df.quality.isnull()][:5])
# print(Allcolumns)
# print(specs)