In [965]:
#import neccessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
import tqdm
import copy

<h1>Reading Data</h1>

In [984]:
file_name = "/Users/srikaavya/Downloads/wine.csv"
Data_frame = pd.read_csv(file_name)


In [985]:
Data_frame.shape

(6497, 12)

<h1>Stimulating missing values</h1>
<p>The Data contains extra spaces and dots and null values. Cleaning the data to remove these values.</p>

In [995]:
def generate_missing_data(data, n_col):
    # Get total no.of columns in data
    total_cols = data.shape[1]
    total_samples = data.shape[0]
    if n_col == 0:
        print("Need to give a value more than 0 for generating missing data")
        return
    r_cols = random.sample(range(total_cols), n_col)
    for i in r_cols:
        n_samples = random.sample(range(total_samples),int(random.randint(5, 25) * total_samples / 100))
        data.iloc[n_samples, i] = np.NaN

    return data

In [996]:
data = generate_missing_data(Data_frame,3)

#test = test.replace("?", np.nan)

In [988]:
# We are dropping all the rows with missing values and sending it for training.
train = data.dropna(how='any', axis=0)

In [989]:
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


In [990]:
# We get the rows with missing values
m_rows = data.isna().any(axis=1)
# store the rows with missing values in test dataframe
test = data[m_rows]


In [991]:
test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,,9.8,5
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,,9.5,7
35,7.8,0.645,0.0,5.5,0.086,5.0,,0.9986,3.4,0.55,9.6,6
49,5.6,0.31,0.37,1.4,0.074,12.0,,0.9954,3.32,0.58,9.2,5
50,8.8,0.66,0.26,1.7,0.074,4.0,,0.9971,3.15,0.74,9.2,5


<h1>Modeling</h1>

<h2>Linear Regression</h2>

In [992]:
# Method to find mean square error
def error_prediction(predicted_data, original_data):
    MSE = np.mean((original_data - predicted_data)**2)
    return MSE

def mean_center_normalize(X, l_input):
    # Converting whole data between 0 and 1
    X = (X - np.mean(X, 0)) / np.std(X, 0) 
    # Stacking extra 1's column as bias
    X = np.hstack((np.ones((l_input, 1)), X))
    return X

# Class which includes all the functions of linear regression

class Linear_Regression():

    def __init__(self):
        self.training_data = None
        self.testing_data = None
        self.weights = []
    
    # Function to calculates weights of the training model.
    def get_weights(self,X, Y):
        r, c = X.shape[0], X.shape[1]
        z = np.ones((r,1))
        X = np.append(X, z, axis=1)
        weights = np.dot(np.linalg.inv(np.dot(np.transpose(X), X)), np.dot(np.transpose(X), Y))
        return np.reshape(weights, (len(weights), -1))

    # Fitting linear regression model.
    def naive_linear_regression(self,data):
        # Initializing data
        self.data = data
        # Get all the columns with missing values
        missing_columns = self.data.columns[self.data.isna().any()]
        # Get all the rows with missing values
        missing_row_len = self.data.isnull().sum(axis=0)
        # Arrange according to no. of missing values in each column
        null_col = missing_row_len.nlargest(len(missing_columns))
        miss_data = null_col.head()
        null_cols = list(self.data.columns[self.data.isna().any()])
        # create a local object of data
        c_data = copy.deepcopy(self.data)
        c_null = null_cols
        # iterates null_cols from last to first
        for i in null_cols[::-1]:
            # Remove the current predicting model from null columns
            c_null.remove(i)
            # code for considering only one missing values column at a tome
            data = c_data[c_data.columns.drop(c_null)]  
            # Generate testing data
            testing_data = data[data.isna().any(axis=1)]
            # Generate training data
            training_data = data.dropna(how='any', axis=0)
            null_rows = testing_data.index
            testing_data = testing_data.dropna(axis=1)
            # Assigning X,Y values to get weights
            X = training_data.drop(i, axis=1)
            Y = training_data[i]
            # Calling get_weights method to get weight vector
            self.weights = self.get_weights(X,Y)
            # predict the missing values
            predicted_data = self.predict(testing_data, self.weights)
            # Merging the predicted y value in testing data
            testing_data[i] = predicted_data
            # Adding the missing values to actual data
            for j, r in enumerate(null_rows):
                c_data.loc[r, i] = testing_data.loc[r, i]
        return c_data

    # Method to predict y values based on 
    def predict(self,testing_data, w):
        X = mean_center_normalize(testing_data, testing_data.shape[0])
        predicted_value = np.dot(X, w)
        return predicted_value

#Instance of the class
feature = Linear_Regression()
#Fitting the model and predicting missing values
predicted_data = feature.naive_linear_regression(data)
missing_columns = data.columns[data.isna().any()]
original_data = pd.read_csv(file_name)
err = error_prediction(predicted_data[missing_columns],original_data[missing_columns])
print(err)


citric acid             6.921361e-02
total sulfur dioxide    6.298163e+06
sulphates               2.574772e+02
dtype: float64


<h1>Data generation<h1>

In [963]:
"""
def data_generation_using_linear_regression():
    single_row = pd.DataFrame(original_data.loc[0])
    synthetic_data = pd.DataFrame().reindex_like(single_row)
    random_col = random.sample(range(original_data.shape[1]), 5)
    random_col_name = original_data.columns[random_col[0]]
    samp = random.randint(0, original_data.shape[0])
    for col in random_col:
        synthetic_data.iloc[col] = train.iloc[samp,col]
    syn_list = synthetic_data.values.tolist()
    #print(list(np.concatenate(syn_list).flat))
    original_data.loc[original_data.shape[0]] = synthetic_data.values.tolist()
    predicted_data = feature.naive_linear_regression(original_data)
    return predicted_data
synthetic_data_generated = data_generation_using_linear_regression()
print(synthetic_data_generated)
"""

     fixed acidity volatile acidity citric acid residual sugar chlorides  \
0              7.4              0.7           0            1.9     0.076   
1              7.8             0.88           0            2.6     0.098   
2              7.8             0.76        0.04            2.3     0.092   
3             11.2             0.28        0.56            1.9     0.075   
4              7.4              0.7           0            1.9     0.076   
...            ...              ...         ...            ...       ...   
6493           6.6             0.32        0.36              8     0.047   
6494           6.5             0.24        0.19            1.2     0.041   
6495           5.5             0.29         0.3            1.1     0.022   
6496             6             0.21        0.38            0.8      0.02   
6497         [nan]           [0.52]       [nan]          [nan]   [0.034]   

     free sulfur dioxide total sulfur dioxide   density      pH sulphates  \
0         