In [636]:
#import neccessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
import tqdm
import copy

<h1>Reading Data</h1>

In [637]:
file_name = "/Users/srikaavya/Downloads/wine.csv"
Data_frame = pd.read_csv(file_name)


In [638]:
Data_frame.shape

(6497, 12)

<h1>Stimulating missing values</h1>
<p>The Data contains extra spaces and dots and null values. Cleaning the data to remove these values.</p>

In [639]:
def generate_missing_data(data, n_col):
    # Get total no.of columns in data
    total_cols = data.shape[1]
    total_samples = data.shape[0]
    if n_col == 0:
        print("Need to give a value more than 0 for generating missing data")
        return
    r_cols = random.sample(range(total_cols), n_col)
    for i in r_cols:
        n_samples = random.sample(range(total_samples),int(random.randint(5, 10) * total_samples / 100))
        print(i, len(n_samples))
        data.iloc[n_samples, i] = np.NaN

    return data

In [640]:
data = generate_missing_data(Data_frame,3)
print(data.isnull().any())

#test = test.replace("?", np.nan)

1 584
9 389
6 324
fixed acidity           False
volatile acidity         True
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide     True
density                 False
pH                      False
sulphates                True
alcohol                 False
quality                 False
dtype: bool


In [641]:
# We are dropping all the rows with missing values and sending it for training.
train = data.dropna(how='any', axis=0)

In [642]:
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [643]:
# We get the rows with missing values
m_rows = data.isna().any(axis=1)
# store the rows with missing values in test dataframe
test = data[m_rows]
list(test.columns[test.isna().any()])


['volatile acidity', 'total sulfur dioxide', 'sulphates']

In [644]:
test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
6,7.9,0.6,0.06,1.6,0.069,15.0,,0.9964,3.3,0.46,9.4,5
12,5.6,,0.0,1.6,0.089,16.0,59.0,0.9943,3.58,0.52,9.9,5
13,7.8,0.61,0.29,1.6,0.114,9.0,,0.9974,3.26,1.56,9.1,5
16,8.5,,0.56,1.8,0.092,35.0,103.0,0.9969,3.3,0.75,10.5,7
22,7.9,,0.21,1.6,0.106,10.0,37.0,0.9966,3.17,0.91,9.5,5


<h1>Modeling</h1>

<h2>Linear Regression</h2>

In [645]:
def error_prediction(predicted_data, original_data):
    MSE = np.mean((original_data - predicted_data)**2)
    return MSE

def mean_center_normalize(X, l_input):
    # Converting whole data between 0 and 1
    X = (X - np.mean(X, 0)) / np.std(X, 0) 
    X = np.hstack((np.ones((l_input, 1)), X))
    return X

class Linear_Regression():

    def __init__(self, data):
        self.data = data
        self.training_data = None
        self.testing_data = None

    def get_weights(self,X, Y):
        r, c = X.shape[0], X.shape[1]
        z = np.ones((r,1))
        X = np.append(X, z, axis=1)
        weights = np.dot(np.linalg.inv(np.dot(np.transpose(X), X)), np.dot(np.transpose(X), Y))
        return np.reshape(weights, (len(weights), -1))

    def naive_linear_regression(self):
        missing_columns = self.data.columns[self.data.isna().any()]
        missing_row_len = self.data.isnull().sum(axis=0)
        null_col = missing_row_len.nlargest(len(missing_columns))
        miss_data = null_col.head()
        null_cols = list(self.data.columns[self.data.isna().any()])
        # create a local temp copy of data
        c_data = copy.deepcopy(self.data)
        c_null = null_cols
        # iterates null_cols from last to first
        for i in null_cols[::-1]:
            c_null.remove(i)
            data = c_data[c_data.columns.drop(c_null)]  # this has only one col with null values
            testing_data = data[data.isna().any(axis=1)]
            training_data = data.dropna(how='any', axis=0)
            null_rows = testing_data.index
            testing_data = testing_data.dropna(axis=1)
            X = training_data.drop(i, axis=1)
            Y = training_data[i]
            W = self.get_weights(X,Y)
            predicted_data = self.predict(testing_data, W)
            testing_data[i] = predicted_data
            for j, r in enumerate(null_rows):
                c_data.loc[r, i] = testing_data.loc[r, i]
        return c_data


    def predict(self,testing_data, w):
        X = mean_center_normalize(testing_data, testing_data.shape[0])
        predicted_value = np.dot(X, w)
        return predicted_value

feature = Linear_Regression(data)
predicted_data = feature.naive_linear_regression()
print(data.isnull().any())
missing_columns = data.columns[data.isna().any()]
original_data = pd.read_csv(file_name)
err = error_prediction(predicted_data[missing_columns],original_data[missing_columns])
print(err)


fixed acidity           False
volatile acidity         True
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide     True
density                 False
pH                      False
sulphates                True
alcohol                 False
quality                 False
dtype: bool
volatile acidity        3.334958e+02
total sulfur dioxide    5.733787e+06
sulphates               2.972822e+02
dtype: float64


NMHC(GT)         2118.993467     1777
C6H6(GT)            0.781248     1964
PT08.S3(NOx)    17978.876589     2994
NO2(GT)           417.579479     1590
PT08.S4(NO2)     6434.840880     3274
T                   5.738153     1497
7 1590
6 2994
2 1777
8 3274
10 1497
3 1964