In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
import tqdm

<h1>Reading Data</h1>

In [110]:
headers = ["CO(GT)","PT08.S1(CO)","NMHC(GT)","C6H6(GT)","PT08.S2(NMHC)","NOx(GT)","PT08.S3(NOx)","NO2(GT)","PT08.S4(NO2)","PT08.S5(O3)","T","RH","AH"]

In [111]:
file_name = "/Users/srikaavya/Downloads/AirQualityUCI/AirQualityUCI.xlsx"
Data_frame = pd.read_excel(file_name)


In [112]:
Data_frame.shape


(9357, 13)

<h1>Stimulating missing values</h1>
<p>The Data contains extra spaces and dots and null values. Cleaning the data to remove these values.</p>

In [113]:
def generate_missing_data(data, n_col):
    total_cols = data.shape[1]
    total_samples = data.shape[0]
    if n_col == 0:
        print("Need to give a value more than 0 for generating missing data")
        return
    r_cols = random.sample(range(total_cols), n_col)
    n_samples = random.sample(range(total_samples), random.randint(5, 25))
    for i in r_cols:
        n_samples = random.sample(range(total_samples),int(random.randint(5, 25) * total_samples / 100))
        data.iloc[:, i].where(data.iloc[n_samples, i] < 0, inplace=True)

    return data

In [114]:
data = generate_missing_data(Data_frame,3)
#test = test.replace("?", np.nan)

In [115]:
# We are dropping all the rows with missing values and sending it for training.
train = data.dropna(how='any', axis=0)

In [116]:
train.head()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
1,2.0,1292.25,112.0,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2.2,1402.0,88.0,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
6,1.2,1185.0,31.0,3.624399,689.5,62.0,1461.75,77.0,1332.75,732.5,11.325,56.775,0.760312
11,0.7,1066.0,8.0,1.133431,512.0,16.0,1918.0,28.0,1182.0,421.75,11.0,56.175,0.73656
12,0.7,1051.75,16.0,1.603768,553.25,34.0,1738.25,48.0,1221.25,471.5,10.45,58.125,0.735295


In [117]:
# We get the rows with missing values
m_rows = data.isna().any(axis=1)
# store the rows with missing values in test dataframe
test = data[m_rows]
list(test.columns[test.isna().any()])


['CO(GT)', 'NMHC(GT)', 'PT08.S5(O3)']

In [118]:
test.head()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.6,1360.0,,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
3,2.2,1375.5,,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,,1272.25,,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794
5,,1197.0,,4.741012,750.25,89.0,1336.5,96.0,1393.0,949.25,11.175,59.175,0.784772
7,,1136.25,31.0,3.326677,672.0,62.0,1453.25,76.0,1332.75,729.5,10.675,60.0,0.770238


<h1>Modeling</h1>

<h2>Linear Regression</h2>

In [119]:
def error_prediction(predicted_data, original_data, columns):
    MSE = np.square(np.subtract(predicted_data, original_data)).mean()
    return MSE

def fill_mean(data):
    null_col = self.data.columns[self.data.isna().any()]
    for i in null_col:
        mean_col = round(data.iloc[:, i].mean(), 2)
        data.iloc[:, i].fillna(mean_col, inplace=True)
    return data

class Linear_Regression():

    def __init__(self, data):
        self.data = data
        self.dropped_data = None
        self.out_data = None
        self.training_data = None
        self.testing_data = None

    def get_weights(self,X, Y):
        weights = np.dot((np.linalg.inv(np.dot(X.T, X))).T, (np.dot(X.T, Y)))
        return np.reshape(weights, (len(weights), -1))

    def naive_linear_regression(self):
        missing_columns = self.data.columns[self.data.isna().any()]
        missing_row_len = self.data.isnull().sum(axis=0)
        null_col = missing_row_len.nlargest(len(missing_columns))
        miss_data = null_col.head()
        null_cols = list(miss_data.index.values)
        # create a local temp copy of data
        c_data = self.data.copy()
        c_null = null_cols
        # iterates null_cols from last to first
        for i in null_cols[::-1]:
            c_null.remove(i)
            data = c_data[c_data.columns.drop(c_null)]  # this has only one col with null values
            testing_data = data[data.isna().any(axis=1)]
            training_data = data.dropna(how='any', axis=0)
            null_rows = testing_data.index
            testing_data = testing_data.dropna(axis=1)
            X = training_data.drop(i, axis=1)
            Y = training_data[i]
            W = self.get_weights(X,Y)
            predicted_data = self.predict(testing_data, W)
            testing_data[i] = predicted_data

            for j, r in enumerate(null_rows):
                c_data.loc[r, i] = testing_data.loc[r, i]
        return c_data


    def predict(self,testing_data, w):
        X = testing_data
        predicted_value = np.dot(X, w)
        return predicted_value

feature = Linear_Regression(data)
predicted_data = feature.naive_linear_regression()
print(predicted_data.isna().any())
null_value_cols = data.columns[data.isna().any()]
err = error_prediction(predicted_data,data,null_value_cols)
print(err)

CO(GT)           False
PT08.S1(CO)      False
NMHC(GT)         False
C6H6(GT)         False
PT08.S2(NMHC)    False
NOx(GT)          False
PT08.S3(NOx)     False
NO2(GT)          False
PT08.S4(NO2)     False
PT08.S5(O3)      False
T                False
RH               False
AH               False
dtype: bool
CO(GT)           0.0
PT08.S1(CO)      0.0
NMHC(GT)         0.0
C6H6(GT)         0.0
PT08.S2(NMHC)    0.0
NOx(GT)          0.0
PT08.S3(NOx)     0.0
NO2(GT)          0.0
PT08.S4(NO2)     0.0
PT08.S5(O3)      0.0
T                0.0
RH               0.0
AH               0.0
dtype: float64
