# *importing the libraries*


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection  import train_test_split

# 1. reading the datasets

In [None]:
# Load the dataset from 'fuel.csv' in the current working directory.
df = pd.read_csv(filepath_or_buffer='fuel.csv')
# Preview the first five rows (the default row count of `head`).
df.head(5)

# 2. EDA


#  2.1 describing the dataset

In [None]:
# df.describe()


# 2.2 histogram : data distribution

In [None]:
# Draw the histograms pandas produces for the frame's columns to inspect
# each distribution. Both statements must sit at the same indentation level:
# an indented `plt.show()` after an unindented line is an IndentationError.
df.hist(figsize=(3, 3), color=['purple'])
plt.show()

# 2.3 pairplot

In [None]:
# Pairwise scatter plots of the columns, with points coloured by the
# value of the 'CYLINDERS' column.
sns.pairplot(df, hue='CYLINDERS')
plt.show()

# 2.4 correlation matrix

In [None]:
# Keep only the numeric columns, since correlation is undefined for text.
n_df = df.select_dtypes(include=np.number)
# Pairwise Pearson correlation (the default method of `corr`).
corrmatrix = n_df.corr(method='pearson')
print(corrmatrix)


# 2.5 heatmap

In [None]:
# Correlation heatmap with the coefficient written in every cell.
# Restrict the frame to numeric columns first: `DataFrame.corr()` no longer
# drops non-numeric columns silently (pandas >= 2.0 raises on them), and this
# keeps the cell independent of variables created in other cells.
sns.heatmap(
    df.select_dtypes(include='number').corr(),
    annot=True,
    cmap='BuPu',
)
plt.show()

# 3. preprocessing

# 3.1 Checking null values

In [None]:
 df.info()

# 3.2 checking for duplicate records/rows

In [None]:
# True if at least one row repeats an earlier row; nothing is removed here.
df.duplicated(keep='first').any()

# 3.3 (a) feature scaling

In [None]:
# Select columns by position: 4, 5 and 8 through 12.
# NOTE(review): positions depend on the CSV column order — presumably these
# are the numeric feature columns; confirm against the file header.
d = df.iloc[:, [4, 5, *range(8, 13)]]

In [None]:
# Min-max scaling, column by column: (x - min) / (max - min).
normalized = d.sub(d.min()).div(d.max().sub(d.min()))
normalized.head()

# 3.3 (b) z-score standardization

In [None]:
# We perform feature scaling because we do not want features that take a larger range of values to dominate the prediction/estimation over features that take a smaller range of values.
# z-score: (x - mean(x)) / std(x)

In [None]:
# Standardise every selected column: subtract its mean, divide by its
# standard deviation (pandas' default sample std).
zscore = d.sub(d.mean()).div(d.std())
zscore.head()

In [None]:
# Standardise with scikit-learn for comparison with the manual z-score.
# Fit on the raw features `d`, not on the already standardised frame:
# the transformed output is the same, but the fitted scaler then holds the
# statistics of the original data and can be reused on new raw samples.
sc = StandardScaler()
XScaled = sc.fit_transform(d)
XScaled

In [None]:
# Keep only the first standardised column; the list index [0] keeps the
# result a one-column DataFrame rather than a Series.
extracted = zscore.iloc[:,[0]]
# Preview the first rows of the extracted feature.
extracted.head()

In [None]:
# 80/20 split of the single standardised feature against the
# 'CO2EMISSIONS' target; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    extracted.iloc[:, 0].to_numpy(),
    df['CO2EMISSIONS'].to_numpy(),
    test_size=0.2,
    random_state=101,
)
# Number of samples in each of the four pieces.
x_train.size, x_test.size, y_train.size, y_test.size

In [None]:
# Closed-form least-squares slope:
#   (mean(x*y) - mean(x)*mean(y)) / (mean(x^2) - mean(x)^2)
w = (
    (x_train * y_train).mean() - x_train.mean() * y_train.mean()
) / ((x_train ** 2).mean() - x_train.mean() ** 2)
w

In [None]:
# Closed-form intercept: mean(y) - w * mean(x).
b = np.mean(y_train) - w * np.mean(x_train)
b

In [None]:
# Fitted line evaluated at the training inputs.
y_pred = w * x_train + b
# Training points in grey with the fitted line drawn over them in red.
plt.scatter(x_train, y_train, c='grey')
plt.plot(x_train, y_pred, c='red')
plt.show()

# 3.3 (c) normal equation method.

In [None]:
# Normal equation: W = (X^T X)^-1 . X^T Y
# Turn the 1-D training arrays into column vectors. Using -1 lets NumPy infer
# the row count, so the cell keeps working when the dataset or the split
# ratio changes (a hard-coded row count breaks as soon as either does).
Y = y_train.reshape(-1, 1)
X = x_train.reshape(-1, 1)
# Column of ones that becomes the intercept term of the design matrix.
onesArray = np.ones(X.shape)
Y.shape, X.shape

In [None]:
# Design matrix: intercept column of ones followed by the feature column.
# Note: this rebinds X, so re-running the cell stacks another ones column.
X = np.hstack((onesArray, X))
# (X^T X)^-1
inverse = np.linalg.inv(np.dot(X.T, X))

# Parameter vector [intercept, slope]^T = (X^T X)^-1 . X^T Y
dablu = np.dot(inverse, np.dot(X.T, Y))
dablu
# inverse

# gradient descent method

In [None]:
def computeCost(X, Y, w, b):
    """Return the halved mean squared error of the line ``w * X + b``.

    The cost is ``J = sum((w * X + b - Y) ** 2) / (2 * m)`` where ``m`` is the
    number of samples. The error is squared per sample *before* averaging;
    squaring the difference of the averages instead would report zero cost
    for any line that merely passes through the point (mean(X), mean(Y)).

    Parameters
    ----------
    X : array-like
        Feature values, one per sample.
    Y : array-like
        Target values, same number of samples as ``X``.
    w : float
        Slope of the candidate line.
    b : float
        Intercept of the candidate line.

    Returns
    -------
    float
        The cost ``J`` for the given ``w`` and ``b``.
    """
    # Flatten so an (m, 1) column and an (m,) vector cannot broadcast to (m, m).
    features = np.asarray(X, dtype=float).ravel()
    targets = np.asarray(Y, dtype=float).ravel()
    residuals = (w * features + b) - targets
    return np.sum(residuals ** 2) / (2 * features.shape[0])

def gradientDescent(X, Y, learningRate, numIterations, verbose=True):
    """Fit ``y = w * x + b`` with batch gradient descent.

    Starting from ``w = b = 0``, each iteration records the current cost and
    then updates both parameters simultaneously with the gradients of the
    halved mean squared error:

        dJ/dw = w * mean(X^2) + b * mean(X) - mean(X * Y)
        dJ/db = w * mean(X)   + b           - mean(Y)

    Parameters
    ----------
    X : array-like
        Feature values, one per sample.
    Y : array-like
        Target values, same number of samples as ``X``.
    learningRate : float
        Step size applied to both gradients.
    numIterations : int
        Number of update steps to perform.
    verbose : bool, default True
        When true, print ``w`` and ``b`` at the start of every iteration
        (the original behaviour); pass ``False`` for long runs.

    Returns
    -------
    tuple
        ``(w, b, errorList)`` where ``errorList[i]`` is the cost measured
        before the i-th update.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    # The gradients depend on the data only through these four averages,
    # so compute them once instead of on every iteration.
    meanX = np.mean(X)
    meanY = np.mean(Y)
    meanXX = np.mean(X ** 2)
    meanXY = np.mean(X * Y)

    w, b = 0, 0
    errorList = []  # cost at the start of each iteration
    for i in range(numIterations):
        if verbose:
            print(f'iteration {i} value pf w = {w} and b = {b}')
        errorList.append(computeCost(X, Y, w, b))
        gradW = w * meanXX + b * meanX - meanXY
        gradB = w * meanX + b - meanY
        # Tuple assignment keeps the update simultaneous: both gradients
        # above were evaluated with the old w and b.
        w, b = w - learningRate * gradW, b - learningRate * gradB
    return (w, b, errorList)

In [None]:
#w,b,errorList = gradientDescent (x_train, y_train, 0.001, numIterations=5000)
# Match w and b against the ones obtained in A and B part
# Plot iteration vs error

# multiple regression

In [None]:
# Two predictors for the multiple-regression model.
tempX = df[['ENGINESIZE', 'FUELCONSUMPTION_COMB']]
tempX.shape[0]
# Target kept as a one-column DataFrame.
tempY = df.loc[:, ['CO2EMISSIONS']]
# Z-score each predictor column.
multxScaled = tempX.sub(tempX.mean()).div(tempX.std())
multxScaled

In [None]:
# 80/20 split of the scaled predictors and the target, using the same
# seed as the single-feature split.
xMult_train, xMult_test, yMult_train, yMult_test = train_test_split(
    multxScaled,
    tempY,
    test_size=0.2,
    random_state=101,
)
xMult_train

In [None]:
# Design matrix: a leading column of ones (intercept) followed by the two
# scaled training predictors.
multiX = np.hstack((np.ones((len(xMult_train), 1)), xMult_train))
multiX

In [None]:

# Normal equation for the two-feature model: W = (X^T X)^-1 . X^T Y
# T1 = (X^T X)^-1
T1 = np.linalg.inv(np.dot(multiX.T, multiX))

# T2 = X^T Y
T2 = np.dot(multiX.T, yMult_train)
# Rows of multW: intercept, then one weight per predictor column.
multW = np.dot(T1, T2)
multW

# gradient descent for multivariate regression

In [None]:
# Row count of the module-level X, which was last assigned in the
# single-feature normal-equation cell — not of xMult_train.
# NOTE(review): confirm both have the same number of rows.
n = len(X)
# Show the scaled training predictors for inspection.
print(xMult_train)
def gradientDescent(X, Y, learningRate, numIterations):
    """Fit ``y = w1 * x1 + w2 * x2 + b`` with batch gradient descent.

    Minimises the mean squared error ``J = sum((y_pred - y) ** 2) / m``
    starting from ``w1 = w2 = b = 0``. Each iteration records the current
    cost and then updates all three parameters simultaneously.

    The inputs are converted to plain NumPy arrays up front. Subtracting a
    single-column DataFrame from a Series aligns the Series index against the
    DataFrame's column labels and produces NaNs, so the arithmetic must not
    be done on the pandas objects directly. The sample count is taken from
    ``X`` itself rather than from a module-level variable.

    Parameters
    ----------
    X : DataFrame or array-like of shape (m, 2)
        The two predictor columns.
    Y : DataFrame, Series or array-like holding m values
        The targets; a single-column frame is flattened.
    learningRate : float
        Step size applied to every gradient.
    numIterations : int
        Number of update steps to perform.

    Returns
    -------
    tuple
        ``(w1, w2, b, errorList)`` where ``errorList[i]`` is the cost
        measured before the i-th update.

    Raises
    ------
    ValueError
        If ``X`` does not have exactly two columns, or ``X`` and ``Y``
        disagree on the number of samples.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(Y, dtype=float).reshape(-1)
    if features.ndim != 2 or features.shape[1] != 2:
        raise ValueError("X must have shape (m, 2)")
    m = features.shape[0]
    if targets.shape[0] != m:
        raise ValueError("X and Y must contain the same number of samples")

    x1, x2 = features[:, 0], features[:, 1]
    w1 = w2 = b = 0.0
    errorList = []
    for _ in range(numIterations):
        error = (w1 * x1 + w2 * x2 + b) - targets
        errorList.append(np.sum(error ** 2) / m)
        # Gradients of J; all are evaluated with the old parameters before
        # any of them is updated.
        gradW1 = (2 / m) * np.sum(error * x1)
        gradW2 = (2 / m) * np.sum(error * x2)
        gradB = (2 / m) * np.sum(error)
        w1 -= learningRate * gradW1
        w2 -= learningRate * gradW2
        b -= learningRate * gradB
    return (w1, w2, b, errorList)

In [None]:
# Run two iterations of the two-feature gradient descent on the scaled
# training split.
w1, w2, b, errorList = gradientDescent(
    xMult_train,
    yMult_train,
    learningRate=0.001,
    numIterations=2,
)
# Match w and b against the ones obtained in A and B part
# Plot iteration vs error

In [None]:
# Reset all three parameters to zero (this overwrites any fitted values).
w1 = w2 = b = 0

In [None]:
# Residuals (target - prediction) of the two-feature model as a plain list.
# Use the 1-D target values: subtracting a Series from the single-column
# target DataFrame aligns the Series index against the column labels and
# yields NaNs, and iterating a DataFrame yields column names, not values.
list(
    yMult_train.values.squeeze()
    - (w1 * xMult_train.iloc[:, 0] + w2 * xMult_train.iloc[:, 1] + b)
)

In [None]:
# Residuals (target - prediction) as a Series. Take the single target column
# so the subtraction lines up row by row on the index that yMult_train and
# xMult_train share from the same split. (The original `yMult_train. -` was
# a syntax error.)
yMult_train.iloc[:, 0] - (w1 * xMult_train.iloc[:, 0] + w2 * xMult_train.iloc[:, 1] + b)

In [None]:
# Residuals (target - prediction) using the raw target values. The intercept
# belongs inside the parentheses: it is part of the prediction being
# subtracted, not a term added to the residual.
yMult_train.values.squeeze() - (w1 * xMult_train.iloc[:,0] + w2 * xMult_train.iloc[:,1] + b)