In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline  
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("household_power_consumption.csv")

In [None]:
data.head()

In [None]:
data.columns

### Check Information and datatypes all column

In [None]:
data.info()

In [None]:
data.shape

### Change Date to Day,Month and Year

In [None]:
data['Date'] = pd.to_datetime(data['Date'])
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data.drop(['Date'],axis=1,inplace=True)
data.head()

In [None]:
data["Hour"] = [i.split(":")[0] for i in data['Time']]
data["Minute"] = [i.split(":")[1] for i in data['Time']]
data.drop(['Time'],axis=1,inplace=True)

In [None]:
data.head()

In [None]:
data['Sub_metering_1'] = data['Sub_metering_1'].str.replace('?','0')
data['Sub_metering_2'] = data['Sub_metering_2'].str.replace('?','0')
data['Global_reactive_power'] = data['Global_reactive_power'].str.replace('?','0')
data['Global_active_power'] = data['Global_active_power'].str.replace('?','0')
data['Voltage'] = data['Voltage'].str.replace('?','0')
data['Global_intensity'] = data['Global_intensity'].str.replace('?','0')

### Change datatype of Day,Month,Year,Sub_metering_1 and Sub_metering_2 to Integer.

In [None]:
list = ['Day','Month','Year','Sub_metering_1','Sub_metering_2','Hour','Minute']
for i in list:
    data[i] = data[i].apply(np.int64)

### Change datatype of Global_active_power,Global_reactive_power,Voltage,Global_intensity to Float.

In [None]:
list = ['Global_active_power','Global_reactive_power','Voltage','Global_intensity']
for i in list:
    data[i] = data[i].apply(np.float64)

### Check datatypes

In [None]:
data.dtypes

### Check Null values

In [None]:
data.isnull().sum()

### Fill null values with mean

In [None]:
data['Sub_metering_3'] = data['Sub_metering_3'].fillna(data['Sub_metering_3'].mean())

In [None]:
data["Total_Metering"] = data['Sub_metering_1'] + data['Sub_metering_2'] + data['Sub_metering_3']

In [None]:
data.drop(['Sub_metering_1'],axis=1,inplace=True)
data.drop(['Sub_metering_2'],axis=1,inplace=True)
data.drop(['Sub_metering_3'],axis=1,inplace=True)

### Check Skewness

In [None]:
data.skew()

### Check Kurtosis

In [None]:
data.kurt()

### Numerical Features

In [None]:
num_feat = [num for num in data.columns if data[num].dtypes !='O']
print('We have {} numerical features are {}'.format(len(num_feat),num_feat))

### Check Inter Quartile Range(IQR)

In [None]:
for col in num_feat:
    Q1 = data[col].quantile(.25)
    Q3 = data[col].quantile(.75)
    IQR = Q3 - Q1
    print('IQR of %s : %d' %(col,IQR))

### Univariate Analysis

In [None]:
fig = plt.figure(figsize = (20,15))
plt.suptitle('Univariate Analysis of Numerical Features',fontsize=20,fontweight='bold',y=1.)
for i in range(0,len(num_feat)):
    plt.subplot(5,3,i+1)
    sns.kdeplot(x=data[num_feat[i]],shade='True',color='r')
    plt.xlabel(num_feat[i])
    plt.tight_layout()

## Multivariate Analysis

In [None]:
plt.figure(figsize = (15,10))
sns.heatmap(data.corr(), cmap="CMRmap", annot=True)
plt.show()

In [None]:
fig = plt.figure(figsize=(15, 20))
plt.suptitle('Relation of Numerical Features with Total Metering',fontsize=20,fontweight='bold',y=1.)
for i,col in enumerate(num_feat):
    ax = plt.subplot(6, 2, i+1)
    sns.scatterplot(data=data ,x='Total_Metering', y=col, color='b')
    plt.tight_layout()

In [None]:
plt.figure(figsize=(20,25))
for i,col in enumerate(num_feat):
    plt.subplot(5,5,i+1)
    sns.boxplot(x=data[col])

In [None]:
def getIQR(df,col,condition):
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    IQR = q3 - q1 
    lower_fence = q1 - 1.5*IQR 
    upper_fence = q3 + 1.5*IQR
    if condition == 'Drop':
        df.drop(df.loc[df[col]<lower_fence].index,inplace=True)
        df.drop(df.loc[df[col]>upper_fence].index,inplace=True)
    elif condition == 'Fill':
        df[col] = np.where(df[col] < lower_fence, lower_fence, df[col])
        df[col] = np.where(df[col] > upper_fence, upper_fence, df[col])

In [None]:
lst = ['Global_active_power', 'Global_reactive_power', 'Voltage','Global_intensity', 'Total_Metering']
for i in lst:
    getIQR(data,i,'Fill')

In [None]:
plt.figure(figsize=(22,18))
for i,col in enumerate(data.columns):
    plt.subplot(5,5,i+1)
    sns.boxplot(data[col])

In [None]:
data.info()

In [None]:
import pymongo 
client = pymongo.MongoClient("mongodb+srv://Rushi:Rushi9867@cluster0.ej1dh6s.mongodb.net/?retryWrites=true&w=majority")
db = client.test
db

In [None]:
#db1 = client['Household_Power_Consumption']
#coll = db1['Household_Power_data']
#data_db = data.to_dict("records")
#coll.insert_many(data_db)

In [None]:
data.to_csv('Household_Cleaned.csv',index=False)

In [77]:
df = pd.read_csv("Household_Cleaned.csv")
df.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Day,Month,Year,Hour,Minute,Total_Metering
0,3.426,0.418,234.84,14.2,16,12,2006,17,24,18.0
1,3.426,0.436,233.63,14.2,16,12,2006,17,25,17.0
2,3.426,0.465,233.29,14.2,16,12,2006,17,26,19.0
3,3.426,0.465,233.74,14.2,16,12,2006,17,27,18.0
4,3.426,0.465,235.68,14.2,16,12,2006,17,28,18.0


In [78]:
df_copy = df.sample(50000)

In [79]:
X = df_copy.drop('Total_Metering',axis=1)
y = df_copy['Total_Metering']

In [80]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [81]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()

In [82]:
scaler

In [84]:
X_train = scaler.fit_transform(X_train)

In [85]:
X_train

array([[-0.51993812, -1.10715817,  0.46249546, ...,  1.00652217,
         1.6542884 , -1.4123785 ],
       [ 0.28088857, -0.08749619, -0.73075582, ...,  1.00652217,
        -0.36351425, -0.08577228],
       [ 0.34865082, -1.10715817,  0.98703523, ...,  1.00652217,
         1.6542884 , -0.37416494],
       ...,
       [ 2.43490701, -0.351853  , -0.58044911, ..., -0.84665138,
        -0.50764301,  1.356191  ],
       [-0.90187085,  0.34680429,  0.46863043, ...,  1.00652217,
         0.2130008 , -1.00862878],
       [-0.82384158,  1.29093575,  0.15881454, ...,  1.00652217,
        -1.08415805, -0.37416494]])

In [86]:
X_test = scaler.transform(X_test)
X_test

array([[-0.84026879, -0.31408774,  0.36126849, ...,  1.00652217,
        -1.22828681,  1.06779835],
       [-0.90803105,  0.25239114,  0.90421316, ...,  1.00652217,
         0.9336446 , -0.89327172],
       [ 0.17411167, -1.10715817, -2.48228921, ..., -0.84665138,
        -0.50764301,  0.66404863],
       ...,
       [ 0.49444235, -0.44626614, -0.78597054, ...,  1.00652217,
        -0.36351425,  0.20262038],
       [ 0.34249062, -1.10715817,  0.22323171, ...,  1.00652217,
        -0.36351425, -0.489522  ],
       [ 0.17616508, -1.10715817,  0.22629919, ...,  1.00652217,
        -0.94002929, -1.23934291]])

### Linear Regression

In [87]:
from sklearn.linear_model import LinearRegression

In [88]:
regression = LinearRegression()
regression

In [89]:
regression.fit(X_train,y_train)

In [90]:
print(regression.coef_)

[ 2.97877590e+01  2.62381410e-01 -9.85028476e-01 -2.05555127e+01
  8.70588571e-02 -9.31048754e-03  4.32419307e-01 -8.75254456e-01
 -6.95645284e-02]


In [91]:
print(regression.intercept_)

8.132099706063933


In [92]:
reg_pred = regression.predict(X_test)
reg_pred

array([ 1.64648077, -1.28792444, 11.71477049, ..., 13.57174309,
       12.77074254, 11.42041172])

In [93]:
## residuals 
residuals = y_test - reg_pred
residuals

778224    -1.646481
892850     3.287924
245717     6.285230
691881    -1.401305
607134     6.881680
            ...    
71132    -21.156126
196609    -2.349745
850569     5.428257
683517     6.229257
781184     7.579588
Name: Total_Metering, Length: 16500, dtype: float64

In [102]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.metrics import r2_score
print("MSE: ",mean_squared_error(y_test,reg_pred))
print("MAE: ",mean_absolute_error(y_test,reg_pred)) 
print(np.sqrt(mean_squared_error(y_test,reg_pred)))
score = r2_score(y_test,reg_pred)
print("R Square: ",score)
print("Adjusted R Square: ",1 - (1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))

MSE:  41.415103388187035
MAE:  4.311248975524961
6.4354567350101135
R Square:  0.6837316200284765
Adjusted R Square:  0.683559005388104


### Ridge

In [103]:
from sklearn.linear_model import Ridge
ridge=Ridge()

In [104]:
ridge.fit(X_train,y_train)

In [105]:
ridge.intercept_

8.132099706063933

In [106]:
ridge.coef_

array([ 2.90039893e+01,  2.46880478e-01, -9.77070077e-01, -1.97649118e+01,
        8.73392667e-02, -8.73546383e-03,  4.32502186e-01, -8.75332524e-01,
       -6.95857705e-02])

In [107]:
rid_pred = ridge.predict(X_test)
rid_pred

array([ 1.63424548, -1.29023614, 11.71735175, ..., 13.60141859,
       12.74101789, 11.40507228])

In [108]:
## residuals 
residuals1 = y_test - rid_pred
residuals1

778224    -1.634245
892850     3.290236
245717     6.282648
691881    -1.377240
607134     6.881903
            ...    
71132    -21.113215
196609    -2.390423
850569     5.398581
683517     6.258982
781184     7.594928
Name: Total_Metering, Length: 16500, dtype: float64

In [109]:
print("MSE: ",mean_squared_error(y_test,rid_pred))
print("MAE: ",mean_absolute_error(y_test,rid_pred)) 
print(np.sqrt(mean_squared_error(y_test,rid_pred)))
score = r2_score(y_test,rid_pred)
print("R Square: ",score)
print("Adjusted R Square: ",1 - (1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))

MSE:  41.420863610584675
MAE:  4.310929512709662
6.435904257412837
R Square:  0.683687631819902
Adjusted R Square:  0.6835149931714107


### Lasso

In [110]:
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(X_train,y_train)

In [111]:
las_pred = lasso.predict(X_test)
las_pred 

array([ 1.06519017,  0.4952895 ,  9.59643047, ..., 12.29050636,
       11.01254729,  9.61370019])

In [112]:
## residuals 
residuals2 = y_test - las_pred
residuals2

778224    -1.065190
892850     1.504710
245717     8.403570
691881    -1.032294
607134     8.332847
            ...    
71132    -18.459083
196609    -3.172096
850569     6.709494
683517     7.987453
781184     9.386300
Name: Total_Metering, Length: 16500, dtype: float64

In [113]:
print("MSE: ",mean_squared_error(y_test,las_pred))
print("MAE: ",mean_absolute_error(y_test,las_pred)) 
print(np.sqrt(mean_squared_error(y_test,las_pred)))
score = r2_score(y_test,las_pred)
print("R Square: ",score)
print("Adjusted R Square: ",1 - (1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))

MSE:  44.41111054034701
MAE:  4.5551760150077225
6.664166154917433
R Square:  0.6608524708563654
Adjusted R Square:  0.6606673691121392


### ElasticNet

In [114]:
from sklearn.linear_model import ElasticNet
elastic = ElasticNet(random_state=0)
elastic.fit(X_train,y_train)

In [115]:
ela_pred = elastic.predict(X_test)
ela_pred 

array([ 2.02652354,  1.2804562 , 10.90876596, ..., 12.15428744,
       10.1466105 ,  9.05445012])

In [116]:
## residuals 
residuals3 = y_test - ela_pred
residuals3

778224    -2.026524
892850     0.719544
245717     7.091234
691881    -1.399668
607134     8.698616
            ...    
71132    -15.858037
196609    -4.819348
850569     6.845713
683517     8.853390
781184     9.945550
Name: Total_Metering, Length: 16500, dtype: float64

In [117]:
print("MSE: ",mean_squared_error(y_test,ela_pred))
print("MAE: ",mean_absolute_error(y_test,ela_pred)) 
print("RMSE",np.sqrt(mean_squared_error(y_test,ela_pred)))
score = r2_score(y_test,ela_pred)
print("R Square: ",score)
print("Adjusted R Square: ",1 - (1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))

MSE:  48.4682403758277
MAE:  5.050654980436221
6.961913557049362
R Square:  0.6298700085315798
Adjusted R Square:  0.6296679970141017


### SVR

In [118]:
from sklearn.svm import SVR 
model=SVR()

In [119]:
model.fit(X_train,y_train)

In [121]:
model.score(X_train,y_train)

0.7579120559074217

In [125]:
svr_pred = model.predict(X_test)

In [126]:
print("MSE: ",mean_squared_error(y_test,svr_pred))
print("MAE: ",mean_absolute_error(y_test,svr_pred)) 
print("RMSE",np.sqrt(mean_squared_error(y_test,svr_pred)))
score = r2_score(y_test,svr_pred)
print("R Square: ",score)
print("Adjusted R Square: ",1 - (1-score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))

MSE:  136.8098381546221
MAE:  10.379238683812233
RMSE 11.69657377844564
R Square:  -0.04475474736277718
Adjusted R Square:  -0.0453249591715259


In [127]:
params = {'kernel':['linear','poly','sigmoid','rbf']}

In [129]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator = model,param_grid = params,cv=10,n_jobs = -1)

In [130]:
grid.fit(X_train,y_train)

In [None]:
grid.best_score_

In [None]:
new_svr = grid.best_params_
new_svr

In [None]:
# https://forms.gle/2PJSSCmL9S865nwe8