# Import required libraries & headbrain dataset

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import warnings
# Kept ahead of the remaining imports so warnings they raise stay suppressed.
warnings.filterwarnings('ignore')
from sklearn import neighbors

from math import sqrt
from sklearn.neighbors import KNeighborsClassifier as KNN
try:
    import joblib                            # standalone package
except ImportError:
    from sklearn.externals import joblib     # legacy location, removed in scikit-learn 0.23

In [None]:
# Load headbrain.csv from the current working directory into a DataFrame.
df_headbrain = pd.read_csv("headbrain.csv")

# Exploratory Data Analysis

In [None]:
# Display summary statistics of the dataset via DataFrame.describe().
df_headbrain.describe()

In [None]:
# Check for null values: info() reports the non-null count and dtype of each column.
# isnull() / notnull() are alternative ways to inspect missing values.
df_headbrain.info()

In [None]:
# Show the data type of every column.
df_headbrain.dtypes

In [None]:
# Compute the skewness of each column.
# NOTE(review): the original note said skewness was "not available" in the
# output below - presumably meaning no notable skew was observed; confirm
# against the actual cell output.
df_headbrain.skew()

In [None]:
# Plot pairwise relationships between all columns; the picture is used to
# decide which ML algorithm to apply.
sns.pairplot(data=df_headbrain)
plt.show()

In [None]:
# Outlier removal: keep only the rows in which every column has an absolute
# z-score below 3; the shape is printed before and after filtering.

from scipy.stats import zscore
z_score = abs(zscore(df_headbrain))
within_limits = (z_score < 3).all(axis=1)   # True for rows with no outlying value
print(df_headbrain.shape)
df_headbrain_final = df_headbrain.loc[within_limits]
print(df_headbrain_final.shape)

In [None]:
# Split the cleaned frame into the input variables and the target.
import numpy as np
target_column = "BrainWeight"
df_x = df_headbrain_final.drop(target_column, axis=1)   # every column except the target
y = df_headbrain_final[[target_column]]                  # double brackets keep y a one-column DataFrame

In [None]:
# Scale the input variables to zero mean and unit variance.
# Ordinary least squares does not strictly need this, but the distance- and
# kernel-based models fitted later (KNN, SVR) are sensitive to feature scale.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling statistics; fit on the training split
# only if an unbiased test estimate is required.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Keep df_x's row labels: outlier removal left gaps in the index, and y still
# carries those labels, so a fresh 0..n-1 index would misalign x and y in any
# label-based operation.
x = pd.DataFrame(sc.fit_transform(df_x), columns=df_x.columns, index=df_x.index)

In [None]:
# Re-check skewness, this time on the scaled input variables.
# Standard scaling is a linear transform and does not change skewness by
# itself; differences from the earlier check come from the removed outlier rows.
x.skew()

# Split the data with train_test_split and compare different regression techniques

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=45,
)

In [None]:
# Try train/test splits for seeds 42..100, fit a linear regression on each,
# and report the seed whose split gives the highest test-set R^2.
# NOTE(review): choosing the seed by test-set score makes the reported R^2
# optimistic; prefer cross-validation for an unbiased estimate.

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model

# Start from -inf / None so that a best seed is always recorded, even when
# every split scores R^2 <= 0 (R^2 is negative for a model that is worse than
# predicting the mean).
max_rscore = float("-inf")
final_r_state = None
for r_state in range(42, 101):
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=r_state, test_size=0.20)
    regrn = linear_model.LinearRegression()
    regrn.fit(x_train, y_train)
    y_pred = regrn.predict(x_test)
    r2_scr = r2_score(y_test, y_pred)
    if r2_scr > max_rscore:
        max_rscore = r2_scr
        final_r_state = r_state

# The loop leaves the split and model of the LAST seed in scope. Re-create the
# best split and refit so that x_train/x_test/y_train/y_test, regrn and y_pred
# all correspond to the seed reported below.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=final_r_state, test_size=0.20)
regrn = linear_model.LinearRegression()
regrn.fit(x_train, y_train)
y_pred = regrn.predict(x_test)
print("max r2 score corresponds to",final_r_state," is ",max_rscore)

In [None]:
# Compare support-vector regressors with different kernels on the current
# train/test split.
from sklearn.svm import SVR
kernellist = ['linear', 'poly', 'rbf']

# SVR expects a 1-D target; y_train / y_test are one-column DataFrames, which
# would otherwise trigger a conversion warning (hidden by the global filter).
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

for i in kernellist:
    sv = SVR(kernel=i)
    sv.fit(x_train, y_train_1d)
    # Report R^2 on both splits: the training score alone says nothing about
    # how the kernel generalises to unseen rows.
    print(i,
          "train R^2:", sv.score(x_train, y_train_1d),
          "test R^2:", sv.score(x_test, y_test_1d))

In [None]:
# Fit a KNN regressor for every neighbour count K = 1..10 and record the RMSE
# on the test set to see which K performs best.
# (Original note from a previous run: K=5 gave the lowest RMSE, 71.27.)

rmse_val = []      # RMSE per K, in order K = 1..10
for K in range(1, 11):
    model = neighbors.KNeighborsRegressor(n_neighbors=K).fit(x_train, y_train)
    pred = model.predict(x_test)                       # predictions for the held-out rows
    error = sqrt(mean_squared_error(y_test, pred))     # root of the mean squared error
    rmse_val.append(error)
    print('RMSE value for k= ' , K , 'is:', error)

In [None]:
# Final KNN model with 5 neighbours.
# The notebook models BrainWeight as a regression target (LinearRegression,
# SVR and KNeighborsRegressor are used on it elsewhere), so a regressor is
# used here; a KNeighborsClassifier would treat every distinct weight as a
# separate class label.

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)

# train model
knn.fit(x_train, y_train)

# Save the Model using pickle or joblib library

In [None]:
# Persist the fitted linear-regression model with joblib, load it back and
# score the reloaded model on the test split.
try:
    import joblib                            # standalone package
except ImportError:
    from sklearn.externals import joblib     # legacy location, removed in scikit-learn 0.23

joblib_file = "joblib_regrn.pkl"             # Save to file in the current working directory
joblib.dump(regrn, joblib_file)
# joblib.load unpickles the file: only load files from a trusted source
# (here it is the file written on the line above).
joblib_regrn = joblib.load(joblib_file)      # Load from file
score = joblib_regrn.score(x_test, y_test)   # R^2 on the test split (a regression score, not accuracy)
print("Test score for linear regression: {0:.2f} %".format(100 * score))
