## Linear Regression with Scikit Learn 


![](https://i.imgur.com/1EzyZvj.png)

In [None]:
# importing required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Global plot styling: seaborn dark grid, a larger default font, wider figures
# and a fully transparent figure background (#RRGGBBAA with alpha 00).
sns.set_style("darkgrid")
matplotlib.rcParams.update({
    "font.size": 14,
    "figure.figsize": (10, 6),
    "figure.facecolor": "#00000000",
})

In [None]:
# Load insurance.csv (expected in the current working directory) into a
# pandas DataFrame and display it.
medical_df = pd.read_csv("insurance.csv")
medical_df

In [None]:
# Print column names, dtypes and non-null counts for the dataset.
medical_df.info()
# Author's observation from this output: the dataset does not contain any null values.

In [None]:
# Summary statistics of the dataset's columns (pandas describes the numeric ones by default).
medical_df.describe()

# Correlation between the columns

In [None]:
# Pairwise correlation between the numeric columns.
# DataFrame.corr() raises on string columns in pandas >= 2.0 (older versions
# silently dropped them), so the numeric columns are selected explicitly; this
# gives the same table on every pandas version.
medical_df.select_dtypes(include="number").corr()
# The categorical columns are not part of this table; they are converted to
# 1s and 0s (manual mapping / one-hot encoding) later in the notebook.
# change the categorical data into 1s and 0s using one-hot encoding




In [None]:
# Restrict to numeric columns before correlating: DataFrame.corr() raises on
# string columns in pandas >= 2.0.
numeric_corr = medical_df.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, cmap="Reds", annot=True)
plt.title("Correlation Matrix");

Input data and target data columns

In [None]:
# The first six columns of the dataset are the model inputs. Work on a copy so
# that columns added or overwritten later do not touch medical_df.
input_cols = medical_df.columns[:6]
inputs_df = medical_df.loc[:, input_cols].copy()
inputs_df

In [None]:
# Every column from position 6 onwards is treated as the prediction target.
target_cols = medical_df.columns[6:]
target_df = medical_df.loc[:, target_cols]
target_df

Splitting the input data columns into numeric and categorical data columns

In [None]:
# Names of the int64/float64 input columns, as a plain Python list.
numeric_only_inputs = inputs_df.select_dtypes(include=["int64", "float64"])
numeric_cols = list(numeric_only_inputs.columns)
numeric_cols

In [None]:
# Only "region" goes through one-hot encoding; the two binary columns
# (smoker, sex) are mapped to 0/1 by hand below.
cat_cols = ["region"]
print(cat_cols)

# smoker column: "no" -> 0, "yes" -> 1 (any other value would map to NaN)
smokervalue = dict(no=0, yes=1)
smoker_numeric = inputs_df["smoker"].map(smokervalue)

print()

# sex column: "female" -> 1, "male" -> 0 (any other value would map to NaN)
sex_values = dict(female=1, male=0)
sex_numeric = inputs_df["sex"].map(sex_values)







In [None]:
# Count the distinct values in each categorical column before encoding it.
inputs_df[cat_cols].nunique()


Encoding the categorical columns into 1s and 0s using One-Hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name. Try the current keyword first and fall back for older installations;
# either way the encoder returns a dense array.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
encoder.fit(inputs_df[cat_cols])

# get_feature_names() was replaced by get_feature_names_out() (added in 1.0,
# old method removed in 1.2).
if hasattr(encoder, "get_feature_names_out"):
    region_cols = list(encoder.get_feature_names_out(cat_cols))
else:
    region_cols = list(encoder.get_feature_names(cat_cols))

# One 0/1 indicator column per category found in cat_cols.
inputs_df[region_cols] = encoder.transform(inputs_df[cat_cols])

# The binary columns were mapped to 0/1 by hand in an earlier cell.
inputs_df["sex_numeric"] = sex_numeric
inputs_df["smoker_numeric"] = smoker_numeric

# All encoded feature columns, derived from the fitted encoder instead of a
# hard-coded list so the names always match the categories actually present.
encoded_cols = region_cols + ["sex_numeric", "smoker_numeric"]
print(encoded_cols)




In [None]:
# Inspect the current minimum and maximum of each numeric column (this cell
# only displays them; the scaling itself happens in a later cell).
inputs_df[numeric_cols].describe().loc[['min', 'max']]

Scaling the numeric columns to the range (0, 1) so that columns with different value ranges are on a comparable scale and do not distort the model's weights

In [None]:
# using MinMaxScaler from sklearn.preprocessing
from sklearn.preprocessing import MinMaxScaler

# Fit on the raw values in medical_df rather than on inputs_df: this cell
# overwrites inputs_df[numeric_cols], so fitting on inputs_df would refit the
# scaler on already-scaled data if the cell were run a second time, leaving an
# identity scaler that is later saved and applied to new raw data.
# medical_df is never modified, so the cell is safe to re-run.
scaler = MinMaxScaler()
scaler.fit(medical_df[numeric_cols])
inputs_df[numeric_cols] = scaler.transform(medical_df[numeric_cols])
inputs_df


All model input columns (scaled numeric and encoded categorical) now lie in the range 0 to 1.

## Splitting the input and target data into test and train sets

In [None]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split reproducible.
from sklearn.model_selection import train_test_split

feature_cols = numeric_cols + encoded_cols
train_inputs, test_inputs, train_targets, test_targets = train_test_split(
    inputs_df[feature_cols],
    target_df,
    test_size=0.25,
    random_state=42,
)


In [None]:
# Display the training feature matrix (numeric + encoded columns).
train_inputs 

In [None]:
# Display the training targets that correspond to train_inputs.
train_targets

Create and train the model

In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(train_inputs,train_targets)


In [None]:
# Predict on the rows the model was trained on and show the first ten predictions.
predictions = model.predict(train_inputs)
predictions[0:10]

In [None]:
# First ten actual targets, for a side-by-side look at the predictions above.
train_targets[0:10]

Computing the loss using the root mean squared error (RMSE)

In [None]:
# Root Mean Squared Error
def rmse(target_data, predicted_data):
    """Return the root mean squared error between targets and predictions.

    Both arguments may be anything NumPy can convert to a float array (lists,
    ndarrays, pandas Series/DataFrames). They are compared positionally; any
    pandas index or column labels are ignored.

    Parameters
    ----------
    target_data : array-like
        The true values.
    predicted_data : array-like
        The predicted values.

    Returns
    -------
    float
        sqrt(mean((target - predicted) ** 2)) over all elements.
    """
    targets = np.asarray(target_data, dtype=float)
    predicted = np.asarray(predicted_data, dtype=float)
    # Same number of elements but different shapes, e.g. (n,) vs (n, 1), would
    # broadcast to an (n, n) matrix and silently give a wrong error; flatten
    # both so they are compared element by element. Other shape combinations
    # (e.g. an array against a scalar) keep normal NumPy broadcasting.
    if targets.shape != predicted.shape and targets.size == predicted.size:
        targets = targets.ravel()
        predicted = predicted.ravel()
    # Working on plain arrays and returning a float keeps the result a scalar
    # regardless of how the installed pandas reduces a DataFrame in np.mean.
    return float(np.sqrt(np.mean(np.square(targets - predicted))))

In [None]:
# RMSE of the model on the training split.
loss = rmse(train_targets,predictions)
print(loss)

In [None]:
# Fitted coefficients, one per input feature.
model.coef_

In [None]:
# Fitted intercept (bias) term.
model.intercept_

In [None]:
# Table of fitted weights: one row per input feature, followed by a final row
# labelled 1 that holds the intercept.
feature_labels = list(train_inputs.columns) + [1]
fitted_weights = np.concatenate([np.ravel(model.coef_), np.ravel(model.intercept_)])
weights_df = pd.DataFrame({"features": feature_labels, "weights": fitted_weights})
weights_df

Predicting using the test set

In [None]:
# Predict on the held-out test split and show the first ten predictions.
test_predictions = model.predict(test_inputs)
test_predictions[0:10]

In [None]:
# RMSE on the held-out test split, for comparison with the training loss.
loss1 = rmse(test_targets,test_predictions)
loss1

In [None]:
import joblib

In [None]:
# Bundle the fitted model together with everything needed to preprocess new
# rows, so predictions can be made later without retraining.
insurance_predictor = {
    'model': model,
    'scaler': scaler,
    'encoder': encoder,
    'input_cols': input_cols,
    'target_cols': target_cols,
    'numeric_cols': numeric_cols,
    'cat_cols': cat_cols,
    'encoded_cols': encoded_cols,
    # The hand-written sex/smoker mappings are part of the preprocessing too:
    # without them a loaded bundle cannot rebuild the sex_numeric and
    # smoker_numeric feature columns from raw data.
    'sex_values': sex_values,
    'smoker_values': smokervalue,
}

In [None]:
# Persist the bundle to insurance_predictor.joblib in the working directory.
joblib.dump(insurance_predictor,"insurance_predictor.joblib")

Predicting using new data

In [None]:

# NOTE(review): this row has 11 values, whereas the model is fitted on
# len(numeric_cols + encoded_cols) features, and it is not referenced by the
# later cells shown in this notebook. Confirm the intended layout or remove it.
new_customers = [[28,30,2,1,0,0,1,0,0,0,0]]
# Scale the raw numeric values of the new customer (presumably age, bmi,
# children; confirm against numeric_cols) with the fitted scaler. Passing a
# DataFrame with the training column names instead of a bare list avoids
# scikit-learn's "X does not have valid feature names" warning.
scaler.transform(pd.DataFrame([[28, 30, 2]], columns=numeric_cols))




In [None]:
# Predict for one new customer. The values follow the training column order
# (numeric_cols + encoded_cols): three scaled numeric values, then the encoded
# 0/1 columns. Passing a named DataFrame instead of a bare list makes that
# order explicit and avoids scikit-learn's feature-name warning.
model.predict(pd.DataFrame(
    [[0.2173913, 0.37772397, 0.4, 1, 0, 0, 0, 1, 0]],
    columns=numeric_cols + encoded_cols,
))


## The saved model can be loaded with joblib and used without retraining

In [None]:
import joblib

In [None]:
# Load the saved bundle back from disk.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
new = joblib.load("insurance_predictor.joblib")

In [None]:
# Pull the fitted model out of the loaded bundle.
k=new["model"]

In [None]:
# Predict with the reloaded model. Column names come from the loaded bundle
# (numeric columns first, then the encoded ones), so this cell does not rely
# on variables from the training session. A named DataFrame also avoids
# scikit-learn's feature-name warning that a bare list triggers.
k.predict(pd.DataFrame(
    [[0.2173913, 0.37772397, 0.4, 1, 0, 0, 0, 1, 0]],
    columns=list(new["numeric_cols"]) + list(new["encoded_cols"]),
))