In [None]:
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import warnings
warnings.filterwarnings('ignore')

Loading the dataset

In [None]:
# Read the abalone data from a CSV file located in the working directory.
df=pd.read_csv('Abalone.csv')
# Display the loaded frame.
df

EDA Process

In [None]:
# Show five randomly selected rows (no seed is set, so the rows differ per run).
df.sample(5)

In [None]:
# Show the last three rows.
df.tail(3)

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Re-wrap df in a DataFrame.
# NOTE(review): df already comes from pd.read_csv, so this looks redundant.
df=pd.DataFrame(data=df)
df

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Heatmap of the boolean missing-value mask (one cell per row/column entry).
sns.heatmap(df.isnull())

In [None]:
# Distribution of Diameter: density-normalised histogram with a KDE overlay.
# histplot is used because seaborn has deprecated distplot.
sns.histplot(df['Diameter'],kde=True,stat='density',kde_kws=dict(cut=3))

In [None]:
# Distribution of Shell weight: density-normalised histogram with a KDE overlay.
# histplot is used because seaborn has deprecated distplot.
sns.histplot(df['Shell weight'],kde=True,stat='density',kde_kws=dict(cut=3))

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# First column, selected by position.
df.iloc[:,0]

In [None]:
# Replace the 'Sex' column with integer codes produced by a LabelEncoder.
lencode=LabelEncoder()
df['Sex']=lencode.fit_transform(df['Sex'])
# Display the frame with the encoded column.
df

In [None]:
# Plot the distribution of every column on its own figure.
# The previous loop called the deprecated distplot repeatedly on the same axes,
# which overlaid all columns (with very different scales) in a single plot.
# displot is figure-level, so each call creates a separate figure.
for column in df.columns:
    sns.displot(df[column],kde=True)

In [None]:
# Distribution of Height: density-normalised histogram with a KDE overlay.
# histplot is used because seaborn has deprecated distplot.
sns.histplot(df['Height'],kde=True,stat='density',kde_kws=dict(cut=3))

In [None]:
# Histogram of Height using 20 bins.
sns.histplot(df['Height'],bins=20)

In [None]:
# Histogram of Rings using 20 bins.
sns.histplot(df['Rings'],bins=20)

In [None]:
# Kernel density estimate of Whole weight.
sns.kdeplot(df['Whole weight'])

In [None]:
# Scatter of Height against Shell weight with a fitted regression line.
sns.regplot(x='Shell weight',y='Height',data=df)

In [None]:
# Scatter of Diameter against Rings with a fitted regression line.
sns.regplot(x='Rings',y='Diameter',data=df)

In [None]:
# Figure-level regression plot of Diameter against Height.
sns.lmplot(x='Height',y='Diameter',data=df)

In [None]:
# Pairwise relationship grid across all columns of df.
sns.pairplot(df)

In [None]:
# Summary statistics per column; the observations that follow refer to this table.
df.describe()

Observations
The mean of the Height column is greater than its median, hence it is right-skewed. The same holds for Whole weight and Shucked weight.
The standard deviation is high for Whole weight and Shucked weight, which means the spread of the data is high.
There is a large gap between the 75th percentile and the maximum in Rings, so a few outliers are present.


In [None]:
# Skewness of each column before any transformation.
df.skew()

In [None]:
# numpy is already imported in the first cell; this import is a repeat.
import numpy as np
# Commented-out alternative (log transform); NOTE(review): NumPy's function is
# np.log, so np.Log would fail if this line were re-enabled.
#df['Height']=np.Log(df['Height'])
# Square-root transform of Height, overwriting the column in place.
# NOTE(review): re-running this cell applies the transform again.
df['Height']=np.sqrt(df['Height'])
# Skewness of each column after transforming Height.
df.skew()

In [None]:
# Square-root transform applied to EVERY column of df.
# NOTE(review): this includes the label-encoded 'Sex' column and the 'Rings'
# column, and 'Height' was already square-rooted in the previous cell, so it is
# transformed twice. Confirm that this is intended; re-running the cell
# transforms the data yet again.
df=np.sqrt(df)
# Skewness of each column after the transform.
df.skew()

In [None]:
# Histogram of the transformed Height column, 20 bins.
sns.histplot(df['Height'],bins=20)

In [None]:
# Detect outliers: compute absolute z-scores for every column of df and print
# the (row, column) index arrays of entries whose |z| exceeds 3.
from scipy.stats import zscore
dfzscore=np.abs(zscore(df))
print(np.where(dfzscore>3))

In [None]:
# Keep only the rows in which every column's absolute z-score is below 3.
inlier_rows=(dfzscore<3).all(axis=1)
dfnew=df[inlier_rows]
print('dfnew.shape=',dfnew.shape)

# Continue the analysis with the filtered frame under the original name.
df=dfnew

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

In [None]:
# Annotated heatmap of the correlation matrix.
sns.heatmap(df.corr(),cmap="Blues",annot=True)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Feature matrix: every column except 'Rings'.
x=df.drop('Rings',axis=1)
x

In [None]:
# (rows, columns) of the feature matrix.
x.shape

In [None]:
# Number of feature columns.
x.shape[1]

In [None]:
# Target vector: the last column of df, selected by position.
# NOTE(review): assumes 'Rings' is the last column - confirm against df.columns.
y=df.iloc[:,-1]
y

In [None]:
def vif_calc(features=None):
    """Print the variance inflation factor (VIF) of every feature column.

    Parameters
    ----------
    features : pandas.DataFrame, optional
        Frame whose columns are evaluated. When omitted, the notebook-level
        feature frame ``x`` is used, which keeps existing ``vif_calc()`` calls
        working unchanged.

    Returns
    -------
    None
        The table (columns ``"VIF Factor"`` and ``"features"``) is printed.
    """
    if features is None:
        # Fall back to the global feature frame, as the original version did.
        features=x
    vif=pd.DataFrame()
    # One VIF per column index of the underlying value array.
    vif["VIF Factor"]=[variance_inflation_factor(features.values,i) for i in range(features.shape[1])]
    vif['features']=features.columns
    print(vif)

In [None]:
# VIF table for the current feature frame x (all columns except 'Rings').
vif_calc()

Data Cleansing

In [None]:
# Drop the 'Length' and 'Shucked weight' columns.
# Reassigning (instead of inplace=True) avoids mutating the filtered frame that
# df refers to, and errors='ignore' makes the cell safe to re-run: a second run
# no longer raises KeyError because the columns are already gone.
df=df.drop(columns=['Length','Shucked weight'],errors='ignore')
df

In [None]:
# Rebuild the feature matrix from the reduced frame: all columns but the last.
# NOTE(review): assumes the target 'Rings' is the last column - confirm.
x=df.iloc[:,:-1]
x

In [None]:
# (rows, columns) of the reduced feature matrix.
x.shape

In [None]:
# Recompute the VIF table on the reduced feature matrix x.
vif_calc()

In [None]:
# Annotated correlation heatmap after dropping columns.
sns.heatmap(df.corr(),annot=True)

In [None]:
# Shape of the target vector.
y.shape

In [None]:
# Standardise the features; x is replaced by the array that fit_transform returns.
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# the next cell, so test-set statistics influence the scaling. Consider fitting
# on the training split only.
scale=StandardScaler()
x=scale.fit_transform(x)
x

In [None]:
# Hold out 30% of the rows for testing; random_state=45 fixes the shuffle.
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.30,random_state=45)

In [None]:
# Shape of the training features.
xtrain.shape

In [None]:
# Shape of the training target.
ytrain.shape

In [None]:
# Shape of the test features.
xtest.shape

In [None]:
# Shape of the test target.
ytest.shape

In [None]:
# Fit an ordinary linear regression on the training split.
lr=LinearRegression()
lr.fit(xtrain,ytrain)
# Fitted coefficients, one per feature column.
lr.coef_

In [None]:
# Take the feature names from the frame that x was sliced from (every column
# except the last) instead of typing them by hand. The hand-written list used
# 'Gender' for the 'Sex' column and would silently go stale if the set of
# dropped columns changed.
features=list(df.columns[:-1])
# Pair each fitted coefficient with its feature name.
list(zip(lr.coef_,features))

In [None]:
# Coefficients as a one-column frame indexed by feature name.
cfdf=pd.DataFrame(data=lr.coef_,index=features)
cfdf

In [None]:
# Fitted intercept of the linear model.
lr.intercept_

In [None]:
# Model score on the training split.
lr.score(xtrain,ytrain)

In [None]:
# Predictions of the linear model for the test split.
pred=lr.predict(xtest)
pred

In [None]:
# Print the test-set predictions as plain text.
print("Predicted values",pred)

In [None]:
# Wrap the predictions in a DataFrame for tabular display.
preddf=pd.DataFrame(data=pred)
preddf

In [None]:
# Model score on the held-out test split.
lr.score(xtest,ytest)

In [None]:
# Error metrics of the linear model on the test split; the MSE is computed once
# and reused for the root-mean-square value.
lr_mae=mean_absolute_error(ytest,pred)
lr_mse=mean_squared_error(ytest,pred)
print('Mean absolute error::',lr_mae)
print('Mean squared error::',lr_mse)
print('Root mean square::',np.sqrt(lr_mse))

In [None]:
# R2 of the linear model's predictions on the test split.
print('R2 score::',r2_score(ytest,pred))

In [None]:
# A single hand-written sample with six values, one per model feature.
# NOTE(review): these look like raw (untransformed, unscaled) measurements - confirm.
t=np.array([2,0.365,0.095,0.5140,0.1010,0.1500])

In [None]:
# Display the sample array.
t

In [None]:
# Shape of the sample before reshaping.
t.shape

In [None]:
# Reshape to a single row (1, n_features), the 2-D layout predict() expects.
t=t.reshape(1,-1)

In [None]:
# The model was fitted on square-root-transformed, standardised features, so the
# raw sample has to go through the same preprocessing before prediction;
# predicting on the raw values gives a meaningless result.
t_model=np.sqrt(np.atleast_2d(t))
# Height (index 2 in the feature order) was square-rooted twice upstream: once
# on its own and once when the whole frame was transformed.
t_model[:,2]=np.sqrt(t_model[:,2])
# The target column was square-rooted together with the features, so the model
# predicts sqrt(Rings); squaring converts the prediction back to Rings.
lr.predict(scale.transform(t_model))**2

# Model Training


Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.svm import SVR

In [None]:
# Decision-tree regressor: fit on the training split, then report train and
# test performance.
# The criterion is left at its default (mean squared error): the 'mse' alias
# was removed in scikit-learn 1.2, while the default works on every version.
# random_state makes the fitted tree reproducible across runs.
dtr=DecisionTreeRegressor(random_state=45)
dtr.fit(xtrain,ytrain)
print('dtr score:',dtr.score(xtrain,ytrain))

dtrpredict=dtr.predict(xtest)
print('dtr r2_score:',r2_score(ytest,dtrpredict))

dtr_mse=mean_squared_error(ytest,dtrpredict)
print('Mean squared error of dtr=',dtr_mse)
print('Root mean square of dtr=',np.sqrt(dtr_mse))

In [None]:
# K-nearest-neighbours regressor: fit on the training split, then report train
# and test performance.
# The estimator must be instantiated (the class itself was assigned before, so
# fit() failed), and the predictions must come from knr and be stored in
# knrpredict, which the metric lines below use.
knr=KNeighborsRegressor()
knr.fit(xtrain,ytrain)
print('knr score:',knr.score(xtrain,ytrain))

knrpredict=knr.predict(xtest)
print('knr r2_score:',r2_score(ytest,knrpredict))

knr_mse=mean_squared_error(ytest,knrpredict)
print('Mean squared error of knr=',knr_mse)
print('Root mean square of knr=',np.sqrt(knr_mse))

In [None]:
# Support-vector regressor with default settings: fit on the training split,
# then report train and test performance. Output labels now name this model
# (they previously said "dtc", copied from the decision-tree cell).
svr1=SVR()
svr1.fit(xtrain,ytrain)
print('svr score:',svr1.score(xtrain,ytrain))

svr1predict=svr1.predict(xtest)
print('svr r2_score:',r2_score(ytest,svr1predict))

svr_mse=mean_squared_error(ytest,svr1predict)
print('Mean squared error of svr=',svr_mse)
print('Root mean square of svr=',np.sqrt(svr_mse))

In [None]:
from sklearn.linear_model import SGDRegressor

# Linear model trained with stochastic gradient descent: fit on the training
# split, then report train and test performance.
# random_state fixes the shuffling so the scores are reproducible, and the
# output labels now name this model (they previously said "dtc").
sgd=SGDRegressor(random_state=45)
sgd.fit(xtrain,ytrain)
print('sgd score:',sgd.score(xtrain,ytrain))

sgd1predict=sgd.predict(xtest)
print('sgd r2_score:',r2_score(ytest,sgd1predict))

sgd_mse=mean_squared_error(ytest,sgd1predict)
print('Mean squared error of sgd=',sgd_mse)
print('Root mean square of sgd=',np.sqrt(sgd_mse))

SAVING THE BEST MODEL

In [None]:
import joblib

# Persist the fitted decision tree, reload it from disk, and predict on the
# test split with the reloaded copy to confirm the round trip works.
# NOTE(review): confirm the decision tree really is the best-scoring model
# before relying on this artefact.
model_path="dtrmodel.obj"
joblib.dump(dtr,model_path)
dtrloadmodel=joblib.load(model_path)
dtrloadmodel.predict(xtest)