In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
## train_test_split is provided by sklearn.model_selection; the legacy
## sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report



In [2]:
## Load the abalone dataset from the CSV file in the working directory
df = pd.read_csv("abalone.csv")
## Preview the first 4 rows
df.head(n=4)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10


In [3]:
## Variable selection or feature selection
## Target: the 'Sex' column; features: every remaining column of df.
y=df['Sex']
## Build x with a non-mutating drop so that df keeps all of its columns and
## this cell can be re-run safely (an in-place `del df['Sex']` raises a
## KeyError on the second run because the column is already gone).
x=df.drop('Sex',axis=1)

In [4]:
## splitting the dataset into train & test set (test_size=0.20 -> 20% held out)
## random_state expects an integer seed (or RandomState/None). A boolean False
## is only accepted because bool is an int subclass (False == 0), so the seed
## is written explicitly as 0 to make the intent clear.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=0)
## Sanity check: shape of the training features and of the test labels
print(x_train.shape)
print(y_test.shape)


(3341, 8)
(836,)


In [5]:
## Building the model: logistic regression with the library's default settings
log_reg = LogisticRegression()
## Fit on the training split and keep the fitted estimator as log_model
log_model = log_reg.fit(X=x_train, y=y_train)
## Show the estimator and its hyper-parameters
print(log_model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [6]:
## Prediction with logistic regression model
log_prd = log_model.predict(x_test)
## Report accuracy, confusion matrix and per-class report, in that order
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, log_prd))

0.5574162679425837
[[ 64  46 139]
 [ 12 237  42]
 [ 52  79 165]]
             precision    recall  f1-score   support

          F       0.50      0.26      0.34       249
          I       0.65      0.81      0.73       291
          M       0.48      0.56      0.51       296

avg / total       0.55      0.56      0.54       836



## Grid Search Cross Validation

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
## Hyper-parameter grid searched by GridSearchCV:
##   C        - candidate values from 0.001 up to 10000
##   penalty  - 'l1' or 'l2'
##   max_iter - 50, 100, 150 or 200
params = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    penalty=('l1', 'l2'),
    max_iter=list(range(50, 201, 50)),
)

In [15]:
## Grid search over `params` for the logistic regression estimator, cv=10
grid_log = GridSearchCV(estimator=log_reg, param_grid=params, cv=10)

In [16]:
## Run the grid search on the training split and keep the fitted search object
grid_log_model = grid_log.fit(X=x_train, y=y_train)

In [11]:
## Display the parameter combination selected by the grid search
grid_log_model.best_params_

{'C': 100, 'max_iter': 150, 'penalty': 'l1'}

In [13]:
## Predict the test split with the fitted grid-search model
grid_log_prediction = grid_log_model.predict(X=x_test)

In [14]:
## Report accuracy, confusion matrix and per-class report for the grid-search
## predictions, in that order
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, grid_log_prediction))

0.5550239234449761
[[ 93  45 111]
 [ 20 238  33]
 [ 86  77 133]]
             precision    recall  f1-score   support

          F       0.47      0.37      0.42       249
          I       0.66      0.82      0.73       291
          M       0.48      0.45      0.46       296

avg / total       0.54      0.56      0.54       836

