<a href="https://colab.research.google.com/github/Nongbon/Stat-Learning-for-Data-Sci/blob/main/Multinomial_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
import pandas as pd 
import numpy as np 
import scipy as scp
import sklearn
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
!wget https://raw.githubusercontent.com/Nongbon/Stat-Learning-for-Data-Sci/main/Data/abalone.csv

--2021-06-14 18:42:53--  https://raw.githubusercontent.com/Nongbon/Stat-Learning-for-Data-Sci/main/Data/abalone.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191928 (187K) [text/plain]
Saving to: ‘abalone.csv’


2021-06-14 18:42:53 (6.03 MB/s) - ‘abalone.csv’ saved [191928/191928]



In [6]:
# Read the downloaded comma-separated file into a DataFrame and preview the first five rows.
abalone_df = pd.read_csv('abalone.csv', sep=',')
abalone_df.head(n=5)

Unnamed: 0,SEX,LENGTH,DIAM,HEIGHT,WHOLE,SHUCK,VISCERA,SHELL,RINGS
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
#Create training and test datasets
#SEX is the response, so it is removed from the predictors and kept separately as y
y = abalone_df['SEX']
X = abalone_df.drop(columns=['SEX'])

print(X.columns.tolist())

#Hold out 20% of the rows for testing; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

['LENGTH', 'DIAM', 'HEIGHT', 'WHOLE', 'SHUCK', 'VISCERA', 'SHELL', 'RINGS']
(3341, 8)
(836, 8)
(3341,)
(836,)


In [None]:
#Fit an unpenalised multinomial logistic regression for SEX with the newton-cg solver
#NOTE(review): newer scikit-learn releases deprecate penalty='none' and the
#multi_class argument - confirm against the installed version before upgrading
model1 = LogisticRegression(
    random_state=0,
    multi_class='multinomial',
    penalty='none',
    solver='newton-cg',
)
model1.fit(X_train, y_train)
preds = model1.predict(X_test)

#Print the estimator settings. random_state, multi_class, penalty and solver were set
#explicitly above; no hyper-parameter tuning is done in this example
params = model1.get_params()
print(params)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'multinomial', 'n_jobs': None, 'penalty': 'none', 'random_state': 0, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [None]:
#Print the fitted parameters of model1: the intercepts and the coefficient matrix
#(the output below shows one intercept and one coefficient row per SEX class)
print('Intercept: \n', model1.intercept_)
print('Coefficients: \n', model1.coef_)

Intercept: 
 [-1.78721895  1.04125279  0.74596616]
Coefficients: 
 [[-6.14593444  6.60730526  3.92919881  1.87257551 -2.6689075   5.98939578
  -1.506308    0.06142572]
 [12.17943543 -6.83043884 -5.45062605 -4.22198844  2.79759309 -8.49325795
   3.09534916 -0.12236149]
 [-6.03350099  0.22313358  1.52142723  2.34941293 -0.12868559  2.50386217
  -1.58904116  0.06093578]]


In [None]:
#Calculate odds ratio estimates by exponentiating the fitted coefficients
#(numpy is already imported as np in the first cell, so it is not re-imported here)
#NOTE(review): coef_ holds one row per class rather than contrasts against a single
#reference category - confirm the exponentiated values are interpreted accordingly
np.exp(model1.coef_)

array([[2.14217323e-03, 7.40484917e+02, 5.08662080e+01, 6.50502858e+00,
        6.93279247e-02, 3.99173348e+02, 2.21727086e-01, 1.06335151e+00],
       [1.94742885e+05, 1.08038390e-03, 4.29361584e-03, 1.46694462e-02,
        1.64051135e+01, 2.04844798e-04, 2.20949518e+01, 8.84828450e-01],
       [2.39708709e-03, 1.24998753e+00, 4.57875549e+00, 1.04794158e+01,
        8.79250371e-01, 1.22296358e+01, 2.04121237e-01, 1.06283065e+00]])

In [None]:
#Create a confusion matrix
#y_test (true labels) is the first argument and preds (predicted labels) the second,
#so rows are the true classes and columns are the predicted classes.
#labels pins the row/column order to F, I, M so it is guaranteed to match the names
#given to the DataFrame below, even if a class is absent from the test split.
#the matrix is stored in a variable called confmtrx
confmtrx = confusion_matrix(y_test, preds, labels=['F', 'I', 'M'])

#Create DataFrame from confmtrx array
#rows (index): true Female, Infant, Male
#columns: predicted_Female, predicted_Infant, predicted_Male
pd.DataFrame(confmtrx, index=['Female','Infant', 'Male'],
columns=['predicted_Female', 'predicted_Infant', 'predicted_Male'])


Unnamed: 0,predicted_Female,predicted_Infant,predicted_Male
Female,88,44,138
Infant,15,210,35
Male,79,66,161


In [None]:
#Share of test rows whose SEX label was predicted correctly
print('Accuracy Score:', metrics.accuracy_score(y_true=y_test, y_pred=preds))

#Create classification report (precision, recall, f1-score and support per class)
class_report = classification_report(y_true=y_test, y_pred=preds)
print(class_report)

Accuracy Score: 0.5490430622009569
              precision    recall  f1-score   support

           F       0.48      0.33      0.39       270
           I       0.66      0.81      0.72       260
           M       0.48      0.53      0.50       306

    accuracy                           0.55       836
   macro avg       0.54      0.55      0.54       836
weighted avg       0.54      0.55      0.54       836



Assignment: Create an Age variable by grouping the abalones into three classes: fewer than 6 rings (< 7.5 years old) are young,
6 to 13 rings (7.5 to 14.5 years old) are adult, and more than 13 rings (> 14.5 years old) are old.
Then construct a multinomial logistic regression model to classify Age.

In [None]:
#Type your code here

def group(r):
    """Map a ring count to an age-class code.

    Returns 0 when r is below 6, 1 when r is at most 13, and 2 otherwise
    (the young / adult / old classes of the assignment, in that order).
    """
    if r < 6:
        return 0
    if r <= 13:
        return 1
    return 2


#One-hot encode SEX as numeric 0/1 columns. dtype=float is requested explicitly because
#newer pandas releases return boolean dummies by default, which can turn the mixed
#design matrix into object dtype when it is passed to statsmodels further down.
dummies = pd.get_dummies(abalone_df['SEX'], prefix='SEX', dtype=float)

#Predictors: the dummies followed by the measurement columns. SEX is replaced by its
#dummies, SEX_F is dropped as the reference level, and RINGS is dropped because the
#target is derived from it.
#NOTE: this rebinds X, X_train and X_test that the SEX model cells above also use.
X = dummies.join(abalone_df).drop(columns=['SEX', 'SEX_F', 'RINGS'])

#Target: age-class code computed from the ring count by group()
Y = abalone_df['RINGS'].apply(group).rename('YEARS')

#Same 80/20 split settings as for the SEX model
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=5)

In [None]:
#Fit an unpenalised multinomial logistic regression on the age-class target
model = LogisticRegression(
    solver='newton-cg',
    penalty='none',
    multi_class='multinomial',
    random_state=0,
)
model.fit(X_train, Y_train)
Y_preds = model.predict(X_test)

In [None]:
#Confusion matrix for the age classes: rows are the true classes from Y_test,
#columns are the predicted classes from Y_preds.
#labels pins the order to the codes 0, 1, 2 produced by group() so the matrix is always
#3x3 and lines up with the YOUNG/ADULT/OLD names below, even if a class is missing
#from the test split.
conf = confusion_matrix(Y_test, Y_preds, labels=[0, 1, 2])

#Create DataFrame from the conf array
#rows (index): true YOUNG, ADULT, OLD
#columns: predicted_YOUNG, predicted_ADULT, predicted_OLD
pd.DataFrame(conf, index=['YOUNG','ADULT', 'OLD'],
columns=['predicted_YOUNG','predicted_ADULT', 'predicted_OLD'])

Unnamed: 0,predicted_YOUNG,predicted_ADULT,predicted_OLD
YOUNG,24,16,0
ADULT,6,667,19
OLD,0,71,33


In [None]:
#Share of test rows whose age class was predicted correctly
print('Accuracy Score:', metrics.accuracy_score(y_true=Y_test, y_pred=Y_preds))

#Per-class precision, recall, f1-score and support for the age model
class_report = classification_report(y_true=Y_test, y_pred=Y_preds)
print(class_report)

Accuracy Score: 0.8660287081339713
              precision    recall  f1-score   support

           0       0.80      0.60      0.69        40
           1       0.88      0.96      0.92       692
           2       0.63      0.32      0.42       104

    accuracy                           0.87       836
   macro avg       0.77      0.63      0.68       836
weighted avg       0.85      0.87      0.85       836



In [None]:
#Using statsmodels: refit the age-class model to get a full regression summary
#add_constant adds the constant column that appears as `const` in the summary
logit_model = sm.MNLogit(endog=Y_train, exog=sm.add_constant(X_train))
result = logit_model.fit()
stats = result.summary()
print(stats)

Optimization terminated successfully.
         Current function value: 0.298824
         Iterations 14
                          MNLogit Regression Results                          
Dep. Variable:                  YEARS   No. Observations:                 3341
Model:                        MNLogit   Df Residuals:                     3321
Method:                           MLE   Df Model:                           18
Date:                Mon, 08 Mar 2021   Pseudo R-squ.:                  0.4410
Time:                        05:36:41   Log-Likelihood:                -998.37
converged:                       True   LL-Null:                       -1786.1
Covariance Type:            nonrobust   LLR p-value:                     0.000
   YEARS=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.8884      1.475     -3.992      0.000      -8.780      -2.997
SEX_I          0.6324      0

References:
https://www.datasklr.com/logistic-regression/multinomial-logistic-regression