In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
oil=pd.read_csv('OilSample_test.csv', sep=';')

In [3]:
print(oil.shape)
print(list(oil.columns))

(420, 48)
['serialno', 'sampledate', 'evalcode', 'compartid', 'oiltypeid', 'oilgradeid', 'oilhours', 'machinehours', 'PQI', 'Fe', 'Cu', 'Cr', 'Pb', 'Sn', 'Ni', 'Al', 'Si', 'Na', 'K', 'Mo', 'B', 'Ba', 'Mg', 'Ca', 'Zn', 'P', 'Ag', 'Mn', 'V', 'Ti', 'Cd', 'BO3', 'PO4', 'H2O', 'F', 'V40', 'OXI', 'NIT', 'SUL', 'ISO6', 'ISO14', 'X6', 'X10', 'X14', 'X21', 'X25', 'X38', 'X70']


In [4]:
oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 48 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   serialno      420 non-null    int64  
 1   sampledate    420 non-null    object 
 2   evalcode      420 non-null    object 
 3   compartid     420 non-null    int64  
 4   oiltypeid     420 non-null    object 
 5   oilgradeid    420 non-null    object 
 6   oilhours      420 non-null    int64  
 7   machinehours  420 non-null    int64  
 8   PQI           420 non-null    int64  
 9   Fe            420 non-null    object 
 10  Cu            420 non-null    object 
 11  Cr            420 non-null    object 
 12  Pb            420 non-null    object 
 13  Sn            420 non-null    object 
 14  Ni            420 non-null    object 
 15  Al            420 non-null    object 
 16  Si            420 non-null    object 
 17  Na            420 non-null    object 
 18  K             420 non-null    

In [5]:
oil[['Al', 'Si', 'Na', 'K', 'Mo', 'B', 'Ba', 'Mg', 'Ca', 'Zn']]

Unnamed: 0,Al,Si,Na,K,Mo,B,Ba,Mg,Ca,Zn
0,<1,18,2,<1,<1,6,<1,16,3928,1002
1,1,11,70,2,<1,3,<1,18,3505,1018
2,<1,4,8,<1,51,1,<1,936,1329,1342
3,1,9,22,<1,<1,437,<1,1009,60,17
4,5,20,56,2,2,426,<1,1002,69,54
...,...,...,...,...,...,...,...,...,...,...
415,<1,7,2,<1,<1,430,<1,927,21,8
416,<1,6,1,<1,<1,19,<1,16,4,19
417,<1,7,1,<1,23,111,<1,117,2977,1042
418,<1,12,3,2,55,135,<1,374,1407,1032


In [6]:
oil['Sn'].unique() # only <1 in Sn column


array(['<1'], dtype=object)

In [7]:
oil.isnull().sum()

serialno          0
sampledate        0
evalcode          0
compartid         0
oiltypeid         0
oilgradeid        0
oilhours          0
machinehours      0
PQI               0
Fe                0
Cu                0
Cr                0
Pb                0
Sn                0
Ni                0
Al                0
Si                0
Na                0
K                 0
Mo                0
B                 0
Ba                0
Mg                0
Ca                0
Zn                0
P                 0
Ag                0
Mn                0
V                 0
Ti                0
Cd                0
BO3             420
PO4             420
H2O               0
F               352
V40               0
OXI               0
NIT               0
SUL               0
ISO6             68
ISO14            68
X6               68
X10              68
X14              68
X21              68
X25              68
X38              68
X70              68
dtype: int64

In [10]:
oil_new=oil.drop(columns=['BO3','PO4', 'Sn','F','sampledate'])
# BO3 and PO4 are null for all the rows, F column has way too small number of observations in the total sample

In [20]:
oilsample=oil_new.copy()

In [21]:
oilsample.isnull().sum()

serialno         0
evalcode         0
compartid        0
oiltypeid        0
oilgradeid       0
oilhours         0
machinehours     0
PQI              0
Fe               0
Cu               0
Cr               0
Pb               0
Ni               0
Al               0
Si               0
Na               0
K                0
Mo               0
B                0
Ba               0
Mg               0
Ca               0
Zn               0
P                0
Ag               0
Mn               0
V                0
Ti               0
Cd               0
H2O              0
V40              0
OXI              0
NIT              0
SUL              0
ISO6            68
ISO14           68
X6              68
X10             68
X14             68
X21             68
X25             68
X38             68
X70             68
dtype: int64

In [22]:
pd.set_option("display.max_columns", None)

In [23]:
oilsample[oilsample['X6'].isnull()]



Unnamed: 0,serialno,evalcode,compartid,oiltypeid,oilgradeid,oilhours,machinehours,PQI,Fe,Cu,Cr,Pb,Ni,Al,Si,Na,K,Mo,B,Ba,Mg,Ca,Zn,P,Ag,Mn,V,Ti,Cd,H2O,V40,OXI,NIT,SUL,ISO6,ISO14,X6,X10,X14,X21,X25,X38,X70
2,1,A,118,2-2,15W-40,314,10254,0,25,4,3,6,<1,<1,4,8,<1,51,1,<1,936,1329,1342,1143,<1,<1,<1,<1,<1,0.0,119.9,19,11,23,,,,,,,,,
9,1,A,118,2-2,15W-40,308,9940,3,22,4,2,7,<1,<1,2,14,2,49,1,<1,880,1269,1283,1080,<1,<1,<1,<1,<1,0.0,119.6,19,11,23,,,,,,,,,
19,1,A,118,2-2,15W-40,309,9632,0,21,4,3,7,<1,<1,3,9,<1,49,1,<1,877,1272,1294,1082,<1,<1,<1,<1,<1,0.0,114.0,18,10,23,,,,,,,,,
20,1,A,118,2-2,15W-40,400,9323,4,10,2,1,1,<1,<1,2,5,<1,51,1,<1,880,1202,1274,1106,<1,<1,<1,<1,<1,0.0,116.9,14,8,19,,,,,,,,,
21,1,A,118,2-2,15W-40,303,9226,3,24,4,4,7,<1,<1,<1,9,<1,50,3,<1,911,1304,1356,1080,<1,<1,<1,<1,<1,0.0,114.1,17,10,22,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,6,A,118,2-2,15W-40,303,1812,0,42,2,1,3,<1,<1,3,<1,<1,48,<1,<1,817,1072,1155,990,<1,<1,<1,<1,<1,0.0,95.3,12,7,18,,,,,,,,,
395,6,A,118,2-2,15W-40,423,1509,0,24,2,<1,3,<1,<1,2,<1,1,38,<1,<1,665,960,965,831,<1,<1,<1,<1,<1,0.0,108.2,12,6,17,,,,,,,,,
405,7,B,118,2-2,15W-40,388,477,6,51,4,<1,<1,<1,<1,12,3,<1,48,20,<1,741,1355,1233,1055,<1,<1,<1,<1,<1,0.0,98.5,12,7,16,,,,,,,,,
418,7,A,118,2-2,15W-40,89,89,2,10,11,<1,<1,<1,<1,12,3,2,55,135,<1,374,1407,1032,942,<1,<1,<1,<1,<1,0.0,100.6,12,5,16,,,,,,,,,


In [24]:
oilsample[['ISO6','ISO14','X6','X10','X14','X21','X25','X38','X70']] = oilsample[['ISO6','ISO14','X6','X10','X14','X21','X25','X38','X70']].replace({'-':''})
#We still have '-' values in these columns so we need to convert it to a number that doesnt exist in any of above columns
#then transform data type of all these columns to int

In [25]:
oilsample[['ISO6','ISO14','X6','X10','X14','X21','X25','X38','X70']]=oilsample[['ISO6','ISO14','X6','X10','X14','X21','X25','X38','X70']].replace('',np.nan).astype(float)


In [26]:
#replace nan value to mean value of each column
cat_vars=['H2O','ISO6','ISO14','X6','X10','X14','X21','X25','X38','X70']
for var in cat_vars:
    oilsample[var].fillna((oilsample[var].mean()), inplace=True)
    

In [27]:
oilsample[['Ag', 'Mn','V','Ti','Cd']] = oilsample[['Ag', 'Mn','V','Ti','Cd']].replace({'<1':'0.5'})
# there are <1 values in those columns, so replact with number 0.5. 


In [28]:
cols = ['Ag', 'Mn','V','Ti','Cd']
oilsample[cols] = oilsample[cols].applymap(np.float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  oilsample[cols] = oilsample[cols].applymap(np.float)


In [29]:
oilsample[['Fe', 'Cu','Cr','Pb','Ni','Al','Si','Na','K','Mo','B','Ba','Mg','Ca','Zn']] = oilsample[['Fe', 'Cu','Cr','Pb','Ni','Al','Si','Na','K','Mo','B','Ba','Mg','Ca','Zn']].replace({'<1':'0.5'})
# there are <1 values in those columns, so replact with number 0.5. 


In [30]:
cols = ['Fe', 'Cu','Cr','Pb','Ni','Al','Si','Na','K','Mo','B','Ba','Mg','Ca','Zn']
oilsample[cols] = oilsample[cols].applymap(np.float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  oilsample[cols] = oilsample[cols].applymap(np.float)


In [31]:
oilsample["serialno"] = oilsample["serialno"].astype('category')
oilsample["compartid"] = oilsample["compartid"].astype('category')
oilsample["oiltypeid"] = oilsample["oiltypeid"].astype('category')
oilsample["oilgradeid"] = oilsample["oilgradeid"].astype('category')


In [32]:
oilsample.isnull().sum()

serialno        0
evalcode        0
compartid       0
oiltypeid       0
oilgradeid      0
oilhours        0
machinehours    0
PQI             0
Fe              0
Cu              0
Cr              0
Pb              0
Ni              0
Al              0
Si              0
Na              0
K               0
Mo              0
B               0
Ba              0
Mg              0
Ca              0
Zn              0
P               0
Ag              0
Mn              0
V               0
Ti              0
Cd              0
H2O             0
V40             0
OXI             0
NIT             0
SUL             0
ISO6            0
ISO14           0
X6              0
X10             0
X14             0
X21             0
X25             0
X38             0
X70             0
dtype: int64

In [33]:
oilsample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 43 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   serialno      420 non-null    category
 1   evalcode      420 non-null    object  
 2   compartid     420 non-null    category
 3   oiltypeid     420 non-null    category
 4   oilgradeid    420 non-null    category
 5   oilhours      420 non-null    int64   
 6   machinehours  420 non-null    int64   
 7   PQI           420 non-null    int64   
 8   Fe            420 non-null    float64 
 9   Cu            420 non-null    float64 
 10  Cr            420 non-null    float64 
 11  Pb            420 non-null    float64 
 12  Ni            420 non-null    float64 
 13  Al            420 non-null    float64 
 14  Si            420 non-null    float64 
 15  Na            420 non-null    float64 
 16  K             420 non-null    float64 
 17  Mo            420 non-null    float64 
 18  B         

In [34]:

oilsample.describe()

Unnamed: 0,oilhours,machinehours,PQI,Fe,Cu,Cr,Pb,Ni,Al,Si,Na,K,Mo,B,Ba,Mg,Ca,Zn,P,Ag,Mn,V,Ti,Cd,H2O,V40,OXI,NIT,SUL,ISO6,ISO14,X6,X10,X14,X21,X25,X38,X70
count,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0,420.0
mean,659.42381,7604.064286,34.833333,43.159524,4.182143,0.779762,2.127381,0.896429,1.389286,6.991667,13.590476,1.063095,7.975,111.620238,0.55,280.016667,1049.185714,423.127381,954.195238,0.741667,0.640476,0.50119,0.50119,0.508333,0.01,207.42619,16.228571,5.07619,23.442857,20.37931,17.08046,18057.204023,6650.853448,3306.968391,1304.574713,318.916667,33.37069,1.988506
std,805.440369,2865.313394,85.730837,78.317523,18.446194,0.964004,5.610452,2.06502,7.276602,5.480214,41.197732,1.14579,16.748561,147.606251,0.288813,394.660687,1460.799262,517.322547,262.347362,1.524831,0.59475,0.024398,0.024398,0.12665,0.070761,113.262062,17.081027,1.605154,16.049716,3.341334,3.180244,11478.707821,7240.778323,5120.757503,2703.537482,779.976024,70.78933,6.418565
min,-8084.0,47.0,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,582.0,0.5,0.5,0.5,0.5,0.5,0.0,72.3,4.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,366.0,7671.0,1.0,12.0,0.5,0.5,0.5,0.5,0.5,3.0,1.0,0.5,0.5,4.0,0.5,0.5,12.0,7.0,779.75,0.5,0.5,0.5,0.5,0.5,0.0,100.575,6.0,4.0,16.0,20.37931,17.0,10115.5,2114.25,663.25,204.0,22.0,5.0,0.0
50%,509.0,8581.5,9.0,21.5,0.5,0.5,0.5,0.5,0.5,6.0,3.0,0.5,0.5,81.5,0.5,16.0,86.5,30.0,843.5,0.5,0.5,0.5,0.5,0.5,0.0,114.35,10.0,5.0,16.0,21.0,17.08046,18057.204023,5389.0,1848.0,476.0,116.0,16.0,1.0
75%,871.0,9559.0,25.0,41.0,3.0,0.5,1.0,0.5,1.0,9.0,8.0,1.0,0.5,103.0,0.5,789.0,1269.25,997.5,1006.25,0.5,0.5,0.5,0.5,0.5,0.0,325.25,14.0,5.0,19.0,22.0,18.25,22717.25,6661.5,3306.968391,1304.574713,318.916667,33.37069,1.988506
max,8317.0,11163.0,716.0,764.0,282.0,9.0,67.0,24.0,147.0,44.0,388.0,12.0,58.0,525.0,3.0,1140.0,4514.0,1462.0,1914.0,16.0,6.0,1.0,1.0,3.0,1.0,366.5,68.0,12.0,85.0,23.0,22.0,60372.0,39225.0,31127.0,17848.0,6452.0,581.0,62.0


In [35]:
#Recode evalcode to integer: LinearRegression does not take string

def Classification(evalcode):
    if evalcode == 'A':
        return 1
    elif evalcode == 'B':
        return 2
    elif evalcode == 'C':
        return 3
    elif evalcode == 'X':
        return 4

    
oilsample['ecode']=oilsample['evalcode'].apply(Classification)

In [36]:
oilsample.isnull().sum()

serialno        0
evalcode        0
compartid       0
oiltypeid       0
oilgradeid      0
oilhours        0
machinehours    0
PQI             0
Fe              0
Cu              0
Cr              0
Pb              0
Ni              0
Al              0
Si              0
Na              0
K               0
Mo              0
B               0
Ba              0
Mg              0
Ca              0
Zn              0
P               0
Ag              0
Mn              0
V               0
Ti              0
Cd              0
H2O             0
V40             0
OXI             0
NIT             0
SUL             0
ISO6            0
ISO14           0
X6              0
X10             0
X14             0
X21             0
X25             0
X38             0
X70             0
ecode           0
dtype: int64

In [37]:
oil_copy=oilsample.copy()

In [38]:
serialno = pd.get_dummies(oil_copy['serialno'],drop_first=True,prefix='serialno')
compartid = pd.get_dummies(oil_copy['compartid'],drop_first=True,prefix='compartid')
oiltypeid = pd.get_dummies(oil_copy['oiltypeid'],drop_first=True,prefix='oiltypeid')
oilgradeid = pd.get_dummies(oil_copy['oilgradeid'],drop_first=True,prefix='oilgradeid')

oil_copy2 = oil_copy
oil_copy2.drop(['serialno','compartid','oiltypeid','oilgradeid'],axis=1,inplace=True)
oil_copy2 = pd.concat([oil_copy2,serialno,compartid,oiltypeid,oilgradeid],axis=1)

In [39]:
oil_copy2

Unnamed: 0,evalcode,oilhours,machinehours,PQI,Fe,Cu,Cr,Pb,Ni,Al,Si,Na,K,Mo,B,Ba,Mg,Ca,Zn,P,Ag,Mn,V,Ti,Cd,H2O,V40,OXI,NIT,SUL,ISO6,ISO14,X6,X10,X14,X21,X25,X38,X70,ecode,serialno_2,serialno_3,serialno_4,serialno_5,serialno_6,serialno_7,serialno_8,compartid_118,compartid_146,compartid_172,compartid_317,compartid_326,compartid_364,compartid_422,compartid_423,compartid_424,compartid_425,compartid_432,compartid_433,oiltypeid_1-2,oiltypeid_1-3,oiltypeid_2-1,oiltypeid_2-2,oilgradeid_30,oilgradeid_68,oilgradeid_75W-90,oilgradeid_85W-140
0,B,167,10421,152,47.0,0.5,0.5,0.5,0.5,0.5,18.0,2.0,0.5,0.5,6.0,0.5,16.0,3928.0,1002.0,835,0.5,0.5,0.5,0.5,0.5,0.0,346.0,12,5,17,23.00000,21.00000,40781.000000,27795.000000,19905.000000,9614.000000,741.000000,219.00000,14.000000,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
1,C,789,10421,8,26.0,10.0,0.5,0.5,0.5,1.0,11.0,70.0,2.0,0.5,3.0,0.5,18.0,3505.0,1018.0,847,0.5,0.5,0.5,0.5,0.5,0.3,111.5,11,5,15,23.00000,22.00000,48072.000000,35001.000000,26354.000000,13472.000000,584.000000,112.00000,4.000000,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,A,314,10254,0,25.0,4.0,3.0,6.0,0.5,0.5,4.0,8.0,0.5,51.0,1.0,0.5,936.0,1329.0,1342.0,1143,0.5,0.5,0.5,0.5,0.5,0.0,119.9,19,11,23,20.37931,17.08046,18057.204023,6650.853448,3306.968391,1304.574713,318.916667,33.37069,1.988506,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,A,622,10254,6,24.0,0.5,0.5,0.5,0.5,1.0,9.0,22.0,0.5,0.5,437.0,0.5,1009.0,60.0,17.0,1668,0.5,0.5,0.5,0.5,0.5,0.0,94.8,59,4,61,20.00000,16.00000,7425.000000,1076.000000,349.000000,73.000000,1.000000,0.00000,0.000000,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,A,622,10254,62,552.0,18.0,7.0,0.5,5.0,5.0,20.0,56.0,2.0,2.0,426.0,0.5,1002.0,69.0,54.0,1644,0.5,4.0,0.5,0.5,0.5,0.0,93.6,58,4,60,23.00000,22.00000,54655.000000,34752.000000,22237.000000,7235.000000,30.000000,4.00000,0.000000,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,A,89,89,4,5.0,0.5,0.5,0.5,0.5,0.5,7.0,2.0,0.5,0.5,430.0,0.5,927.0,21.0,8.0,1508,0.5,0.5,0.5,0.5,0.5,0.0,95.5,53,4,63,21.00000,19.00000,14698.000000,5841.000000,3098.000000,1055.000000,72.000000,20.00000,1.000000,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
416,C,89,89,332,40.0,3.0,0.5,0.5,0.5,0.5,6.0,1.0,0.5,0.5,19.0,0.5,16.0,4.0,19.0,951,0.5,0.5,0.5,0.5,0.5,0.0,341.8,5,4,16,23.00000,22.00000,41310.000000,32063.000000,25758.000000,15704.000000,1696.000000,414.00000,21.000000,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
417,A,89,89,1,2.0,9.0,0.5,2.0,2.0,0.5,7.0,1.0,0.5,23.0,111.0,0.5,117.0,2977.0,1042.0,918,0.5,0.5,0.5,0.5,0.5,0.0,108.5,10,4,15,17.00000,15.00000,898.000000,296.000000,172.000000,75.000000,11.000000,2.00000,0.000000,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
418,A,89,89,2,10.0,11.0,0.5,0.5,0.5,0.5,12.0,3.0,2.0,55.0,135.0,0.5,374.0,1407.0,1032.0,942,0.5,0.5,0.5,0.5,0.5,0.0,100.6,12,5,16,20.37931,17.08046,18057.204023,6650.853448,3306.968391,1304.574713,318.916667,33.37069,1.988506,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [42]:
import pandas as pd 
import numpy as np 
import scipy as scp
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

In [43]:
y = oil_copy2['ecode']
X = oil_copy2.drop(['ecode'],axis=1)

In [44]:
#Create training and test datasets
#evalcode was recoded into ecode to change from string to integer
#evalcode needs to be dropped
X = oil_copy2.drop(['ecode', 'evalcode'], axis=1) 
y = oil_copy2['ecode']

print(list(X.columns.values)) 

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.30, random_state = 5)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.30, random_state = 5)


['oilhours', 'machinehours', 'PQI', 'Fe', 'Cu', 'Cr', 'Pb', 'Ni', 'Al', 'Si', 'Na', 'K', 'Mo', 'B', 'Ba', 'Mg', 'Ca', 'Zn', 'P', 'Ag', 'Mn', 'V', 'Ti', 'Cd', 'H2O', 'V40', 'OXI', 'NIT', 'SUL', 'ISO6', 'ISO14', 'X6', 'X10', 'X14', 'X21', 'X25', 'X38', 'X70', 'serialno_2', 'serialno_3', 'serialno_4', 'serialno_5', 'serialno_6', 'serialno_7', 'serialno_8', 'compartid_118', 'compartid_146', 'compartid_172', 'compartid_317', 'compartid_326', 'compartid_364', 'compartid_422', 'compartid_423', 'compartid_424', 'compartid_425', 'compartid_432', 'compartid_433', 'oiltypeid_1-2', 'oiltypeid_1-3', 'oiltypeid_2-1', 'oiltypeid_2-2', 'oilgradeid_30', 'oilgradeid_68', 'oilgradeid_75W-90', 'oilgradeid_85W-140']
(294, 65)
(126, 65)
(294,)
(126,)


In [45]:
model1 = LogisticRegression(random_state=0, multi_class='multinomial', penalty='none', solver='newton-cg').fit(X_train, y_train)
preds = model1.predict(X_test)

params = model1.get_params()
print(params)

#print the tunable parameters (They were not tuned in this example, everything kept as default)


{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'multinomial', 'n_jobs': None, 'penalty': 'none', 'random_state': 0, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}




In [46]:
#Print model parameters
print('Intercept: \n', model1.intercept_)
print('Coefficients: \n', model1.coef_)


Intercept: 
 [ 4.04933684e-02 -5.48599558e-02  1.44208382e-02 -5.42507852e-05]
Coefficients: 
 [[ 4.02451179e-03  4.29493063e-03 -4.21637944e-02 -5.33977614e-03
  -3.73757855e-02 -1.91584209e-01 -4.81127683e-01  9.18033887e-01
   6.85245638e-01 -8.24474688e-02 -9.20225749e-02 -3.30105634e-01
   1.78582073e-03  7.86777787e-03  7.22517705e-02  2.07493865e-02
  -3.03073396e-03  4.34625796e-03  1.38897958e-02 -1.24125907e-01
  -1.32058483e-01 -6.37864185e-02 -6.37864185e-02  2.16306597e-01
  -1.80469814e-01  4.56863930e-02  3.77206388e-01  9.45841844e-02
  -1.98931733e-01  7.03105019e-01 -4.15435040e-01 -2.68547827e-03
   5.23861682e-03 -4.55535456e-03  3.41178568e-03 -4.21512591e-03
  -1.00148828e-02 -3.87530371e-01 -7.64557519e-02 -2.50890377e-01
  -5.92344267e-01 -3.40275118e-01  1.32361150e-01 -8.50068743e-02
   8.56847567e-03  2.15919444e-02 -2.38208369e-02 -3.74950734e-01
   1.37557562e-01  3.26162731e-01 -2.86451560e-01 -1.54270210e-01
  -5.64688532e-02  1.25465365e-01  6.74984621e-

In [47]:
print('Accuracy Score:', metrics.accuracy_score(y_test, preds))  

#Create classification report
class_report=classification_report(y_test, preds)
print(class_report)



Accuracy Score: 0.7063492063492064
              precision    recall  f1-score   support

           1       0.73      0.92      0.81        59
           2       0.82      0.57      0.67        56
           3       0.33      0.30      0.32        10
           4       0.00      0.00      0.00         1

    accuracy                           0.71       126
   macro avg       0.47      0.45      0.45       126
weighted avg       0.73      0.71      0.70       126



In [56]:
result = model1.fit(X, y)




In [61]:
row = X.iloc[0:1, :]
# predict a multinomial probability distribution
yhat = model1.predict_proba(row)
# summarize the predicted probabilities
print('Predicted Probabilities: %s' % yhat[0])

Predicted Probabilities: [1.80170107e-01 7.82730480e-01 3.65879012e-02 5.11511690e-04]


In [63]:
row = X.iloc[0:1, :]
# predict the class label
yhat = model1.predict(row)
# summarize the predicted class
print('Predicted Class: %d' % yhat[0])

Predicted Class: 2


In [65]:
from sklearn.metrics import mean_squared_error
y_pred_test= model1.predict(X_test)
error_metric=mean_squared_error(y_pred=y_pred_test,y_true=y_test)

In [66]:
error_metric

0.19047619047619047

In [67]:
X

Unnamed: 0,oilhours,machinehours,PQI,Fe,Cu,Cr,Pb,Ni,Al,Si,Na,K,Mo,B,Ba,Mg,Ca,Zn,P,Ag,Mn,V,Ti,Cd,H2O,V40,OXI,NIT,SUL,ISO6,ISO14,X6,X10,X14,X21,X25,X38,X70,serialno_2,serialno_3,serialno_4,serialno_5,serialno_6,serialno_7,serialno_8,compartid_118,compartid_146,compartid_172,compartid_317,compartid_326,compartid_364,compartid_422,compartid_423,compartid_424,compartid_425,compartid_432,compartid_433,oiltypeid_1-2,oiltypeid_1-3,oiltypeid_2-1,oiltypeid_2-2,oilgradeid_30,oilgradeid_68,oilgradeid_75W-90,oilgradeid_85W-140
0,167,10421,152,47.0,0.5,0.5,0.5,0.5,0.5,18.0,2.0,0.5,0.5,6.0,0.5,16.0,3928.0,1002.0,835,0.5,0.5,0.5,0.5,0.5,0.0,346.0,12,5,17,23.00000,21.00000,40781.000000,27795.000000,19905.000000,9614.000000,741.000000,219.00000,14.000000,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
1,789,10421,8,26.0,10.0,0.5,0.5,0.5,1.0,11.0,70.0,2.0,0.5,3.0,0.5,18.0,3505.0,1018.0,847,0.5,0.5,0.5,0.5,0.5,0.3,111.5,11,5,15,23.00000,22.00000,48072.000000,35001.000000,26354.000000,13472.000000,584.000000,112.00000,4.000000,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,314,10254,0,25.0,4.0,3.0,6.0,0.5,0.5,4.0,8.0,0.5,51.0,1.0,0.5,936.0,1329.0,1342.0,1143,0.5,0.5,0.5,0.5,0.5,0.0,119.9,19,11,23,20.37931,17.08046,18057.204023,6650.853448,3306.968391,1304.574713,318.916667,33.37069,1.988506,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,622,10254,6,24.0,0.5,0.5,0.5,0.5,1.0,9.0,22.0,0.5,0.5,437.0,0.5,1009.0,60.0,17.0,1668,0.5,0.5,0.5,0.5,0.5,0.0,94.8,59,4,61,20.00000,16.00000,7425.000000,1076.000000,349.000000,73.000000,1.000000,0.00000,0.000000,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,622,10254,62,552.0,18.0,7.0,0.5,5.0,5.0,20.0,56.0,2.0,2.0,426.0,0.5,1002.0,69.0,54.0,1644,0.5,4.0,0.5,0.5,0.5,0.0,93.6,58,4,60,23.00000,22.00000,54655.000000,34752.000000,22237.000000,7235.000000,30.000000,4.00000,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,89,89,4,5.0,0.5,0.5,0.5,0.5,0.5,7.0,2.0,0.5,0.5,430.0,0.5,927.0,21.0,8.0,1508,0.5,0.5,0.5,0.5,0.5,0.0,95.5,53,4,63,21.00000,19.00000,14698.000000,5841.000000,3098.000000,1055.000000,72.000000,20.00000,1.000000,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
416,89,89,332,40.0,3.0,0.5,0.5,0.5,0.5,6.0,1.0,0.5,0.5,19.0,0.5,16.0,4.0,19.0,951,0.5,0.5,0.5,0.5,0.5,0.0,341.8,5,4,16,23.00000,22.00000,41310.000000,32063.000000,25758.000000,15704.000000,1696.000000,414.00000,21.000000,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
417,89,89,1,2.0,9.0,0.5,2.0,2.0,0.5,7.0,1.0,0.5,23.0,111.0,0.5,117.0,2977.0,1042.0,918,0.5,0.5,0.5,0.5,0.5,0.0,108.5,10,4,15,17.00000,15.00000,898.000000,296.000000,172.000000,75.000000,11.000000,2.00000,0.000000,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
418,89,89,2,10.0,11.0,0.5,0.5,0.5,0.5,12.0,3.0,2.0,55.0,135.0,0.5,374.0,1407.0,1032.0,942,0.5,0.5,0.5,0.5,0.5,0.0,100.6,12,5,16,20.37931,17.08046,18057.204023,6650.853448,3306.968391,1304.574713,318.916667,33.37069,1.988506,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
