In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


### Read the datasets

In [2]:
import pandas
ha_df = pandas.read_csv("/content/drive/My Drive/datasets/ha.csv")
hb_df = pandas.read_csv("/content/drive/My Drive/datasets/hb.csv")
hc_df = pandas.read_csv("/content/drive/My Drive/datasets/hc.csv")
hd_df = pandas.read_csv("/content/drive/My Drive/datasets/hd.csv")

In [3]:
ha_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1923 entries, 0 to 1922
Columns: 145 entries, AGE to FETUS1MECONIUMASPIRATIONSYNDROME
dtypes: float64(13), int64(8), object(124)
memory usage: 2.1+ MB


In [4]:
hb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893 entries, 0 to 892
Columns: 145 entries, AGE to FETUS1MECONIUMASPIRATIONSYNDROME
dtypes: float64(38), int64(9), object(98)
memory usage: 1011.7+ KB


In [5]:
hc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2458 entries, 0 to 2457
Columns: 145 entries, AGE to FETUS1MECONIUMASPIRATIONSYNDROME
dtypes: float64(22), int64(9), object(114)
memory usage: 2.7+ MB


In [6]:
hd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Columns: 145 entries, AGE to FETUS1MECONIUMASPIRATIONSYNDROME
dtypes: float64(42), int64(13), object(90)
memory usage: 1000.4+ KB


### Concatenating the datasets

In [7]:
c_df = pandas.concat([ha_df, hb_df, hc_df, hd_df], ignore_index=True)
c_df

Unnamed: 0,AGE,COUPLESITUATION,HEIGHT,WEIGHT,BMI,COUNTRYOFORIGYN,MATERNALEDUCATION,STARTANTENATALCARRE,SUBSTANCEABUSE,SMOKING,...,HEMATOMALDRAINAGE,INTRAPARTUMpHvalue,sexfetus1,weightfetus1,apgarfetus1,apgarfetus1fivemin,pHvaluefetus1umbilicalarthery,FETUS1ADMISSIONICU,FETUS1RECOVERY,FETUS1MECONIUMASPIRATIONSYNDROME
0,30,withcouple,1.58,70.0,28.04,CHILE,secondary,1ºtrimester,f,f,...,,,Masculino,2640,7,9,7.34,f,1:Aspiraciónnasofaríngea,f
1,38,withcouple,1.61,79.0,30.48,ESPAÑA,secondary,1ºtrimester,f,f,...,,,Femenino,3040,8,9,7.26,f,0:noprecisa,f
2,25,withcouple,1.56,72.0,29.59,COLOMBIA,secondary,1ºtrimester,f,f,...,,,Femenino,3820,10,10,7.21,f,0:noprecisa,f
3,31,withcouple,1.62,54.0,20.58,ESPAÑA,secondary,1ºtrimester,f,f,...,,,Femenino,3390,9,10,7.21,f,0:noprecisa,f
4,28,withcouple,1.47,51.0,23.60,ESPAÑA,secondary,1ºtrimester,f,f,...,,,Masculino,3020,8,9,7.17,f,0:noprecisa,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6152,35,withcouple,1.56,49.0,20.00,ESPAÑA,secondary,1ºtrimester,f,f,...,1960,10,10,7.27,f,,f,,,
6153,29,withcouple,1.64,59.0,21.51,ESPAÑA,secondary,1ºtrimester,f,f,...,3310,9,10,DESCONOCIDO,f,,f,,,
6154,39,withcouple,1.59,64.0,20.00,ESPAÑA,secondary,1ºtrimester,f,f,...,3830,8,9,DESCONOCIDO,f,,f,,,
6155,41,withcouple,1.48,50.0,23.28,ESPAÑA,primary,1ºtrimester,f,f,...,2810,10,10,DESCONOCIDO,f,,f,,,


### Information of the concatenated dataset

In [8]:
c_df.info(145)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6157 entries, 0 to 6156
Data columns (total 145 columns):
 #    Column                                  Dtype  
---   ------                                  -----  
 0    AGE                                     int64  
 1    COUPLESITUATION                         object 
 2    HEIGHT                                  float64
 3    WEIGHT                                  float64
 4    BMI                                     float64
 5    COUNTRYOFORIGYN                         object 
 6    MATERNALEDUCATION                       object 
 7    STARTANTENATALCARRE                     object 
 8    SUBSTANCEABUSE                          object 
 9    SMOKING                                 object 
 10   CIGARRETTESPERDAY                       float64
 11   ALCOHOL                                 object 
 12   DAILYALCOHOLINTAKE                      object 
 13   KGINCREASEDPREGNANCY                    int64  
 14   ART                   

### Checking missing values

In [9]:
c_df.isnull().sum()

AGE                                    0
COUPLESITUATION                        0
HEIGHT                                 0
WEIGHT                                 0
BMI                                    0
                                    ... 
apgarfetus1fivemin                  4219
pHvaluefetus1umbilicalarthery          0
FETUS1ADMISSIONICU                  4238
FETUS1RECOVERY                      4235
FETUS1MECONIUMASPIRATIONSYNDROME    4234
Length: 145, dtype: int64

### Class labels

In [10]:
c_df.TYPEOFBIRTH.unique()

array(['emergencyc-section', 'CESprogrammed', 'EUT', 'vacum', 'FORC',
       'ESP', 'NALGASVAGINAL'], dtype=object)

In [11]:
a = c_df['TYPEOFBIRTH'].values.tolist()
s = set(a)
for x in s:
  print(x,":",a.count(x))

emergencyc-section : 692
CESprogrammed : 325
FORC : 87
EUT : 4231
ESP : 44
NALGASVAGINAL : 8
vacum : 770


### Preprocessing

### Dataset after feature extraction

In [12]:
new_df = c_df[['PREVIOUSCESAREAN','COMPLICATIONS','ROBSONGROUP','ARTMODE','PREVIOUSPRETERMPREGNANCIES','AMNIOCENTESIS','PREINDUCTION','INDUCTION',
        'PARITY','OBSTETRICRISK','COMORBIDITY','NUMBEROFPREVCESAREAN','KGINCREASEDPREGNANCY','STARTANTENATALCARRE',
        'ART','PREVIOUSTERMPREGNANCIES','AMNIOTICLIQUID','MISCARRIAGES','ANESTHESIA','EPISIOTOMY','OXYTOCIN','FetalINTRAPARTUMpH',
        'GESTAGIONALAGE','HEIGHT','WEIGHT','BMI','AGE','CARDIOTOCOGRAPHY','MATERNALEDUCATION','SUBSTANCEABUSE','SMOKING','ALCOHOL','TYPEOFBIRTH']]

### Checking duplicates

In [13]:
new_df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
6152    False
6153    False
6154    False
6155    False
6156    False
Length: 6157, dtype: bool

In [14]:
new_df.duplicated().sum()

17

### Drop duplicate records

In [15]:
new_df1 = new_df.drop_duplicates()
new_df1

Unnamed: 0,PREVIOUSCESAREAN,COMPLICATIONS,ROBSONGROUP,ARTMODE,PREVIOUSPRETERMPREGNANCIES,AMNIOCENTESIS,PREINDUCTION,INDUCTION,PARITY,OBSTETRICRISK,...,HEIGHT,WEIGHT,BMI,AGE,CARDIOTOCOGRAPHY,MATERNALEDUCATION,SUBSTANCEABUSE,SMOKING,ALCOHOL,TYPEOFBIRTH
0,f,f,group2a,,0,f,t,t,0,t,...,1.58,70.0,28.04,30,continuous,secondary,f,f,f,emergencyc-section
1,f,f,group6,,0,f,f,f,10,f,...,1.61,79.0,30.48,38,continuous,secondary,f,f,f,CESprogrammed
2,f,f,group1,,0,f,f,f,10,f,...,1.56,72.0,29.59,25,continuous,secondary,f,f,f,EUT
3,t,f,group5,,0,f,f,f,1001,t,...,1.62,54.0,20.58,31,continuous,secondary,f,f,f,EUT
4,f,f,group1,FIV,0,f,f,f,10,f,...,1.47,51.0,23.60,28,continuous,secondary,f,f,f,EUT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6152,f,f,group8,,0,t,f,f,0,t,...,1.56,49.0,20.00,35,continuous,secondary,f,f,f,EUT
6153,f,f,group3,,0,f,f,f,1001,f,...,1.64,59.0,21.51,29,continuous,secondary,f,f,f,EUT
6154,f,f,group1,IAD,0,f,f,f,20,f,...,1.59,64.0,20.00,39,discontinuous,secondary,f,f,t,EUT
6155,f,f,group2a,FIV,0,f,f,t,1020,t,...,1.48,50.0,23.28,41,continuous,primary,f,f,f,emergencyc-section


In [16]:
import numpy as np
new_df1 = new_df1.replace(r'^\s*$', np.nan, regex=True)

### Handling missing values of categorical attributes by mode

In [17]:
s = (new_df1.dtypes == 'object')
#print(s)
#print(s[s])
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)
print(len(object_cols))

Categorical variables:
['PREVIOUSCESAREAN', 'COMPLICATIONS', 'ROBSONGROUP', 'ARTMODE', 'AMNIOCENTESIS', 'PREINDUCTION', 'INDUCTION', 'OBSTETRICRISK', 'COMORBIDITY', 'STARTANTENATALCARRE', 'ART', 'AMNIOTICLIQUID', 'ANESTHESIA', 'EPISIOTOMY', 'OXYTOCIN', 'FetalINTRAPARTUMpH', 'CARDIOTOCOGRAPHY', 'MATERNALEDUCATION', 'SUBSTANCEABUSE', 'SMOKING', 'ALCOHOL', 'TYPEOFBIRTH']
22


In [18]:
for i in object_cols:
  x = new_df1[i].mode()[0]
  new_df1[i].fillna(x,inplace = True)
new_df1

Unnamed: 0,PREVIOUSCESAREAN,COMPLICATIONS,ROBSONGROUP,ARTMODE,PREVIOUSPRETERMPREGNANCIES,AMNIOCENTESIS,PREINDUCTION,INDUCTION,PARITY,OBSTETRICRISK,...,HEIGHT,WEIGHT,BMI,AGE,CARDIOTOCOGRAPHY,MATERNALEDUCATION,SUBSTANCEABUSE,SMOKING,ALCOHOL,TYPEOFBIRTH
0,f,f,group2a,FIV,0,f,t,t,0,t,...,1.58,70.0,28.04,30,continuous,secondary,f,f,f,emergencyc-section
1,f,f,group6,FIV,0,f,f,f,10,f,...,1.61,79.0,30.48,38,continuous,secondary,f,f,f,CESprogrammed
2,f,f,group1,FIV,0,f,f,f,10,f,...,1.56,72.0,29.59,25,continuous,secondary,f,f,f,EUT
3,t,f,group5,FIV,0,f,f,f,1001,t,...,1.62,54.0,20.58,31,continuous,secondary,f,f,f,EUT
4,f,f,group1,FIV,0,f,f,f,10,f,...,1.47,51.0,23.60,28,continuous,secondary,f,f,f,EUT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6152,f,f,group8,FIV,0,t,f,f,0,t,...,1.56,49.0,20.00,35,continuous,secondary,f,f,f,EUT
6153,f,f,group3,FIV,0,f,f,f,1001,f,...,1.64,59.0,21.51,29,continuous,secondary,f,f,f,EUT
6154,f,f,group1,IAD,0,f,f,f,20,f,...,1.59,64.0,20.00,39,discontinuous,secondary,f,f,t,EUT
6155,f,f,group2a,FIV,0,f,f,t,1020,t,...,1.48,50.0,23.28,41,continuous,primary,f,f,f,emergencyc-section


### Handling missing values of Numerical attributes using mean

In [19]:
s = (new_df1.dtypes != 'object')
number_cols = list(s[s].index)
print("Numerical variables:")
print(number_cols)
print(len(number_cols))

Numerical variables:
['PREVIOUSPRETERMPREGNANCIES', 'PARITY', 'NUMBEROFPREVCESAREAN', 'KGINCREASEDPREGNANCY', 'PREVIOUSTERMPREGNANCIES', 'MISCARRIAGES', 'GESTAGIONALAGE', 'HEIGHT', 'WEIGHT', 'BMI', 'AGE']
11


In [20]:
for i in number_cols:
  x = new_df1[i].mean()
  new_df1[i].fillna(x,inplace = True)
new_df1

Unnamed: 0,PREVIOUSCESAREAN,COMPLICATIONS,ROBSONGROUP,ARTMODE,PREVIOUSPRETERMPREGNANCIES,AMNIOCENTESIS,PREINDUCTION,INDUCTION,PARITY,OBSTETRICRISK,...,HEIGHT,WEIGHT,BMI,AGE,CARDIOTOCOGRAPHY,MATERNALEDUCATION,SUBSTANCEABUSE,SMOKING,ALCOHOL,TYPEOFBIRTH
0,f,f,group2a,FIV,0,f,t,t,0,t,...,1.58,70.0,28.04,30,continuous,secondary,f,f,f,emergencyc-section
1,f,f,group6,FIV,0,f,f,f,10,f,...,1.61,79.0,30.48,38,continuous,secondary,f,f,f,CESprogrammed
2,f,f,group1,FIV,0,f,f,f,10,f,...,1.56,72.0,29.59,25,continuous,secondary,f,f,f,EUT
3,t,f,group5,FIV,0,f,f,f,1001,t,...,1.62,54.0,20.58,31,continuous,secondary,f,f,f,EUT
4,f,f,group1,FIV,0,f,f,f,10,f,...,1.47,51.0,23.60,28,continuous,secondary,f,f,f,EUT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6152,f,f,group8,FIV,0,t,f,f,0,t,...,1.56,49.0,20.00,35,continuous,secondary,f,f,f,EUT
6153,f,f,group3,FIV,0,f,f,f,1001,f,...,1.64,59.0,21.51,29,continuous,secondary,f,f,f,EUT
6154,f,f,group1,IAD,0,f,f,f,20,f,...,1.59,64.0,20.00,39,discontinuous,secondary,f,f,t,EUT
6155,f,f,group2a,FIV,0,f,f,t,1020,t,...,1.48,50.0,23.28,41,continuous,primary,f,f,f,emergencyc-section


In [21]:
new_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6140 entries, 0 to 6156
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PREVIOUSCESAREAN            6140 non-null   object 
 1   COMPLICATIONS               6140 non-null   object 
 2   ROBSONGROUP                 6140 non-null   object 
 3   ARTMODE                     6140 non-null   object 
 4   PREVIOUSPRETERMPREGNANCIES  6140 non-null   int64  
 5   AMNIOCENTESIS               6140 non-null   object 
 6   PREINDUCTION                6140 non-null   object 
 7   INDUCTION                   6140 non-null   object 
 8   PARITY                      6140 non-null   int64  
 9   OBSTETRICRISK               6140 non-null   object 
 10  COMORBIDITY                 6140 non-null   object 
 11  NUMBEROFPREVCESAREAN        6140 non-null   float64
 12  KGINCREASEDPREGNANCY        6140 non-null   int64  
 13  STARTANTENATALCARRE         6140 

### Checking NULL values

In [22]:
new_df1.isnull().sum()

PREVIOUSCESAREAN              0
COMPLICATIONS                 0
ROBSONGROUP                   0
ARTMODE                       0
PREVIOUSPRETERMPREGNANCIES    0
AMNIOCENTESIS                 0
PREINDUCTION                  0
INDUCTION                     0
PARITY                        0
OBSTETRICRISK                 0
COMORBIDITY                   0
NUMBEROFPREVCESAREAN          0
KGINCREASEDPREGNANCY          0
STARTANTENATALCARRE           0
ART                           0
PREVIOUSTERMPREGNANCIES       0
AMNIOTICLIQUID                0
MISCARRIAGES                  0
ANESTHESIA                    0
EPISIOTOMY                    0
OXYTOCIN                      0
FetalINTRAPARTUMpH            0
GESTAGIONALAGE                0
HEIGHT                        0
WEIGHT                        0
BMI                           0
AGE                           0
CARDIOTOCOGRAPHY              0
MATERNALEDUCATION             0
SUBSTANCEABUSE                0
SMOKING                       0
ALCOHOL 

In [23]:
new_df1.TYPEOFBIRTH.unique()

array(['emergencyc-section', 'CESprogrammed', 'EUT', 'vacum', 'FORC',
       'ESP', 'NALGASVAGINAL'], dtype=object)

In [24]:
a = new_df1['TYPEOFBIRTH'].values.tolist()
s = set(a)
for x in s:
  print(x,":",a.count(x))

emergencyc-section : 689
CESprogrammed : 325
FORC : 87
EUT : 4222
ESP : 44
NALGASVAGINAL : 8
vacum : 765


In [25]:
new_df1

Unnamed: 0,PREVIOUSCESAREAN,COMPLICATIONS,ROBSONGROUP,ARTMODE,PREVIOUSPRETERMPREGNANCIES,AMNIOCENTESIS,PREINDUCTION,INDUCTION,PARITY,OBSTETRICRISK,...,HEIGHT,WEIGHT,BMI,AGE,CARDIOTOCOGRAPHY,MATERNALEDUCATION,SUBSTANCEABUSE,SMOKING,ALCOHOL,TYPEOFBIRTH
0,f,f,group2a,FIV,0,f,t,t,0,t,...,1.58,70.0,28.04,30,continuous,secondary,f,f,f,emergencyc-section
1,f,f,group6,FIV,0,f,f,f,10,f,...,1.61,79.0,30.48,38,continuous,secondary,f,f,f,CESprogrammed
2,f,f,group1,FIV,0,f,f,f,10,f,...,1.56,72.0,29.59,25,continuous,secondary,f,f,f,EUT
3,t,f,group5,FIV,0,f,f,f,1001,t,...,1.62,54.0,20.58,31,continuous,secondary,f,f,f,EUT
4,f,f,group1,FIV,0,f,f,f,10,f,...,1.47,51.0,23.60,28,continuous,secondary,f,f,f,EUT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6152,f,f,group8,FIV,0,t,f,f,0,t,...,1.56,49.0,20.00,35,continuous,secondary,f,f,f,EUT
6153,f,f,group3,FIV,0,f,f,f,1001,f,...,1.64,59.0,21.51,29,continuous,secondary,f,f,f,EUT
6154,f,f,group1,IAD,0,f,f,f,20,f,...,1.59,64.0,20.00,39,discontinuous,secondary,f,f,t,EUT
6155,f,f,group2a,FIV,0,f,f,t,1020,t,...,1.48,50.0,23.28,41,continuous,primary,f,f,f,emergencyc-section


In [26]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

#new_df1.Publisher = new_df1.Publisher.astype(str)

for column in new_df1.columns:
    temp_new = le.fit_transform(new_df1[column].astype('category'))
    new_df1.drop(labels=[column], axis="columns", inplace=True)
    new_df1[column] = temp_new

X = new_df1.loc[:,'PREVIOUSCESAREAN':'ALCOHOL']
Y = new_df1.loc[:,'TYPEOFBIRTH':]

In [27]:
from imblearn.over_sampling import ADASYN 
sm = ADASYN(sampling_strategy='auto', random_state=None, n_neighbors=5, n_jobs=1)
X_adassin_1, Y_adassin_1 = sm.fit_resample(X, Y)
print(X_adassin_1.shape)
print(Y_adassin_1.shape)

(29470, 32)
(29470, 1)


In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_adassin_1,Y_adassin_1,test_size=0.2,random_state=None)

In [29]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23576 entries, 17533 to 7293
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   PREVIOUSCESAREAN            23576 non-null  int64
 1   COMPLICATIONS               23576 non-null  int64
 2   ROBSONGROUP                 23576 non-null  int64
 3   ARTMODE                     23576 non-null  int64
 4   PREVIOUSPRETERMPREGNANCIES  23576 non-null  int64
 5   AMNIOCENTESIS               23576 non-null  int64
 6   PREINDUCTION                23576 non-null  int64
 7   INDUCTION                   23576 non-null  int64
 8   PARITY                      23576 non-null  int64
 9   OBSTETRICRISK               23576 non-null  int64
 10  COMORBIDITY                 23576 non-null  int64
 11  NUMBEROFPREVCESAREAN        23576 non-null  int64
 12  KGINCREASEDPREGNANCY        23576 non-null  int64
 13  STARTANTENATALCARRE         23576 non-null  int64
 14  ART

In [30]:
Y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23576 entries, 17533 to 7293
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   TYPEOFBIRTH  23576 non-null  int64
dtypes: int64(1)
memory usage: 368.4 KB


In [31]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5894 entries, 16114 to 9031
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   PREVIOUSCESAREAN            5894 non-null   int64
 1   COMPLICATIONS               5894 non-null   int64
 2   ROBSONGROUP                 5894 non-null   int64
 3   ARTMODE                     5894 non-null   int64
 4   PREVIOUSPRETERMPREGNANCIES  5894 non-null   int64
 5   AMNIOCENTESIS               5894 non-null   int64
 6   PREINDUCTION                5894 non-null   int64
 7   INDUCTION                   5894 non-null   int64
 8   PARITY                      5894 non-null   int64
 9   OBSTETRICRISK               5894 non-null   int64
 10  COMORBIDITY                 5894 non-null   int64
 11  NUMBEROFPREVCESAREAN        5894 non-null   int64
 12  KGINCREASEDPREGNANCY        5894 non-null   int64
 13  STARTANTENATALCARRE         5894 non-null   int64
 14  ART 

In [32]:
Y_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5894 entries, 16114 to 9031
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   TYPEOFBIRTH  5894 non-null   int64
dtypes: int64(1)
memory usage: 92.1 KB


In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
model = DecisionTreeClassifier(criterion = 'entropy')
model.fit(X_train,Y_train)
y_pred = model.predict(X_test)
print(accuracy_score(Y_test,y_pred))
print(precision_score(Y_test,y_pred,pos_label='positive',average='weighted'))
print(recall_score(Y_test,y_pred,pos_label='positive',average='micro'))
print(f1_score(Y_test, y_pred,average='micro'))

0.8722429589412962
0.8702424148202551
0.8722429589412962
0.8722429589412962




In [34]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier()
RFC.fit(X_train, Y_train.values.ravel())
y_pred = RFC.predict(X_test)
print(accuracy_score(Y_test,y_pred))
print(precision_score(Y_test,y_pred,pos_label='positive',average='weighted'))
print(recall_score(Y_test,y_pred,pos_label='positive',average='micro'))
print(f1_score(Y_test, y_pred,average='micro'))

0.9348489989820156
0.9361726437614679
0.9348489989820156
0.9348489989820156




In [35]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
svc = SVC(gamma = 0.001, C = 1000)
y_pred = svc.fit(X_train, Y_train.values.ravel()).predict(X_test)
print(accuracy_score(Y_test,y_pred))
print(precision_score(Y_test,y_pred,pos_label='positive',average='weighted'))
print(recall_score(Y_test,y_pred,pos_label='positive',average='micro'))
print(f1_score(Y_test, y_pred,average='micro'))

0.9289107567017306
0.9283448285596893
0.9289107567017306
0.9289107567017307




In [36]:
from sklearn.neighbors import KNeighborsClassifier
K = []
training = []
test = []
scores = {}
  
for k in range(2, 21):
    clf = KNeighborsClassifier(n_neighbors = k)
    clf.fit(X_train, Y_train.values.ravel())
  
    training_score = clf.score(X_train, Y_train)
    test_score = clf.score(X_test, Y_test)
    K.append(k)
  
    training.append(training_score)
    test.append(test_score)
    scores[k] = [training_score, test_score]
for keys, values in scores.items():
    print(keys, ':', values)

2 : [0.9519002375296912, 0.8456057007125891]
3 : [0.916270783847981, 0.832711231761113]
4 : [0.8927723108245673, 0.8125212080081439]
5 : [0.87160671869698, 0.8006447234475738]
6 : [0.8525195113674924, 0.7840176450627757]
7 : [0.8365710892432983, 0.7701051917203936]
8 : [0.8213437393959959, 0.7621309806582965]
9 : [0.8095520868680014, 0.7517814726840855]
10 : [0.7957244655581948, 0.7441465897522904]
11 : [0.7851204614862572, 0.7349847302341365]
12 : [0.7720987444859179, 0.7278588394977944]
13 : [0.7628520529351883, 0.7192059721750933]
14 : [0.754071937563624, 0.7139463861554123]
15 : [0.744740413980319, 0.7059721750933152]
16 : [0.7363420427553444, 0.7017305734645403]
17 : [0.7291313199864269, 0.6878181201221581]
18 : [0.7209026128266033, 0.682049541907024]
19 : [0.7147947064811673, 0.675941635561588]
20 : [0.7064387512724805, 0.670342721411605]


In [37]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=2,weights='distance',p=1)
model.fit(X_train,Y_train.values.ravel())
y_pred = model.predict(X_test)
print(accuracy_score(Y_test,y_pred))
print(precision_score(Y_test,y_pred,pos_label='positive',average='weighted'))
print(recall_score(Y_test,y_pred,pos_label='positive',average='micro'))
print(f1_score(Y_test, y_pred,average='micro'))

0.9073634204275535
0.9090936598653978
0.9073634204275535
0.9073634204275535




In [38]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
estimators = [('rf', RandomForestClassifier()),('svr', make_pipeline(StandardScaler(),LinearSVC())),('dt',DecisionTreeClassifier(criterion = 'entropy')),('knn',KNeighborsClassifier(n_neighbors=2,weights='distance',p=1))]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf.fit(X_train, Y_train.values.ravel())
y_pred = clf.predict(X_test)
print(accuracy_score(Y_test,y_pred))
print(precision_score(Y_test,y_pred,pos_label='positive',average='weighted'))
print(recall_score(Y_test,y_pred,pos_label='positive',average='micro'))
print(f1_score(Y_test, y_pred,average='micro'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9675941635561588
0.9677414854439242
0.9675941635561588
0.9675941635561588


