### Importing Libraries

In [2]:
import numpy as np 
import pandas as pd

from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

### Dataset loading

In [5]:
df = pd.read_csv("InsuranceDataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 24 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   Area_Service                           1046119 non-null  object 
 1   Hospital County                        1046119 non-null  object 
 2   Hospital Id                            1046119 non-null  float64
 3   Age                                    1048575 non-null  object 
 4   Gender                                 1048575 non-null  object 
 5   Cultural_group                         1048575 non-null  object 
 6   ethnicity                              1048575 non-null  object 
 7   Days_spend_hsptl                       1048575 non-null  object 
 8   Admission_type                         1048575 non-null  object 
 9   Home or self care,                     1048575 non-null  object 
 10  ccs_diagnosis_code                     104

### Data Preprocessing

#### 1. Drop Duplicates 

In [6]:
df.drop_duplicates(inplace=True,ignore_index=True)
df.describe().apply(lambda s: s.apply('{0:.1f}'.format))

Unnamed: 0,Hospital Id,ccs_diagnosis_code,ccs_procedure_code,Code_illness,Mortality risk,Weight_baby,Tot_charg,Tot_cost,ratio_of_total_costs_to_total_charges,Result,Payment_Typology
count,1041306.0,1043761.0,1043761.0,1043761.0,1043720.0,1043761.0,1043761.0,1043761.0,1043761.0,1043761.0,1043761.0
mean,652.5,198.6,97.1,2.1,1.8,282.0,27190.1,10501.2,0.5,0.7,1.9
std,669.6,165.5,87.3,0.9,1.0,932.3,54976.5,22345.9,0.8,0.4,0.8
min,1.0,1.0,0.0,0.0,1.0,0.0,0.3,0.1,0.0,0.0,1.0
25%,213.0,108.0,0.0,1.0,1.0,0.0,7473.6,3262.2,0.3,0.0,1.0
50%,630.0,159.0,88.0,2.0,1.0,0.0,14669.2,5955.4,0.4,1.0,2.0
75%,977.0,218.0,172.0,3.0,3.0,0.0,29242.3,11377.4,0.5,1.0,3.0
max,9250.0,670.0,231.0,4.0,4.0,9000.0,6196973.5,2562477.7,157.6,1.0,5.0


#### 2. Drop NA

In [7]:
df.dropna(inplace=True)
df.isnull().any()

Area_Service                             False
Hospital County                          False
Hospital Id                              False
Age                                      False
Gender                                   False
Cultural_group                           False
ethnicity                                False
Days_spend_hsptl                         False
Admission_type                           False
Home or self care,                       False
ccs_diagnosis_code                       False
ccs_procedure_code                       False
apr_drg_description                      False
Code_illness                             False
Mortality risk                           False
Surg_Description                         False
Weight_baby                              False
Abortion                                 False
Emergency dept_yes/No                    False
Tot_charg                                False
Tot_cost                                 False
ratio_of_tota

#### 3. Drop Columns

In [8]:
df.drop(['Area_Service','Hospital County','Hospital Id','Gender','Cultural_group',"ethnicity","apr_drg_description","Weight_baby",'Abortion','Tot_charg','Tot_cost'], axis=1, inplace=True)
#,'Tot_charg','Tot_cost'

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1041265 entries, 0 to 1043760
Data columns (total 13 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   Age                                    1041265 non-null  object 
 1   Days_spend_hsptl                       1041265 non-null  object 
 2   Admission_type                         1041265 non-null  object 
 3   Home or self care,                     1041265 non-null  object 
 4   ccs_diagnosis_code                     1041265 non-null  int64  
 5   ccs_procedure_code                     1041265 non-null  int64  
 6   Code_illness                           1041265 non-null  int64  
 7   Mortality risk                         1041265 non-null  float64
 8   Surg_Description                       1041265 non-null  object 
 9   Emergency dept_yes/No                  1041265 non-null  object 
 10  ratio_of_total_costs_to_total_charges  104

#### 4. Converting Columns to Right Datatypes

In [10]:
df.replace({'Days_spend_hsptl': '120 +'}, 120, inplace=True)
df['Mortality risk'] = df['Mortality risk'].astype(int)

#### 5. Label Encoding

In [11]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
labelencoder = LabelEncoder()
label_binizer =LabelBinarizer()

In [12]:
df['Age']  = labelencoder.fit_transform(df['Age'])
df['Admission_type'] = labelencoder.fit_transform(df['Admission_type'])
df['Home or self care,'] = labelencoder.fit_transform(df['Home or self care,'])

In [13]:
df['Surg_Description'] = label_binizer.fit_transform(df['Surg_Description'])
df['Emergency dept_yes/No'] = label_binizer.fit_transform(df['Emergency dept_yes/No'])

In [11]:
#df['Age']  = labelencoder.fit_transform(df['Age'])
#df['Admission_type'] = labelencoder.fit_transform(df['Admission_type'])
#df['Home or self care,'] = labelencoder.fit_transform(df['Home or self care,'])
#df['Surg_Description'] = labelencoder.fit_transform(df['Surg_Description'])
#df['Emergency dept_yes/No'] = labelencoder.fit_transform(df['Emergency dept_yes/No'])

In [12]:
df.head(10)

Unnamed: 0,Age,Days_spend_hsptl,Admission_type,"Home or self care,",ccs_diagnosis_code,ccs_procedure_code,Code_illness,Mortality risk,Surg_Description,Emergency dept_yes/No,ratio_of_total_costs_to_total_charges,Result,Payment_Typology
0,2,4,0,7,122,0,1,1,0,1,1.012798,1,1
1,4,4,5,17,197,0,3,2,0,1,1.079365,1,1
2,2,3,5,7,122,0,1,1,0,1,1.059379,1,1
3,0,1,5,7,122,0,1,1,0,1,1.484167,0,1
4,4,3,0,7,122,0,2,3,0,1,0.986161,1,1
5,0,1,0,7,142,0,1,1,0,1,1.384794,1,1
6,1,3,0,7,122,0,2,1,0,1,1.066565,1,1
7,4,1,0,7,154,202,2,3,0,1,1.37115,1,1
8,0,1,0,7,125,0,1,1,0,0,1.128782,1,1
9,3,2,0,7,122,0,2,1,0,1,1.090095,0,1


In [13]:
X = df.iloc[:,0:-1]    
Y = df.iloc[:,-1]

In [14]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(random_state=42,contamination=0.1)
clf.fit(X,Y)
y_outliers = clf.predict(X)
df['y_outliers'] = y_outliers
df['y_train'] = Y
df[df['y_outliers']==-1]

Unnamed: 0,Age,Days_spend_hsptl,Admission_type,"Home or self care,",ccs_diagnosis_code,ccs_procedure_code,Code_illness,Mortality risk,Surg_Description,Emergency dept_yes/No,ratio_of_total_costs_to_total_charges,Result,Payment_Typology,y_outliers,y_train
1,4,4,5,17,197,0,3,2,0,1,1.079365,1,1,-1,1
3,0,1,5,7,122,0,1,1,0,1,1.484167,0,1,-1,1
11,3,2,0,7,122,202,3,3,0,1,1.192205,0,1,-1,1
22,0,1,5,7,122,0,1,1,0,1,1.328677,0,1,-1,1
29,2,1,0,17,244,0,1,1,0,1,1.346271,1,1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043730,4,30,1,4,13,222,4,4,0,1,0.242338,1,3,-1,3
1043732,2,19,1,8,116,75,4,4,1,0,0.214560,1,3,-1,3
1043735,3,16,1,4,151,216,4,4,0,1,0.208465,0,3,-1,3
1043745,3,15,1,18,237,5,1,1,1,0,0.232809,1,3,-1,3


In [15]:
df.drop(df.loc[df['y_outliers']==-1].index, inplace=True)

In [16]:
#df = df.drop(columns=['y_outliers','y_train'])
df = df[['Age','Days_spend_hsptl','Admission_type','Home or self care,','ccs_diagnosis_code','ccs_procedure_code','Code_illness','Mortality risk','Surg_Description','Emergency dept_yes/No','ratio_of_total_costs_to_total_charges','Payment_Typology','Result']]

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1041265 entries, 0 to 1043760
Data columns (total 13 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   Age                                    1041265 non-null  int32  
 1   Days_spend_hsptl                       1041265 non-null  object 
 2   Admission_type                         1041265 non-null  int32  
 3   Home or self care,                     1041265 non-null  int32  
 4   ccs_diagnosis_code                     1041265 non-null  int64  
 5   ccs_procedure_code                     1041265 non-null  int64  
 6   Code_illness                           1041265 non-null  int64  
 7   Mortality risk                         1041265 non-null  int32  
 8   Surg_Description                       1041265 non-null  int32  
 9   Emergency dept_yes/No                  1041265 non-null  int32  
 10  ratio_of_total_costs_to_total_charges  104

In [15]:
#df.to_csv(r'Example1.csv')

### Data Standardization

In [16]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
df_array = scaler.transform(df)
df = pd.DataFrame(df_array,columns =['Age','Days_spend_hsptl','Admission_type','Home or self care,','ccs_diagnosis_code','ccs_procedure_code','Code_illness','Mortality risk','Surg_Description','Emergency dept_yes/No','ratio_of_total_costs_to_total_charges','Payment_Typology','Result'])

In [17]:
df

Unnamed: 0,Age,Days_spend_hsptl,Admission_type,"Home or self care,",ccs_diagnosis_code,ccs_procedure_code,Code_illness,Mortality risk,Surg_Description,Emergency dept_yes/No,ratio_of_total_costs_to_total_charges,Payment_Typology,Result
0,0.50,0.025210,0.0,0.388889,0.180867,0.000000,0.000000,0.000000,0.0,1.0,0.006219,1.0,0.00
1,1.00,0.025210,1.0,0.944444,0.292975,0.000000,0.666667,0.333333,0.0,1.0,0.006642,1.0,0.00
2,0.50,0.016807,1.0,0.388889,0.180867,0.000000,0.000000,0.000000,0.0,1.0,0.006515,1.0,0.00
3,0.00,0.000000,1.0,0.388889,0.180867,0.000000,0.000000,0.000000,0.0,1.0,0.009211,0.0,0.00
4,1.00,0.016807,0.0,0.388889,0.180867,0.000000,0.333333,0.666667,0.0,1.0,0.006050,1.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041260,1.00,0.042017,0.2,1.000000,0.156951,0.125541,0.333333,0.666667,0.0,1.0,0.001056,1.0,0.50
1041261,0.00,0.008403,0.2,0.388889,0.010463,1.000000,0.333333,0.000000,0.0,0.0,0.001620,1.0,0.50
1041262,0.75,0.100840,0.2,0.833333,0.224215,0.380952,1.000000,1.000000,0.0,1.0,0.001291,0.0,0.50
1041263,0.50,0.008403,0.2,0.388889,0.212257,0.372294,0.000000,0.000000,1.0,1.0,0.001350,1.0,0.50


### Data Sampling

In [18]:
#Step1 - take a sample data from df. This is just to check the result of 2 methods (before/after train test split)
sample_data = df.sample(frac=0.02)
sample_data.Result.value_counts()

0.25    8440
0.00    6948
0.50    5359
0.75      76
1.00       2
Name: Result, dtype: int64

### Downsampling before train-test split

In [28]:
# seperating '0', & '1' label 
sample_minority = sample_data.loc[sample_data['Result']==0]
sample_majority = sample_data.loc[sample_data['Result']==1]

# doing dowmsampling of majority classes... taking 100 to 80% ratio
#sample_minority.shape[0]*0.80 - Here its 3300

from sklearn.utils import resample
sample_data_majority_downsampled = resample(sample_majority , replace=True, n_samples=3300, random_state=42)

sample_data_downsampled = pd.concat([sample_data_majority_downsampled, sample_minority ], ignore_index=True)

# seperating indepedent & depedent variabales..

x = sample_data_downsampled.drop(['Result'],axis=1).values
y = sample_data_downsampled.Result.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify = y, random_state=42)

### Build the model with n-estimator = 15

In [29]:
from sklearn.ensemble import RandomForestClassifier
sample_RF15_before_split_upsampled = RandomForestClassifier(n_estimators=15, random_state=42)

clf_15_sample = sample_RF15_before_split_upsampled.fit(x_train, y_train)
y_pred_train = clf_15_sample.predict(x_train)
y_pred = clf_15_sample.predict(x_test)
print(classification_report(y_train, y_pred_train ))

print(classification_report(y_test, y_pred ))

results = cross_val_score(clf_15_sample, x, y)
print(results.mean())

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4655
         1.0       1.00      1.00      1.00      2211

    accuracy                           1.00      6866
   macro avg       1.00      1.00      1.00      6866
weighted avg       1.00      1.00      1.00      6866

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2293
         1.0       1.00      1.00      1.00      1089

    accuracy                           1.00      3382
   macro avg       1.00      1.00      1.00      3382
weighted avg       1.00      1.00      1.00      3382

1.0


#### Downsampling after train-test split

In [30]:
x = sample_data.drop(['Result'],axis=1).values
y = sample_data.Result.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify = y, random_state=42)

x_train_df = pd.DataFrame(x_train, columns=sample_data.drop(['Result'],axis=1).columns)
y_train_df = pd.DataFrame(y_train, columns=['Result'])

df_train = pd.concat([x_train_df, y_train_df],axis=1)

df_train_majority = df_train.loc[df_train['Result']==1]
df_train_minority = df_train.loc[df_train['Result']==0]

# doing downsampling of majority classes... taking 100 to 80% ratio

# sample_minority.shape[0]*0.80   - 3287

from sklearn.utils import resample
df_train_majority_downsampled = resample(df_train_majority , replace=True, n_samples=3300, random_state=42)

after_split_downsampled = pd.concat([df_train_majority_downsampled,df_train_minority])


# splitting into x_train, y_train only..note: we have alerady splitted test sets using k fold...

x_train_downsampled = after_split_downsampled.drop(['Result'],axis=1).values
y_train_downsampled = after_split_downsampled.Result.values


#### Build the model with n-estimator = 15

In [31]:
from sklearn.ensemble import RandomForestClassifier
sample_RF15_after_split_downsampled = RandomForestClassifier(n_estimators=15, random_state=42)

sample_RF15_after_split_downsampled = sample_RF15_after_split_downsampled.fit(x_train_downsampled, y_train_downsampled)

y_pred_train = sample_RF15_after_split_downsampled.predict(x_train_downsampled)
y_pred = sample_RF15_after_split_downsampled.predict(x_test)

print(classification_report(x_train_downsampled, y_pred_train ))

print(classification_report(y_test, y_pred ))

results = cross_val_score(sample_RF15_after_split_downsampled, x, y)
print(results.mean())


ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

#### Method1 - Upsampling before train-test split

In [32]:
# seperating '0', & '1' label 
sample_minority = sample_data.loc[sample_data['Result']==0]
sample_majority = sample_data.loc[sample_data['Result']==1]

In [33]:
# doing upsampling of minority classes... taking 100 to 80% ratio
#sample_majority.shape[0]*0.80 - Here its 12452

from sklearn.utils import resample
sample_data_minority_upsampled = resample(sample_minority , replace=True, n_samples=12000, random_state=42)

In [34]:
# combining upsampled data to majority class data...

sample_data_upsampled = pd.concat([sample_majority, sample_data_minority_upsampled ], ignore_index=True)

In [35]:
# seperating indepedent & depedent variabales..

X= sample_data_upsampled.drop(['Result'],axis=1).values
Y= sample_data_upsampled.Result.values

In [36]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, stratify = Y, random_state=42)

### Build the model with n-estimator = 15

In [37]:
from sklearn.ensemble import RandomForestClassifier
sample_RF15_before_split_upsampled = RandomForestClassifier(n_estimators=15, random_state=42)

In [38]:
clf_15_sample = sample_RF15_before_split_upsampled.fit(x_train, y_train)
y_pred_train = clf_15_sample.predict(x_train)
y_pred = clf_15_sample.predict(x_test)
print(classification_report(y_train, y_pred_train ))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8040
         1.0       1.00      1.00      1.00         1

    accuracy                           1.00      8041
   macro avg       1.00      1.00      1.00      8041
weighted avg       1.00      1.00      1.00      8041



In [39]:
print(classification_report(y_test, y_pred ))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      3960
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      3961
   macro avg       0.50      0.50      0.50      3961
weighted avg       1.00      1.00      1.00      3961



In [40]:
results = cross_val_score(clf_15_sample, x, y)
print(results.mean())

nan


#### Method2 - Upsampling after train-test split

In [41]:
x = sample_data.drop(['Result'],axis=1).values
y = sample_data.Result.values

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify = y, random_state=42)

In [43]:
x_train_df = pd.DataFrame(x_train, columns=sample_data.drop(['Result'],axis=1).columns)
y_train_df = pd.DataFrame(y_train, columns=['Result'])

df_train = pd.concat([x_train_df, y_train_df],axis=1)

In [44]:
df_train_majority = df_train.loc[df_train['Result']==1]
df_train_minority = df_train.loc[df_train['Result']==0]

In [45]:
# doing upsampling of minority classes... taking 100 to 80% ratio

# sample_majority.shape[0]*0.80

from sklearn.utils import resample
df_train_minority_upsampled = resample(df_train_minority , replace=True, n_samples=12000, random_state=42)

In [46]:
after_split_upsampled = pd.concat([df_train_majority, df_train_minority_upsampled])

In [47]:
# splitting into x_train, y_train only..note: we have alerady splitted test sets using k fold...

x_train_upsampled = after_split_upsampled.drop(['Result'],axis=1).values
y_train_upsampled = after_split_upsampled.Result.values

### Build the model with n-estimator = 15

In [48]:
from sklearn.ensemble import RandomForestClassifier
sample_RF15_after_split_upsampled = RandomForestClassifier(n_estimators=15, random_state=42)

In [49]:
sample_RF15_after_split_upsampled = sample_RF15_after_split_upsampled.fit(x_train_upsampled, y_train_upsampled)

In [50]:
y_pred_train = sample_RF15_after_split_upsampled.predict(x_train_upsampled)
y_pred = sample_RF15_after_split_upsampled.predict(x_test)

In [51]:
print(classification_report(y_train_upsampled, y_pred_train ))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     12000
         1.0       1.00      1.00      1.00         1

    accuracy                           1.00     12001
   macro avg       1.00      1.00      1.00     12001
weighted avg       1.00      1.00      1.00     12001



In [52]:
print(classification_report(y_test, y_pred ))

ValueError: Classification metrics can't handle a mix of continuous and binary targets

In [53]:
results = cross_val_score(sample_RF15_after_split_upsampled, x, y)
print(results.mean())

nan


##### Conclusion:
##### Upsampling before Train-Test split gives more accuracy

### Model Building on whole dataset

In [54]:
data_minority = df.loc[df['Result']==0]
data_majority = df.loc[df['Result']==1]

In [55]:
data_majority.shape[0]*0.80

52.800000000000004

In [56]:
from sklearn.utils import resample
data_minority_upsampled = resample(data_minority, replace=True, n_samples=600000, random_state=42)
data_upsampled = pd.concat([data_majority, data_minority_upsampled ], ignore_index=True)

In [57]:
X = data_upsampled.drop(['Result'],axis=1).values
Y = data_upsampled.Result.values

In [58]:
Y
len(Y)
len(Y) - np.count_nonzero(Y)

600000

In [59]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, stratify=Y, random_state=30)

### GridSearch for Randon Forest

In [70]:
parameters = {
    "n_estimators":[50,100,250],
    "max_depth":[2,4,8,16,32,None]
}

In [71]:
from sklearn.model_selection import GridSearchCV


In [72]:
def display(results):
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    mean_score = results.cv_results_['mean_test_score']
    std_score = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    for mean,std,params in zip(mean_score,std_score,params):
        print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')


In [74]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, stratify=Y, random_state=42)
model_rf_upsampled = RandomForestClassifier()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, stratify=Y, random_state=42)

#model_rf_upsampled = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf_upsampled = RandomForestClassifier()

cv = GridSearchCV(model_rf_upsampled,parameters,cv=5)
cv.fit(x_train,y_train)

display(cv)

### Random Forest Classifier on whole data 

In [75]:
from sklearn.metrics import mean_squared_error

model_rf_upsampled = RandomForestClassifier(n_estimators=100, random_state=42)


df_RF100_upsampled = model_rf_upsampled.fit(x_train, y_train)

y_pred_train = model_rf_upsampled.predict(x_train)
# classification report...for train set
print(classification_report(y_train, y_pred_train))

y_pred_test = model_rf_upsampled.predict(x_test)
# classification report...for test set
print(classification_report(y_test, y_pred_test))

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print("RF with full trees, Train MSE: {} Test MSE: {}".format(mse_train, mse_test))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    402000
         1.0       1.00      0.98      0.99        44

    accuracy                           1.00    402044
   macro avg       1.00      0.99      0.99    402044
weighted avg       1.00      1.00      1.00    402044

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    198000
         1.0       0.00      0.00      0.00        22

    accuracy                           1.00    198022
   macro avg       0.50      0.50      0.50    198022
weighted avg       1.00      1.00      1.00    198022

RF with full trees, Train MSE: 2.4872899483638606e-06 Test MSE: 0.00012119865469493288


### Testing the accuracy with Random data

In [76]:
import random
r1 = random.randint(100,100000)
r2 = r1 + 1

x_test1 = x_test[r1:r2,:] 
y_test1 = y_test[r1] 
y_pred_test1 = model_rf_upsampled.predict(x_test1)
y_pred_test1
y_test1

print ("Record-" + str(r1) + "   Predicted Value " + str(y_pred_test1) + "   Actual Value " + str(y_test1))


Record-59070   Predicted Value [0.]   Actual Value 0.0


### Save the model to .sav file

In [79]:
from joblib import dump, load
dump(df_RF100_upsampled, 'RF_WholeData_encoded.joblib')

['RF_WholeData_encoded.joblib']

In [80]:
#from pickle import dump
#from pickle import load


# save the model to disk
#dump(df_RF100_upsampled, open('RF_WholeData_encoded.sav', 'wb'))

# load the model from disk
loaded_model = load(open('RF_WholeData_encoded.joblib', 'rb'))
result = loaded_model.score(X, Y)
print(result)

0.9999583379161625


### Logistic Regression

In [81]:
#Evaluate using Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=500)

test_size = 0.33
seed = 5
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# grid searching for learning rate    
parameters = { 'C' : [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] }
          
model = LogisticRegression()        
grid = GridSearchCV(model, parameters )    
grid.fit(X_train, Y_train )
    
y_pred_train = grid.predict(X_train)
print(classification_report(Y_train,y_pred_train))

y_pred_test = grid.predict(X_test)
print(classification_report(Y_test,y_pred_test))

result = grid.score(X_test, Y_test)

result*100.0


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    402001
         1.0       0.00      0.00      0.00        43

    accuracy                           1.00    402044
   macro avg       0.50      0.50      0.50    402044
weighted avg       1.00      1.00      1.00    402044

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    197999
         1.0       0.00      0.00      0.00        23

    accuracy                           1.00    198022
   macro avg       0.50      0.50      0.50    198022
weighted avg       1.00      1.00      1.00    198022



99.98838512892507

### Dump the model into .sav file

In [None]:
df.info()

In [None]:
# Gaussian Naive Bayes Classification

from sklearn.naive_bayes import GaussianNB

kfold = KFold(n_splits=10)
test_size = 0.33
seed = 5
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

model = GaussianNB()
model.fit(X_train, Y_train)

y_pred_train = model.predict(X_train)
print(classification_report(Y_train,y_pred_train))

y_pred_test = model.predict(X_test)
print(classification_report(Y_test,y_pred_test))

#results = cross_val_score(model, X_train, Y_train, cv=kfold)
#print(results.mean())


In [None]:
# KNN Classification
from sklearn.neighbors import KNeighborsClassifier

kfold = KFold(n_splits=10)
test_size = 0.33
seed = 5
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

model = KNeighborsClassifier(n_neighbors=17)
model.fit(X_train, Y_train)

y_pred_train = model.predict(X_train)
print(classification_report(Y_train,y_pred_train))

y_pred_test = model.predict(X_test)
print(classification_report(Y_test,y_pred_test))

#results = cross_val_score(model, X_train, Y_train, cv=kfold)
#print(results.mean())


In [None]:
from sklearn.svm import SVC

kfold = KFold(n_splits=10)
test_size = 0.33
seed = 5
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

model = SVC()
model.fit(X_train, Y_train)

y_pred_train = model.predict(X_train)
print(classification_report(Y_train,y_pred_train))

y_pred_test = model.predict(X_test)
print(classification_report(Y_test,y_pred_test))

#results = cross_val_score(model, X_train, X_train, cv=kfold)
#print(results.mean())

In [None]:
# AdaBoost Classification
from sklearn.ensemble import AdaBoostClassifier

num_trees = 10
test_size = 0.33
seed = 5
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)


kfold = KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees)

model.fit(X_train, Y_train)

y_pred_train = model.predict(X_train)
print(classification_report(Y_train,y_pred_train))

y_pred_test = model.predict(X_test)
print(classification_report(Y_test,y_pred_test))

#results = cross_val_score(model, X, Y, cv=kfold)
#print(results.mean())

In [None]:
# Bagged Decision Trees for Classification

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

test_size = 0.33
seed = 5
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

kfold = KFold(n_splits=10, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees)

model.fit(X_train, Y_train)

y_pred_train = model.predict(X_train)
print(classification_report(Y_train,y_pred_train))

y_pred_test = model.predict(X_test)
print(classification_report(Y_test,y_pred_test))

#results = cross_val_score(model, X_train, Y_train, cv=kfold)
#print(results.mean())

In [None]:
len(X_test)