
#  **Model Building End-to-End Series**


### **Importing Libraries**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

---
<a name = Section4></a>
# **4. Loading data using Pandas**
---

In [2]:
# Load the dataset from Data.csv; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv("Data.csv")
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,Age,Sex,Height,Weight,Status
0,0,1,45,0.5,1
1,0,1,45,0.6,1
2,0,1,45,0.7,1
3,0,1,45,0.8,1
4,0,1,45,0.9,1


In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041192 entries, 0 to 1041191
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   Age     1041192 non-null  int64  
 1   Sex     1041192 non-null  int64  
 2   Height  1041192 non-null  int64  
 3   Weight  1041192 non-null  float64
 4   Status  1041192 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 39.7 MB


---
<a name = Section5></a>
# **5. Separating data into train and test sets**
---

<a id=section5></a>
## 5.1 Separating Independent and Dependent variables

In [9]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): this cell is placed before the cells that define X, y and the
# train/test split, so on a fresh kernel (Restart & Run All) X_train did not
# exist yet and the cell raised NameError. It now derives the split it needs
# itself, with exactly the same arguments as the split cell in section 5.2, so
# both produce identical partitions. Consider moving the scaling below
# section 5.2 and dropping this duplicate split.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Status', axis = 1), data['Status'],
    test_size = 0.20, random_state = 1)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test-set statistics leak into the scaling.
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)
# NOTE(review): X_train_sc / X_test_sc are not referenced by any later cell
# visible in this notebook -- confirm whether the models were meant to use them.

In [10]:
# Features are every column except the label; the label is the 'Status' column.
X, y = data.drop(columns = 'Status'), data['Status']

In [11]:
# Sanity check: dimensions of the feature frame as (rows, columns).
X.shape

(1041192, 4)

In [12]:
# Sanity check: the label Series should have one entry per row of X.
y.shape

(1041192,)

<a id=section5_2></a>
## 5.2 Splitting data into train and test set.

In [13]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.20,
    random_state = 1,
)

---
<a name = Section6></a>
# **6. Model Building**
---

In [14]:
# Decision Tree
# A depth-limited gini tree, scored by cross-validation on the training split
# only. cv is left at its default; the recorded output shows five fold scores.
decision_tree = DecisionTreeClassifier(
    criterion = "gini",
    splitter = "best",
    max_depth = 9,
    random_state = 123,
)
scores = cross_val_score(decision_tree, X_train, y_train)
print(scores)
print("Mean", scores.mean())

[0.93851409 0.93526661 0.93625706 0.93419173 0.93484003]
Mean 0.9358139023497722


In [15]:
# K – Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier

# Five neighbours, Minkowski metric with p = 2. fit() returns the estimator
# itself, so construction and training are chained.
# NOTE(review): the model is trained on the unscaled X_train; the scaled
# X_train_sc / X_test_sc prepared earlier are not used here -- confirm intent.
knn_classifier = KNeighborsClassifier(
    n_neighbors = 5,
    metric = 'minkowski',
    p = 2,
).fit(X_train, y_train)

# Score on the held-out test rows; the bare last expression displays the accuracy.
y_pred_knn = knn_classifier.predict(X_test)
accuracy_score(y_true = y_test, y_pred = y_pred_knn)

0.9662839333650277

###  **Saving the model as a Pickle file**

In [16]:
import pickle

In [17]:
# Serialise the fitted KNN model to disk. The with-block guarantees the file
# handle is flushed and closed even if pickling raises.
# NOTE(review): the file is named "randomforest.pkl" but the object written is
# knn_classifier -- confirm which name downstream loaders expect before renaming.
with open("randomforest.pkl", "wb") as model_file:
    pickle.dump(knn_classifier, model_file)