<a href="https://colab.research.google.com/github/ProfKaromo/Data_Science_Codes/blob/main/Classification_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Classification - Predicted variable is Categorical Variable  - 
# i.e Fraud/Not Fraud, Default/Not Default, Yes/No
import pandas
data = pandas.read_csv("https://modcom.co.ke/data/datasets/pima.csv")
data.head(2)

Unnamed: 0,Children,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,Diabetic
1,1,85,66,29,0,26.6,0.351,31,Not Diabetic


In [17]:
data.describe()

Unnamed: 0,Children,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [18]:
data.groupby('Outcome').size()

Outcome
Diabetic        268
Not Diabetic    500
dtype: int64

In [19]:
subset = data[['Glucose', 'Insulin',	'BMI'	, 'DiabetesPedigreeFunction',	'Age', 'Outcome']]

In [20]:
# Step 1: Split to X - independent  and Y - dependent
array = subset.values
X = array[:, 0:5]    # 8 is not counted here
Y = array[:, 5]
subset

Unnamed: 0,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,0,33.6,0.627,50,Diabetic
1,85,0,26.6,0.351,31,Not Diabetic
2,183,0,23.3,0.672,32,Diabetic
3,89,94,28.1,0.167,21,Not Diabetic
4,137,168,43.1,2.288,33,Diabetic
...,...,...,...,...,...,...
763,101,180,32.9,0.171,63,Not Diabetic
764,122,0,36.8,0.340,27,Not Diabetic
765,121,112,26.2,0.245,30,Not Diabetic
766,126,0,30.1,0.349,47,Diabetic


In [21]:
# Using Adaboost Classifier / Random Forest for Feature Elimination
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFE
estimator = AdaBoostClassifier(random_state=42, n_estimators=100)
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, Y)

filter = selector.support_
ranking = selector.ranking_

print("Mask data: ", filter)
print("Ranking: ", ranking)

Mask data:  [ True  True  True  True  True]
Ranking:  [1 1 1 1 1]


In [22]:
# Oversampling 
from imblearn.over_sampling import SMOTE
# transform the dataset
oversample = SMOTE()
oversampledX, oversampledY = oversample.fit_resample(X, Y)
#len(oversampledY)

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# define standard scaler used for standization around the mean of 0
scaler = StandardScaler()
# define minmax scaler used for standization between -1 to 1 or 0, 1
# transform data
scaledX = scaler.fit_transform(oversampledX)
#scaledX

In [24]:
# Step 2: Split to training set and testing set
# we have 768 records, 70% is used training the model, 30% used in testing the model(hideout)
from sklearn import model_selection
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(scaledX, oversampledY, test_size=0.3, random_state=42)
# X_train consist of   Children	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age - 70%
# Y_train consist of Outcome - 70%

# X_test consist of Children	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age - 30%
# Y_test consist of Outcome - 30%

In [25]:
# Step 2a: Cross Validation Of Algorithms/Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB 
models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('RFR', RandomForestClassifier()))
models.append(('SVM', SVC())) 
models.append(('NB', GaussianNB())) 
# KFOLD - Cross Validation
for name, model in models:
  kfold = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)
  cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring = 'accuracy')
  print(name, cv_results.mean())

KNN 0.7657142857142857
DT 0.7385714285714287
GB 0.7957142857142856
RFR 0.7942857142857143
SVM 0.7628571428571428
NB 0.7285714285714284


In [26]:
# Step 3: Load Machine Learning Models/Algorithms
model = RandomForestClassifier()
model.fit(X_train, Y_train)
print('Training happens at this point..')

Training happens at this point..


In [27]:
# Step 4: Test your model
predictions = model.predict(X_test)
# print('Model Predictions ', predictions)
# print('Actual Values ', Y_test)

In [28]:
# Step 5 : Get metrics using accuracy
from sklearn.metrics import accuracy_score
print('Score  ', accuracy_score(Y_test, predictions))

# Problem - 1. Unbalanced data, 2. Data is not well standadized, 3. Few Records , 4. Some features might be noise.

Score   0.8133333333333334


In [29]:
# At 74% Confidence
person = [[85,	0, 26.6,	0.351,	31]]
outcome = model.predict(person)
print('The outcome is likely to be ', outcome)

The outcome is likely to be  ['Diabetic']


In [30]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, predictions))


from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test, predictions))

              precision    recall  f1-score   support

    Diabetic       0.78      0.87      0.83       151
Not Diabetic       0.85      0.75      0.80       149

    accuracy                           0.81       300
   macro avg       0.82      0.81      0.81       300
weighted avg       0.82      0.81      0.81       300

[[132  19]
 [ 37 112]]
