In [2]:
# Importing Packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from statsmodels.formula.api import ols, logit


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from statsmodels.graphics.mosaicplot import mosaic

#### 1. Look up SMOTE oversampling https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html 
#####    a. Describe what it is in your own words in markdown.    
#####    b. Use this technique with the diabetes dataset. Comment on the model performance compared to other methods. Make sure you are clear about why you chose the performance metric you did.

 * Synthetic Minority Over-sampling Technique is a method for oversampling the minosity class data.
 * It uses KNN classification methos in the background to resample the data.
 * It takes sampling strategy as an input guide for oversampling. 
     - Sampling strategy takes in float, str of dict as input.
     - For binary classification float value is passed as the ratio of minority class data to majority class data in the resampled dataset.

In [23]:
diabetes_df = pd.read_csv("../Datasets/diabetes.csv")

display(diabetes_df.head())

X=diabetes_df.drop("Outcome",axis=1)
y=diabetes_df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0, stratify = y)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [24]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# Resampling using SMOTE 

sm = SMOTE(sampling_strategy = 0.8, random_state=0)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [25]:
# Instantiating logistic regression classifier
logreg = LogisticRegression()

mdl_outcome = logreg.fit(X_train, y_train)

y_pred = mdl_outcome.predict(X_test)

from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      0.87      0.57      0.83      0.70      0.51       150
          1       0.70      0.57      0.87      0.63      0.70      0.48        81

avg / total       0.76      0.76      0.67      0.76      0.70      0.50       231



* As true positives are important to diabetes classification. We look at the recall/sensetivity parameter.
* Resampling with SMOTE and then running the logistic regression imporoved the recall value from 0.35 to 0.57

#### 2. Create a function called rec_digit_sum that takes in an integer. This function is the recursive sum of all the digits in a number.

Given n, take the sum of all the digits in n. If the resulting value has more than one digit,
continue calling the function in this way until a single-digit number is produced. The input
will be a non-negative integer, and this should work for extremely large values as well as
for single-digit inputs.

Examples:

16 --> 1 + 6 = 7

942 --> 9 + 4 + 2 = 15 --> 1 + 5 = 6

132189 --> 1 + 3 + 2 + 1 + 8 + 9 = 24 --> 2 + 4 = 6

493193 --> 4 + 9 + 3 + 1 + 9 + 3 = 29 --> 2 + 9 = 11

In [63]:
def rec_digit_sum(n):
    """Returning recursive sum of digits of a number"""
    try:
        digit = [int(x) for x in str(n)]
        res = sum(digit)
        res_dig = [int(x) for x in str(res)]
        if len(res_dig) > 1:
            rec_digit_sum(res)
        else:
            print(res)
    except ValueError:
        print("This is not a positive integer!")

In [47]:
rec_digit_sum(16)

7


In [48]:
rec_digit_sum(942)

6


In [49]:
rec_digit_sum(132189)

6


In [50]:
rec_digit_sum(493193)

2


In [66]:
rec_digit_sum("ok")

This is not a positive integer!


In [67]:
rec_digit_sum(12.45)

This is not a positive integer!


In [68]:
rec_digit_sum(-145)

This is not a positive integer!
