# pandas practice

In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [158]:
# Load the diabetes dataset. index_col=False tells pandas not to use the
# first CSV column as the index, so a default RangeIndex is created.
dia = pd.read_csv('data/diabetes_data.csv', index_col=False)
display(dia.head(5))
display(dia.tail(5))
# DataFrame.info() prints its report itself and returns None; wrapping it in
# display() only appended a stray "None" to the cell output.
dia.info()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
0,6,98,58,33,190,34.0,0.43,43,0,Female
1,2,112,75,32,0,35.7,0.148,21,0,Female
2,2,108,64,0,0,30.8,0.158,21,0,Female
3,8,107,80,0,0,24.6,0.856,34,0,Female
4,7,136,90,0,0,29.9,0.21,50,0,Female


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Gender
773,6,103,72,32,190,37.7,0.324,55,0,Female
774,1,71,48,18,76,20.4,0.323,22,0,Female
775,0,117,0,0,0,33.8,0.932,44,0,Female
776,4,154,72,29,126,31.3,0.338,37,0,Female
777,5,147,78,0,0,33.7,0.218,65,0,Female


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 778 entries, 0 to 777
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               778 non-null    int64  
 1   Glucose                   778 non-null    int64  
 2   BloodPressure             778 non-null    int64  
 3   SkinThickness             778 non-null    int64  
 4   Insulin                   778 non-null    int64  
 5   BMI                       778 non-null    float64
 6   DiabetesPedigreeFunction  778 non-null    float64
 7   Age                       778 non-null    int64  
 8   Outcome                   778 non-null    int64  
 9   Gender                    778 non-null    object 
dtypes: float64(2), int64(7), object(1)
memory usage: 60.9+ KB


None

### find duplicates in data

In [159]:
# Rows are compared on every column except 'Gender'; a row counts as a
# duplicate when all remaining columns match an earlier row.
dup_col = dia.columns.tolist()
dup_col.remove('Gender')
mask = dia.duplicated(subset=dup_col)
display(f'number of duplicates - {mask.sum()}')

'number of duplicates - 10'

In [160]:
# Keep the first occurrence of each duplicated row (compared on dup_col only);
# the original index labels are preserved, not renumbered.
dia = dia.drop_duplicates(subset=dup_col, keep='first')
dia.info()

<class 'pandas.core.frame.DataFrame'>
Index: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
 9   Gender                    768 non-null    object 
dtypes: float64(2), int64(7), object(1)
memory usage: 66.0+ KB


In [161]:
# Drop low-information features:
#   * one value accounts for more than 95 % of the non-null rows, or
#   * more than 95 % of the non-null rows hold distinct values.
# Columns are collected first and dropped once after the loop, so the frame
# is not reassigned while its columns are being iterated.
low_info_cols = []
for col in dia.columns:
    same_ratio = dia[col].value_counts(normalize=True).max()
    unique_ratio = dia[col].nunique() / dia[col].count()

    if same_ratio > 0.95:
        print(f'Feature-{col} has same values-{round(same_ratio*100,2)}')
        low_info_cols.append(col)
    # elif, not a second if: a column that satisfies both conditions (e.g. in
    # a one-row frame) must be queued only once; dropping it twice raised
    # KeyError.
    elif unique_ratio > 0.95:
        print(f'Feature-{col} has unique values-{round(unique_ratio*100,2)}')
        low_info_cols.append(col)
dia = dia.drop(columns=low_info_cols)
dia.head(5)

Feature-Gender has same values-100.0


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58,33,190,34.0,0.43,43,0
1,2,112,75,32,0,35.7,0.148,21,0
2,2,108,64,0,0,30.8,0.158,21,0
3,8,107,80,0,0,24.6,0.856,34,0
4,7,136,90,0,0,29.9,0.21,50,0


### find NaN values in data

In [162]:
# Zeros in these columns are treated as missing values and converted to NaN.
# A column qualifies when it contains at least one zero. The previous test,
# `dia[col].all() == 0`, gave the same answer only through bool/int coercion
# and read as "all values are zero".
has_null = [col for col in dia.columns if (dia[col] == 0).any()]
print(has_null)
# 'Outcome' is excluded so its zeros are kept as real values. Filtering
# (instead of list.remove) does not raise if 'Outcome' is not in the list.
has_null = [col for col in has_null if col != 'Outcome']
print(has_null)
# NOTE(review): 'Pregnancies' is in this list, so a count of 0 is also turned
# into NaN - confirm that this is intended.
for col in has_null:
    # Vectorised replacement of zeros with NaN (mask() fills with NaN by default).
    dia[col] = dia[col].mask(dia[col] == 0)

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Outcome']
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']


In [166]:
# Drop sparse columns: a column survives only if at least 70 % of its rows
# hold a non-null value.
thresh = 0.7 * len(dia)
dia = dia.dropna(axis='columns', thresh=thresh)
print(len(dia.columns))
dia.head()

8


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,98.0,58.0,33.0,34.0,0.43,43,0
1,2.0,112.0,75.0,32.0,35.7,0.148,21,0
2,2.0,108.0,64.0,,30.8,0.158,21,0
3,8.0,107.0,80.0,,24.6,0.856,34,0
4,7.0,136.0,90.0,,29.9,0.21,50,0


In [165]:
# Drop sparse rows: a row survives only if it has at least m - 2 non-null
# values, i.e. at most two missing values.
m = len(dia.columns)
dia = dia.dropna(axis='index', thresh=m - 2)
print(len(dia))

752


In [168]:
# Impute the remaining gaps: every column that still contains NaN gets its
# own median as the fill value.
has_nan = dia.columns[dia.isna().any()].tolist()
for col in has_nan:
    column_median = dia[col].median()
    dia[col] = dia[col].fillna(column_median)
dia['SkinThickness'].mean()

np.float64(29.11037234042553)

In [169]:
# Sanity check: list the columns that still contain NaN.
has_nan = dia.columns[dia.isna().any()].tolist()
has_nan

[]

### find anomalous values (outliers)

In [174]:
def boxplot_function(data, column, log_scale=False, left=1.5, right=1.5):
    """Split ``data`` into outliers and inliers using IQR fences on ``column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    column : label
        Column whose values are tested against the fences.
    log_scale : bool, default False
        If True, the natural logarithm of the column is used to compute and
        apply the fences. No offset is added, so non-positive values do not
        produce finite logarithms.
    left, right : float, default 1.5
        Number of IQRs below the first quartile / above the third quartile
        at which the lower / upper fence is placed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(outliers, cleaned)``: rows strictly outside the fences and rows
        inside them (fences included). Rows whose tested value is NaN end up
        in neither frame.
    """
    values = np.log(data[column]) if log_scale else data[column]
    # Series.quantile skips NaN. np.quantile returns NaN as soon as a single
    # value is missing, which made every comparison False and silently
    # returned two empty frames.
    q_25 = values.quantile(0.25)
    q_75 = values.quantile(0.75)
    iqr = q_75 - q_25
    left_bound = q_25 - left * iqr
    right_bound = q_75 + right * iqr
    outliers = data[(values < left_bound) | (values > right_bound)]
    cleaned = data[(values >= left_bound) & (values <= right_bound)]
    return outliers, cleaned

In [176]:
# IQR-based outlier search on SkinThickness with 1.5 * IQR fences on the raw
# (non-log) values; the cell shows the number of flagged rows.
outliers, cleaned = boxplot_function(
    data=dia,
    column='SkinThickness',
    log_scale=False,
    left=1.5,
    right=1.5,
)
len(outliers)

81

In [178]:
def outliers_z_score(data, column, log_scale=False):
    """Split ``data`` into outliers and inliers with the three-sigma rule.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    column : label
        Column whose values are tested.
    log_scale : bool, default False
        If True, ``log(value + 1)`` is tested instead of the raw value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(outliers, cleaned)``: rows farther than three standard deviations
        from the mean, and rows within that band (bounds included).
    """
    values = np.log(data[column] + 1) if log_scale else data[column]

    center = values.mean()
    spread = values.std()
    lower_bound, upper_bound = center - 3 * spread, center + 3 * spread

    too_low = values < lower_bound
    too_high = values > upper_bound
    within = (values >= lower_bound) & (values <= upper_bound)
    return data[too_low | too_high], data[within]

# Three-sigma outlier search on the raw SkinThickness values; the cell shows
# the number of flagged rows.
outliers, cleaned = outliers_z_score(
    data=dia,
    column='SkinThickness',
    log_scale=False,
)
len(outliers)

4

In [None]:
def outliers_iqr_mod(data, feature, log_scale=False, left=3, right=3):
    """Split ``data`` into outliers and inliers using IQR fences on ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    feature : label
        Column whose values are tested against the fences.
    log_scale : bool, default False
        If True, the natural logarithm of the column is tested instead of the
        raw values (no offset is added before taking the logarithm).
    left, right : float, default 3
        Number of IQRs below the first quartile / above the third quartile
        at which the lower / upper fence is placed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(outliers, cleaned)``: rows strictly outside the fences and rows
        inside them (fences included).
    """
    values = np.log(data[feature]) if log_scale else data[feature]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (iqr * left)
    upper_bound = q3 + (iqr * right)

    below = values < lower_bound
    above = values > upper_bound
    inside = (values >= lower_bound) & (values <= upper_bound)
    return data[below | above], data[inside]

# Run the same 1.5 * IQR search on DiabetesPedigreeFunction twice, once on
# the raw values and once on their logarithm, and print how many more rows
# the raw-scale search flags.
outliers_c, cleaned_c = outliers_iqr_mod(
    data=dia,
    feature='DiabetesPedigreeFunction',
    log_scale=False,
    left=1.5,
    right=1.5,
)
outliers_l, cleaned_l = outliers_iqr_mod(
    data=dia,
    feature='DiabetesPedigreeFunction',
    log_scale=True,
    left=1.5,
    right=1.5,
)
print(len(outliers_c) - len(outliers_l))

29
