Importing Dependencies

In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

Loading and cleaning the Data

In [12]:
# Load the dataset CSV into a pandas DataFrame.
# NOTE: this is an absolute, machine-specific Windows path; the cell only runs
# as-is on a machine that has the file at exactly this location.
data = pd.read_csv(r'C:\Users\user\ A DATA SCIENCE 101\Breast Cancer Wisconsin\Model\data.csv')

In [13]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [14]:
def clean_data(data_path=r'C:\Users\user\ A DATA SCIENCE 101\Breast Cancer Wisconsin\Model\data.csv'):
    """Load the dataset CSV and return a cleaned DataFrame.

    Cleaning consists of dropping the 'id' and 'Unnamed: 32' columns and
    encoding the 'diagnosis' column as integers ('M' -> 1, 'B' -> 0).

    Parameters
    ----------
    data_path : str, optional
        Location of the CSV file. Defaults to the absolute path this notebook
        was originally written against, so existing ``clean_data()`` calls
        keep working.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with 'diagnosis' holding 0/1 values.

    Raises
    ------
    ValueError
        If 'diagnosis' contains a value other than 'M' or 'B' (including
        missing values). ``Series.map`` would otherwise turn such rows into
        NaN without any warning.
    """
    data = pd.read_csv(data_path)

    data = data.drop(['id', 'Unnamed: 32'], axis=1)

    labels = data['diagnosis'].map({'M': 1, 'B': 0})

    # Unmapped labels come back as NaN; fail loudly instead of passing them on.
    unmapped = labels.isna()
    if unmapped.any():
        unknown = sorted(data.loc[unmapped, 'diagnosis'].astype(str).unique())
        raise ValueError(f"Unexpected diagnosis label(s): {unknown}")

    data['diagnosis'] = labels

    return data


    
    

In [15]:
def main():
    """Run the cleaning step and print the first rows of its result."""
    cleaned = clean_data()
    preview = cleaned.head()
    print(preview)


main()

   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          1        17.99         10.38          122.80     1001.0   
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
3          1        11.42         20.38           77.58      386.1   
4          1        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.2419  ...         25.38          

Creating the model (Scaling, Training and Testing)

In [18]:
def creating_the_model(data_path=r'C:\Users\user\ A DATA SCIENCE 101\Breast Cancer Wisconsin\Model\data.csv'):
    """Load the dataset CSV and split it into features and target.

    The 'id' and 'Unnamed: 32' columns are dropped and 'diagnosis' is encoded
    as integers ('M' -> 1, 'B' -> 0) before the split.

    NOTE: a later cell in this notebook redefines ``creating_the_model`` with
    a different return value (model, Scaler); once that cell has run, this
    definition is shadowed.

    Parameters
    ----------
    data_path : str, optional
        Location of the CSV file. Defaults to the absolute path this notebook
        was originally written against, so existing no-argument calls keep
        working.

    Returns
    -------
    tuple
        (X, Y): X is the DataFrame of every column except 'diagnosis',
        Y is the encoded 'diagnosis' Series.
    """
    data = pd.read_csv(data_path)

    data = data.drop(['id', 'Unnamed: 32'], axis=1)

    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

    X = data.drop(['diagnosis'], axis=1)
    Y = data['diagnosis']

    return X, Y
    
  

In [19]:
def main():
    """Print the first rows of the target series, then of the feature frame."""
    features, target = creating_the_model()

    # Same order as before: target preview first, feature preview second.
    for heading, values in (('Y features', target), ('X features', features)):
        print(heading)
        print(values.head())


main()

Y features
0    1
1    1
2    1
3    1
4    1
Name: diagnosis, dtype: int64
X features
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   fr

In [20]:

def creating_the_model(data_path=r'C:\Users\user\ A DATA SCIENCE 101\Breast Cancer Wisconsin\Model\data.csv'):
    """Train and evaluate a logistic-regression classifier on the dataset CSV.

    Steps: read the CSV, drop the 'id' and 'Unnamed: 32' columns, encode
    'diagnosis' ('M' -> 1, 'B' -> 0), hold out 20% of the rows as a test set,
    standardize the features, fit ``LogisticRegression`` and print the test
    accuracy and classification report.

    The scaler is fitted on the training rows only and then applied to the
    test rows. Fitting it on the full dataset before splitting would let
    test-set means and variances leak into training and into the reported
    metrics.

    Parameters
    ----------
    data_path : str, optional
        Location of the CSV file. Defaults to the absolute path this notebook
        was originally written against, so existing no-argument calls keep
        working.

    Returns
    -------
    tuple
        (model, Scaler): the fitted ``LogisticRegression`` and the
        ``StandardScaler`` fitted on the training rows.
    """
    data = pd.read_csv(data_path)

    data = data.drop(['id', 'Unnamed: 32'], axis=1)

    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})

    X = data.drop(['diagnosis'], axis=1)
    Y = data['diagnosis']

    # Splitting the data first. The rows selected depend only on the number of
    # samples and random_state, so the held-out rows are the same as before.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)

    # Scaling the data: learn mean/variance from the training rows only, then
    # apply that same transform to the held-out rows.
    Scaler = StandardScaler()
    X_train = Scaler.fit_transform(X_train)
    X_test = Scaler.transform(X_test)

    # Training the model
    model = LogisticRegression()
    model.fit(X_train, Y_train)

    # Testing the model
    predict = model.predict(X_test)

    # Accuracy
    Accuracy = accuracy_score(Y_test, predict)
    print('The accuracy of our model is:', Accuracy)

    # Report
    class_report = classification_report(Y_test, predict)
    print('The classification report:', class_report)

    return model, Scaler
    

In [21]:
def main():
    """Call creating_the_model() and unpack its two return values.

    Nothing further is done with the returned objects in this cell.
    """
    result = creating_the_model()
    trained_model, fitted_scaler = result


main()


The accuracy of our model is: 0.9736842105263158
The classification report:               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



Exporting the model and scaler

In [22]:
import pickle

In [24]:
def main():
    """Pickle the model and scaler returned by creating_the_model() to two files."""
    model, Scaler = creating_the_model()

    # (destination path, object to serialize); written in this order.
    artifacts = (
        (r'C:\Users\user\ A DATA SCIENCE 101\Breast Cancer Wisconsin\Model/model.pkl', model),
        (r'C:\Users\user\ A DATA SCIENCE 101\Breast Cancer Wisconsin\Model/Scaler.pkl', Scaler),
    )

    for destination, artifact in artifacts:
        with open(destination, 'wb') as handle:
            pickle.dump(artifact, handle)


main()


The accuracy of our model is: 0.9736842105263158
The classification report:               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

