# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file.

In [2]:
# import pandas and load cancer data
import pandas as pd

cancer = pd.read_csv("cancer_data_means.csv")

# per-column flag: True when the column holds at least one missing value
cancer.isna().any()

id                        False
diagnosis                 False
radius_mean               False
texture_mean               True
perimeter_mean            False
area_mean                 False
smoothness_mean            True
compactness_mean          False
concavity_mean            False
concave_points_mean       False
symmetry_mean              True
fractal_dimension_mean    False
dtype: bool

In [3]:
# use the mean to fill in missing values
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on cancer[col]: that form is chained assignment,
# which emits a FutureWarning in pandas 2.x and leaves `cancer` untouched
# once Copy-on-Write is enabled.
# Series.mean() skips NaN by default, so each fill value is the mean of the
# observed entries of that column.
for column in ['texture_mean', 'smoothness_mean', 'symmetry_mean']:
    cancer[column] = cancer[column].fillna(cancer[column].mean())

# confirm your correction
cancer.isnull().any()

id                        False
diagnosis                 False
radius_mean               False
texture_mean              False
perimeter_mean            False
area_mean                 False
smoothness_mean           False
compactness_mean          False
concavity_mean            False
concave_points_mean       False
symmetry_mean             False
fractal_dimension_mean    False
dtype: bool

In [4]:
# how many duplicates are there?
# duplicated() marks each row that repeats an earlier row; the vectorised
# Series.sum() counts the True flags without a Python-level loop.
cancer.duplicated().sum()

5

In [5]:
# drop duplicates (keeps the first occurrence of each repeated row)
# Rebinding the name instead of using inplace=True keeps the step chainable
# and makes the data lineage explicit.
cancer = cancer.drop_duplicates()

In [6]:
# confirm correction by rechecking for duplicates in the data
# vectorised count of rows still flagged as duplicates (expected: 0)
cancer.duplicated().sum()

0

## Renaming Columns
Since we previously reduced our dataset to include only the means of the tumor features, the "_mean" suffix at the end of each feature name is unnecessary; it just adds extra typing in our later analysis. Rename the columns of the dataframe to remove "_mean".

In [7]:
# rename the columns of the dataframe (remove _mean from the name of each column if found)
# Only a trailing "_mean" is stripped, so a name that merely contains the
# substring elsewhere is left intact; plain string slicing also avoids the
# version-dependent `regex` default of Index.str.replace.
suffix = "_mean"
cancer.columns = [
    name[:-len(suffix)] if name.endswith(suffix) else name
    for name in cancer.columns
]

# listing the per-column null flags doubles as a view of the new column names
cancer.isnull().any()

id                   False
diagnosis            False
radius               False
texture              False
perimeter            False
area                 False
smoothness           False
compactness          False
concavity            False
concave_points       False
symmetry             False
fractal_dimension    False
dtype: bool

In [8]:
# display first few rows of the dataframe to confirm changes
# (five rows, which is head()'s default, spelled out explicitly)
cancer.head(n=5)

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,symmetry,fractal_dimension
0,842302,M,17.99,19.293431,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,0.096087,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [11]:
# save this for later as a csv file named "cancer_data_edited.csv"
# Why index=False: the row labels are not a data column, and after
# drop_duplicates they may no longer be consecutive. Writing them would add an
# extra unnamed column that shows up when the file is read back.
output_file = "cancer_data_edited.csv"
cancer.to_csv(output_file, index=False)