# **Data Cleaning**: Pandas help to clean DataFrames efficiently.           
cleaning and pre processing are essential steps in Data analysis.

# Handling Missing Data. 
1. use of df.isna() or df.isnull() to identify missing values and df.isna().sum() shows how many missing values.
2. Handle: df.dropna() removes rows with any missing values.. use of df.dropna(subset=["column_name"]) to target specific conditions.
3. df.fillna(value) replaces missing values with a constant (eg df.fillna(0)). for smarter fills, use df.fillna(method="ffill") (forward fill) or df.fillna(df.mean()) to use column means.

Missing values can skew results. knowing how to detect and either drop or imute them covers most scenarios. 

In [94]:
import pandas as pd 
penguins_data=pd.read_csv("..\Datasets\penguins.csv")
penguins_data.isnull().sum() # to count all the missing values. 

rowid                 0
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [95]:
# repalace missing values in column
penguins_data["sex"]=penguins_data["sex"].fillna("female")
penguins_data.isnull().sum()

rowid                0
species              0
island               0
bill_length_mm       2
bill_depth_mm        2
flipper_length_mm    2
body_mass_g          2
sex                  0
year                 0
dtype: int64

In [96]:
penguins_data=pd.read_csv("..\Datasets\penguins.csv")
penguins_data=penguins_data.dropna(subset=["sex"])
penguins_data.shape

# after dropping only 333 records remained. others are dropped because they had null values. 

(333, 9)

In [97]:
penguins_data=pd.read_csv("..\Datasets\penguins.csv")
median_flipper=penguins_data["flipper_length_mm"].median()
median_flipper

# it is 197mm. 

np.float64(197.0)

In [98]:
# now filling with median_flipper
penguins_data["flipper_length_mm"]=penguins_data["flipper_length_mm"].fillna(median_flipper)
penguins_data

# now where values were null it is filled with 197mm 

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,197.0,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...,...
339,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


# Finding and Removing Duplicates:
Duplicate rows can inflate data and bias analysis. 
1. Detect: df.duplicated() flags duplicate rows (returns True/false). use df.duplicated.sum() to count them.
2. Remove: df.drop_duplicates() removes duplicates rows, keeping the fist occurence by default. 
Duplicates are common in real world data. (eg from merging datasets). removing them ensures your analysis reflects unique records, critical for accurate counts or aggregations. 

In [99]:
import pandas as pd
penguins_data=pd.read_csv("..\Datasets\penguins.csv")
penguins_data.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
339    False
340    False
341    False
342    False
343    False
Length: 344, dtype: bool

In [100]:
# since penguins.csv doesnot contains duplicates and it is clean i will create a dataframe which have  duplicate entries. 
import pandas as pd

# Create a sample DataFrame with duplicate rows
data = {
    "species": ["Adelie", "Adelie", "Chinstrap", "Gentoo", "Gentoo", "Adelie", "Chinstrap", "Gentoo"],
    "island": ["Torgersen", "Torgersen", "Dream", "Biscoe", "Biscoe", "Torgersen", "Dream", "Biscoe"],
    "bill_length_mm": [39.1, 39.1, 46.5, 50.0, 50.0, 39.1, 46.5, 50.0],
    "bill_depth_mm": [18.7, 18.7, 17.9, 15.3, 15.3, 18.7, 17.9, 15.3],
    "flipper_length_mm": [181, 181, 195, 220, 220, 181, 195, 220],
    "body_mass_g": [3750, 3750, 3800, 5000, 5000, 3750, 3800, 5000],
    "sex": ["Male", "Male", "Female", "Male", "Male", "Male", "Female", "Male"]
}

penguins_duplicate = pd.DataFrame(data)
penguins_duplicate

# this dataframe have 8 records and contains duplicates.

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,Male
1,Adelie,Torgersen,39.1,18.7,181,3750,Male
2,Chinstrap,Dream,46.5,17.9,195,3800,Female
3,Gentoo,Biscoe,50.0,15.3,220,5000,Male
4,Gentoo,Biscoe,50.0,15.3,220,5000,Male
5,Adelie,Torgersen,39.1,18.7,181,3750,Male
6,Chinstrap,Dream,46.5,17.9,195,3800,Female
7,Gentoo,Biscoe,50.0,15.3,220,5000,Male


In [101]:
# to identify duplicate rows
duplicate_rows=penguins_duplicate.duplicated()
# from this we can find that 1,4,5,6 records are duplicates.
duplicate_rows

0    False
1     True
2    False
3    False
4     True
5     True
6     True
7     True
dtype: bool

In [102]:
duplicate_rows=penguins_duplicate[penguins_duplicate.duplicated()]
duplicate_rows
# it allows us to see duplicate rows and help to handle them. 

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
1,Adelie,Torgersen,39.1,18.7,181,3750,Male
4,Gentoo,Biscoe,50.0,15.3,220,5000,Male
5,Adelie,Torgersen,39.1,18.7,181,3750,Male
6,Chinstrap,Dream,46.5,17.9,195,3800,Female
7,Gentoo,Biscoe,50.0,15.3,220,5000,Male


In [103]:
# remove them using drop_duplicates() methods.
clean_penguins=penguins_duplicate.drop_duplicates()
clean_penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,Male
2,Chinstrap,Dream,46.5,17.9,195,3800,Female
3,Gentoo,Biscoe,50.0,15.3,220,5000,Male


In [104]:
# in some cases duplicate might be in some rows or columns, when we want to specifically apply to them. 
import pandas as pd

data = {
    "species": ["Adelie", "Adelie", "Chinstrap", "Gentoo", "Gentoo", "Adelie", "Chinstrap", "Gentoo"],
    "island": ["Torgersen", "Torgersen", "Dream", "Biscoe", "Biscoe", "Torgersen", "Dream", "Biscoe"],
    "bill_length_mm": [39.1, 39.1, 46.5, 50.0, 48.0, 39.1, 45.0, 50.0],  # Note differences here
    "bill_depth_mm": [18.7, 18.5, 17.9, 15.3, 15.0, 18.7, 17.8, 15.3],   # Small differences here
    "flipper_length_mm": [181, 181, 195, 220, 221, 182, 195, 220],       # Slight variation
    "body_mass_g": [3750, 3750, 3800, 5000, 4900, 3750, 3800, 5000],    # Slight difference
    "sex": ["Male", "Female", "Female", "Male", "Male", "Male", "Female", "Male"]
}

df = pd.DataFrame(data)
print(df)


     species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0     Adelie  Torgersen            39.1           18.7                181   
1     Adelie  Torgersen            39.1           18.5                181   
2  Chinstrap      Dream            46.5           17.9                195   
3     Gentoo     Biscoe            50.0           15.3                220   
4     Gentoo     Biscoe            48.0           15.0                221   
5     Adelie  Torgersen            39.1           18.7                182   
6  Chinstrap      Dream            45.0           17.8                195   
7     Gentoo     Biscoe            50.0           15.3                220   

   body_mass_g     sex  
0         3750    Male  
1         3750  Female  
2         3800  Female  
3         5000    Male  
4         4900    Male  
5         3750    Male  
6         3800  Female  
7         5000    Male  


In [105]:
# using key attributes
df.drop_duplicates(subset=["bill_length_mm","body_mass_g"],keep="last") # it will keep last one and discard the first one.

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
2,Chinstrap,Dream,46.5,17.9,195,3800,Female
4,Gentoo,Biscoe,48.0,15.0,221,4900,Male
5,Adelie,Torgersen,39.1,18.7,182,3750,Male
6,Chinstrap,Dream,45.0,17.8,195,3800,Female
7,Gentoo,Biscoe,50.0,15.3,220,5000,Male


# Renaming and Reorder columns: it address clarity and organization of analysis, modeling etc. 

In [None]:
import pandas as pd
df = pd.read_csv("../Datasets/penguins.csv")
df.drop(index=0, inplace=True)  # removing column names to do further learnings. 
df.to_csv("../Datasets/no_header_penguins.csv", index=False, header=False) 

no_header_penguins=pd.read_csv("../Datasets/no_header_penguins.csv",
                            header=None,
                            names=["id","species","length","breadth","height","weight","sex","DOB"] #giving random names for columns for practise. 
) # give system generated header or column names  since we removed column names. 
no_header_penguins.head()

Unnamed: 0,id,species,length,breadth,height,weight,sex,DOB
2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,,,,,,2007
5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
6,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [None]:
import pandas as pd
df = pd.read_csv("../Datasets/penguins.csv")
df.columns

#lets say we want to change column name "sex" into "Gender". so we use rename methods.
penguins_renamed=df.rename(columns={"sex":"Gender"})
penguins_renamed

# the column name "sex" changed into "Gender". 

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Gender,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...,...
339,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [116]:
# Reorder the columns: 
df.columns

Index(['rowid', 'species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
      dtype='object')

In [118]:
#copying column names from df.columns and reordering them according to the needs
column_names_reordered=['rowid', 'species', 'sex',  'year', 'island', 'bill_length_mm', 'bill_depth_mm',
    'flipper_length_mm', 'body_mass_g']

column_names_reordered=df[column_names_reordered] # passing list where we have stored column names
column_names_reordered

Unnamed: 0,rowid,species,sex,year,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,1,Adelie,male,2007,Torgersen,39.1,18.7,181.0,3750.0
1,2,Adelie,female,2007,Torgersen,39.5,17.4,186.0,3800.0
2,3,Adelie,female,2007,Torgersen,40.3,18.0,195.0,3250.0
3,4,Adelie,,2007,Torgersen,,,,
4,5,Adelie,female,2007,Torgersen,36.7,19.3,193.0,3450.0
...,...,...,...,...,...,...,...,...,...
339,340,Chinstrap,male,2009,Dream,55.8,19.8,207.0,4000.0
340,341,Chinstrap,female,2009,Dream,43.5,18.1,202.0,3400.0
341,342,Chinstrap,male,2009,Dream,49.6,18.2,193.0,3775.0
342,343,Chinstrap,male,2009,Dream,50.8,19.0,210.0,4100.0


In [121]:
# all column names in uppercase 
penguins_upper=df.rename(columns=str.upper).head(n=2)
penguins_upper

Unnamed: 0,ROWID,SPECIES,ISLAND,BILL_LENGTH_MM,BILL_DEPTH_MM,FLIPPER_LENGTH_MM,BODY_MASS_G,SEX,YEAR
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007


# **Fixing Datatypes and incosistent formulas** : 
Incorrect Datatypes (eg numbered stored as strings) or incosistent formats (eg dates vlikes "01/02/23" vs "2023-02-01") can break analysis.

1. check Types: df.dtypes shows each columns data type.
2. convert types: use df["column"].astype(type) (eg df["age"].astype(int)) to change types. 
3. Standardisze strings: for text, use df["column].str.lower to lowercase or df["column"].str.strip() to remove extra spaces. Replace incosistent values with df["column"].replace({"old_value":"new_value"}).  Example: If a "Date" column has mixed formats, df["Date"] = pd.to_datetime(df["Date"]) unifies them. If "Gender" has "M", "Male", "m", use df["Gender"].replace({"M": "Male", "m": "Male"}).
<br>
correct data types ensue operation (eg.. maths,sortings) work as expected. Standardizing formats prevents errors in grouping or filtering especially with text or dates. 

In [4]:
# Handling Datatypes 
import pandas as pd
data = {
    'species': ['Adelie', 'Gentoo', 'Chinstrap'],
    'birth_date': ['2018-07-01', '2019-01-20', '2017-12-15'],  # dates as strings initially
    'flipper_length_mm': [181, 210, 195],
    'body_mass_g': [3750, 5000, 3800]
}
df = pd.DataFrame(data)
df.dtypes
#or df.info()

species              object
birth_date           object
flipper_length_mm     int64
body_mass_g           int64
dtype: object

In [9]:
df["birth_date"]=pd.to_datetime(df["birth_date"])
#pd.to numeric (for string to number) as per out requirements.
df.dtypes

species                      object
birth_date           datetime64[ns]
flipper_length_mm             int64
body_mass_g                   int64
dtype: object

In [None]:
df["flipper_length_mm"]=df["flipper_length_mm"].astype(str) # converted integer into string.  
#we can do .astype("category") which are efficient for memory and processing. 
df.dtypes

species                      object
birth_date           datetime64[ns]
flipper_length_mm            object
body_mass_g                   int64
dtype: object

In [16]:
#we can do .astype("category") which are efficient for memory and processing. 
df["species"]=df["species"].astype("category")
df.dtypes

species                    category
birth_date           datetime64[ns]
flipper_length_mm            object
body_mass_g                   int64
dtype: object

In [17]:
# to count repeated values.
df.nunique()

species              3
birth_date           3
flipper_length_mm    3
body_mass_g          3
dtype: int64

In [None]:
# filter and sort based on date based on real life examples
import pandas as pd

# Sample DataFrame including elephants
data = {
    'species': ['Elephant', 'Penguin', 'Elephant', 'Lion'],
    'birth_date': pd.to_datetime(['2010-05-20', '2015-06-10', '2012-09-15', '2013-04-01']),
    'weight_kg': [5000, 30, 5200, 190]
}

df = pd.DataFrame(data)
df

Unnamed: 0,species,birth_date,weight_kg
0,Elephant,2010-05-20,5000
1,Penguin,2015-06-10,30
2,Elephant,2012-09-15,5200
3,Lion,2013-04-01,190


In [22]:
elephants=df[df["species"]=="Elephant"].sort_values(by="birth_date")
elephants

Unnamed: 0,species,birth_date,weight_kg
0,Elephant,2010-05-20,5000
2,Elephant,2012-09-15,5200
