# Data Cleaning In Python

### Common Issues With Data
+ Reading the file
+ Inconsistent Column Names
+ Missing Data
+ Different Data Types 
+ Duplicate rows
+ etc


In [None]:
# EDA packages
import pandas as pd
import numpy as np

## Loading or Reading the File
+ Encoding Error
+ Inconsistent rows  

In [None]:
# Switch into the folder that holds the CSV files used by the cells below.
import os

# Guard the chdir so that re-running this cell does not fail: after the first
# run the working directory is already "DataSet", and a second
# os.chdir("DataSet") would look for a nested DataSet/DataSet folder.
if os.path.basename(os.getcwd()) != "DataSet":
    os.chdir("DataSet")
print("current working directory",os.getcwd())

In [None]:
# Issue 1
# Read the raw file with read_csv's default encoding (UTF-8).
# NOTE(review): this looks like the deliberate "Encoding Error" demonstration —
# it presumably raises UnicodeDecodeError on this file; confirm against the data.
df = pd.read_csv("unclean_data.csv")

In [None]:
# Solution 1
# Pass an explicit encoding: latin1 (ISO-8859-1) maps every byte to a
# character, so decoding cannot fail the way a UTF-8 read can.
df1 = pd.read_csv("unclean_data.csv",encoding='latin1')

In [None]:
# Preview the first five rows to check that the file was parsed.
df1.head()

In [None]:
# Solution 2
# Alternatively, re-save the file from a text editor in a known encoding
# (UTF-8, or ISO-8859-1/latin1) and pass that same encoding to read_csv.
# NOTE(review): unclean_data1.csv is presumably the copy re-saved as UTF-8 — confirm.
df = pd.read_csv("unclean_data1.csv",encoding='utf8')

In [None]:
# Preview the first five rows of this file.
df.head()

## Inconsistent Column Names
+ Change Cases
+ Rename them

### Change the case to Upper


In [None]:
# Column labels as read from the file.
df.columns

In [None]:
# str.upper() returns a new, upper-cased Index; df itself is not modified.
df.columns.str.upper()

In [None]:
# Still the original labels, because the result above was not assigned.
df.columns

In [None]:
# Assign the upper-cased labels back so the change sticks.
df.columns = df.columns.str.upper()

In [None]:
# The labels are now upper case.
df.columns

### Renaming Columns

In [None]:
# rename() returns a new DataFrame with the DURATION column relabelled as TIME.
# The result is only displayed here, not assigned, so df keeps its DURATION
# column (later cells still read df['DURATION']).
df.rename(columns = {'DURATION':'TIME'})

## Missing Data
+ Add a default value for missing data or use mean to fill it
+ Delete the row/column with missing data
+ Interpolate the rows
+ Replace

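#### To check for missing data
#### In the boolean output, False means no missing data
+ `df.isnull().sum()` – number of missing values per column (int)
+ `df.isnull().any()` – whether each column has any missing value (bool)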

In [None]:
# Element-wise mask: True where a value is missing.
df.isnull()

In [None]:
# One boolean per column: True if the column has at least one missing value.
df.isnull().any()

In [None]:
# Columns with NaN using True/False
# False means the column has no NaN
df.isnull().any()

In [None]:
# For the entire DataFrame: a single True/False
df.isnull().any().any()

In [None]:
# Number of NaN values per column (integer counts)
df.isnull().sum()

In [None]:
# Total number of missing values in the whole DataFrame
df.isnull().sum().sum()

### Adding A Default Value or Filling the Missing Data

In [None]:
df.head()

In [None]:
df_with_0 = df.fillna(0)

In [None]:
df_with_0.head()

#### Fill it with the mean

In [None]:
# Fill it with the mean
df['DURATION'].mean()

In [None]:
df_with_mean = df.DURATION.fillna(df['DURATION'].mean())

In [None]:
df_with_mean

### Dropping NA

In [None]:
## Dropping NA
df.head()

In [None]:
# Total number of missing values before dropping.
df.isnull().sum().sum()

In [None]:
# (rows, columns) before dropping.
df.shape

In [None]:
# dropna() with default arguments drops every row that contains at least one
# NaN and returns a new DataFrame; df is not modified.
df_drop = df.dropna()

In [None]:
# (rows, columns) after dropping.
df_drop.shape

In [None]:
df_drop.head()

In [None]:
?df.dropna()

In [None]:
df_drop_with_condition = df.dropna(thresh=2)

In [None]:
df_drop_with_condition.shape

In [None]:
df.shape

In [None]:
df_drop_column = df.dropna(axis=1)

In [None]:
df_drop_column.shape

In [None]:
# THANKS A LOT

# DATA CLEANING IN PYTHON

## Dropping Duplicates
 + drop_duplicates()
 + keep='first'

In [None]:
# Reload the data for the duplicates section.
df = pd.read_csv("unclean_data1.csv",encoding='utf8')
# The reload brings back the column labels as stored in the file, so re-apply
# the upper-casing: the cells below address columns by their upper-case names
# (MOVIE_TITLE, GROSS). This is a no-op if the labels are already upper case.
df.columns = df.columns.str.upper()

In [None]:
df.head(10)

In [None]:
df.duplicated()

In [None]:
df.duplicated('MOVIE_TITLE')

In [None]:
df.head(10)

In [None]:
df.shape

In [None]:
df_drop_dup = df.drop_duplicates('MOVIE_TITLE')

In [None]:
df_drop_dup


## Data Types Inconsistencies
 + Change datatype after reading the csv
 + Change datatype before reading the csv
   + `pd.read_csv(url, dtype={'column1':float})`

In [None]:
# Illustrates fixing a column's dtype at read time through the dtype mapping.
# NOTE(review): 'file.csv' and 'column1' look like placeholders rather than part
# of this dataset. As written the cell rebinds df (or raises if file.csv is
# absent), while the following cells expect df to have a GROSS column — confirm
# the intended file and column name.
df = pd.read_csv('file.csv', dtype={'column1':float})

In [None]:
df.dtypes

In [None]:
df.GROSS.dtypes

In [None]:
?df.astype

In [None]:
df.GROSS.astype(float).dtypes

In [None]:
# THANKS A LOT