# Cleaning Data

## Data Type Constraints

In [9]:
import pandas as pd
# Load the sleep-health dataset from the shared Datasets folder.
df = pd.read_csv('../Datasets/Sleep_health_and_lifestyle_dataset.csv')

#~ Find duplicates using .duplicated()
# keep=False marks every member of a duplicate group, not only the repeats.
# NOTE(review): 'Person ID' looks like a per-row identifier; if it is unique,
# no row can ever be flagged with it in the subset - confirm this is intended.
column_names = ['Occupation', 'Person ID', 'Heart Rate']
duplicates = df.duplicated(subset=column_names, keep=False)
df.loc[duplicates] #* No duplicates found


Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder


## Text and Categorical Data Problems

In [25]:

#~ Membership Constraints - a categorical column may only hold values from a known set
#& Anti Join A and B - In A but not in B
#& Inner Join A and B - Intersection of A and B
#* Find inconsistent categories
# Reference table listing the only values 'Gender' is allowed to take.
categories = pd.DataFrame({'Gender': ['Male', 'Female']})
# Anti join: values present in the data but absent from the reference table.
inconsistent_categories = set(df['Gender']) - set(categories['Gender'])
# Boolean mask of the rows carrying one of those unexpected values.
inconsistent_rows = df['Gender'].isin(inconsistent_categories)
df.loc[inconsistent_rows]

#~ Categorical variables
#& Value consistency - Format values consistently within each column
# Only Series has .unique(); calling it on a DataFrame raises AttributeError,
# so gather the distinct values one column at a time.
unique_values = {column: df[column].unique() for column in df.columns}
unique_values

#! DO NOT RUN
# Reference snippet: `airlines` is not defined anywhere in this notebook.
# Create ranges for categories. float('inf') gives an open-ended last bin
# without relying on numpy, which this notebook never imports.
label_ranges = [0, 60, 180, float('inf')]
label_names = ['short', 'medium', 'long']
# Create wait_type column by binning wait_min into the labelled ranges
airlines['wait_type'] = pd.cut(airlines['wait_min'], bins = label_ranges, 
                                labels = label_names )
# Create mappings and replace: collapse each day name into weekday/weekend
mappings = {'Monday':'weekday', 'Tuesday':'weekday', 'Wednesday': 'weekday', 
            'Thursday': 'weekday', 'Friday': 'weekday', 
            'Saturday': 'weekend', 'Sunday': 'weekend'}
airlines['day_week'] = airlines['day'].replace(mappings)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder


## Advanced Data Problems

In [None]:

#~ Uniformity
#~! DO NOT RUN
# Reference snippet: `banking` is not defined anywhere in this notebook.
# Print the header of account_opened
print(banking['account_opened'].head())
# Convert account_opened to datetime.
# infer_datetime_format is deprecated since pandas 2.0 (format inference is
# the default behaviour), so it is no longer passed.
banking['account_opened'] = pd.to_datetime(banking['account_opened'],
                                           # Return missing value for error
                                           errors = 'coerce') 
# Get year of account opened (strftime yields the year as text)
banking['acct_year'] = banking['account_opened'].dt.strftime('%Y')
# Print acct_year
print(banking['acct_year'])

#~ Completeness
#! DO NOT RUN
# Reference snippet: `banking` is not defined anywhere in this notebook.
import missingno as msno
import matplotlib.pyplot as plt
# Count the missing values in each column of banking
print(banking.isna().sum())
# Draw the missingness matrix for the frame in its current row order
msno.matrix(banking)
plt.show()
# Split the rows on whether inv_amount is missing
inv_amount_missing = banking['inv_amount'].isna()
missing_investors = banking[inv_amount_missing]
investors = banking[~inv_amount_missing]
# Order the rows by age and draw the matrix again
banking_sorted = banking.sort_values(by='age')
msno.matrix(banking_sorted)
plt.show()

## Record Linkage

In [None]:

#~ Comparing Strings
#! DO NOT RUN
# Reference snippet: `restaurants` is not defined anywhere in this notebook.
# Import process from thefuzz
from thefuzz import process
# Store the unique values of cuisine_type in unique_types
unique_types = restaurants['cuisine_type'].unique()
# Score each target cuisine against every value of unique_types and print
# the full list of matches (limit covers all candidates).
for target in ('asian', 'american', 'italian'):
    print(process.extract(target, unique_types, limit=len(unique_types)))

#~ Remapping categories
# `categories` already names the Gender lookup DataFrame earlier in this
# notebook; iterating it would yield the column name 'Gender'. Use a dedicated
# list holding the cuisines compared in the previous step.
cuisine_categories = ['asian', 'american', 'italian']
# Iterate through the target cuisines
for cuisine in cuisine_categories:
    # Create a list of matches, comparing cuisine with the cuisine_type column
    matches = process.extract(cuisine, restaurants['cuisine_type'],
                              limit=len(restaurants['cuisine_type']))
    # Iterate through the list of matches
    for match in matches:
        # Check whether the similarity score is greater than or equal to 80
        if match[1] >= 80:
            # Rewrite only the cuisine_type cell of the rows spelled this way.
            # Without the column selector, .loc would overwrite every column
            # of those rows with the cuisine string.
            restaurants.loc[restaurants['cuisine_type'] == match[0], 'cuisine_type'] = cuisine
# Inspect the final result
print(restaurants['cuisine_type'].unique())

#~ Record Linkage - Joining two records where their common columns are formatted differently
# NOTE(review): `recordlinkage` is not imported anywhere in this notebook;
# add `import recordlinkage` before running this section.
# Create an indexer object and find possible pairs
indexer = recordlinkage.Index()
# Block pairing on cuisine_type
indexer.block('cuisine_type')
# Generate pairs. The frames are named restaurants / restaurants_new in the
# rest of this cell; the singular names used before were undefined.
pairs = indexer.index(restaurants, restaurants_new)

# Build the comparison object that scores each candidate pair
comp_cl = recordlinkage.Compare()
# Exact-match comparisons on city and cuisine_type, each labelled by its column
for exact_column in ('city', 'cuisine_type'):
    comp_cl.exact(exact_column, exact_column, label=exact_column)
# String-similarity comparison on rest_name with a 0.8 threshold
comp_cl.string('rest_name', 'rest_name', label='name', threshold=0.8)
# Compute the comparison results for the candidate pairs and print them
potential_matches = comp_cl.compute(pairs, restaurants, restaurants_new)
print(potential_matches)

#~ Linking DataFrames
# Isolate potential matches with row sum >=3 (all three comparisons agree)
matches = potential_matches[potential_matches.sum(axis=1) >= 3]
# Get values of second column index of matches
matching_indices = matches.index.get_level_values(1)
# Subset restaurants_new based on non-duplicate values
non_dup = restaurants_new[~restaurants_new.index.isin(matching_indices)]
# Stack non_dup under restaurants. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent.
full_restaurants = pd.concat([restaurants, non_dup])
print(full_restaurants)