# Cleaning The Project Data Files

### Cleaning The phone_use_data.csv File

In [3]:
import pandas as pd

# Load the raw phone-usage export into a DataFrame
df = pd.read_csv('phone_use_data.csv')

# Discard the rows labelled 195, 196 and 197.
# NOTE(review): the reason these three rows are removed is not recorded
# here; confirm against the raw file before changing the labels.
df = df.drop(index=list(range(195, 198)))


# Function to check if a time duration is valid (hours < 24, minutes < 60, seconds < 60)
def is_valid_time(time_str):
    """Return whether *time_str* is a plausible ``H:M:S`` duration.

    Args:
        time_str: A string of three colon-separated integers
            (hours, minutes, seconds), or a missing value (NaN/None).

    Returns:
        True when the value is missing, or when it parses to
        0 <= hours < 24, 0 <= minutes < 60 and 0 <= seconds < 60.
        False for out-of-range, negative, or unparseable values.
    """
    # Missing values are deliberately treated as valid so that they are
    # not dropped by the caller's filter.
    if pd.isnull(time_str):
        return True
    try:
        hours, minutes, seconds = map(int, time_str.split(':'))
    except (AttributeError, ValueError):
        # Wrong number of fields, non-numeric parts, or a non-string value:
        # report the entry as invalid instead of aborting the whole cleaning run.
        return False
    return 0 <= hours < 24 and 0 <= minutes < 60 and 0 <= seconds < 60

# Function to check if the check phone count is within a reasonable range
def is_valid_check_count(check_count):
    """Return whether *check_count* lies in the accepted range 25..105.

    Missing values (NaN/None) are reported as valid; any other value is
    converted with ``int()`` before the inclusive range comparison.
    """
    if pd.isnull(check_count):
        return True  # Treat NaN as valid
    count = int(check_count)
    return count >= 25 and count <= 105

# Flag every row whose 'Screen on time' fails the is_valid_time check
# NOTE(review): is_valid_check_count is defined in this cell but is not
# applied to any column here; confirm whether that is intentional.
time_is_valid = df['Screen on time'].apply(is_valid_time)
invalid_time_rows = df.loc[~time_is_valid]

# Remove the flagged rows by their index labels
df = df.drop(index=invalid_time_rows.index)

# Persist the cleaned data, leaving the index out of the output file
df.to_csv('phone_use_data_cleaned.csv', index=False)


### Cleaning The app_usage.csv File

In [4]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('app_usage.csv')

# Replace certain app names with standardized names:
#  - a localized (non-English) label is mapped to 'Map Application'
#  - a name containing a non-breaking space (\xa0) is mapped to the same
#    name written with a regular space
df['App name'] = df['App name'].replace({
    'வரைபடம்': 'Map Application',
    'WhatsApp\xa0Business': 'WhatsApp Business',
})

# Filter out rows where 'Access count' is less than or equal to 3.
# Take an explicit copy so that columns assigned to the filtered frame later
# are written to an independent DataFrame rather than to a slice of the
# original (which triggers pandas' SettingWithCopyWarning).
df = df[df['Access count'] > 3].copy()

# Function to convert time duration string to total seconds
def time_to_seconds(time_str):
    """Convert an ``H:M:S`` duration string into a number of seconds.

    Each colon-separated field is parsed as a float, so fractional values
    are accepted. Exactly three fields are required.
    """
    fields = [float(part) for part in time_str.split(':')]
    hours, minutes, seconds = fields
    return hours * 3600 + minutes * 60 + seconds

# Helper: format a non-negative number of seconds as zero-padded HH:MM:SS
def _seconds_to_hms(total_seconds):
    """Return *total_seconds* (truncated to whole seconds) as ``HH:MM:SS``."""
    hours, remainder = divmod(int(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    # Pad every field to two digits so the output really is HH:MM:SS
    # (previously only the seconds were padded, giving e.g. '0:1:05').
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


# Calculate average screen time per access, in seconds.
# Kept as a standalone Series instead of a temporary column, so nothing has
# to be added to the DataFrame and dropped again afterwards.
average_seconds = df['Usage time'].apply(time_to_seconds) / df['Access count']

# Define a threshold for low average screen time, for example, 30 seconds
threshold = 30

# Filter out apps with low average screen time.
# .copy() makes the filtered frame independent, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
keep = average_seconds > threshold
df = df[keep].copy()

# Store the average screen time as a string (HH:MM:SS); the Series shares
# the filtered frame's index, so the assignment aligns row by row.
df['Average screen time'] = average_seconds[keep].apply(_seconds_to_hms)

# Write the cleaned DataFrame to a new CSV file
df.to_csv('app_usage_cleaned.csv', index=False)
