In [1]:
import sqlite3
import pandas as pd

# Create an SQLite database in memory (the special ':memory:' name means
# nothing is written to disk; the data lives only as long as `conn` does).
conn = sqlite3.connect(':memory:')
# Cursor on that connection; the cells below use it to execute SQL statements.
cursor = conn.cursor()

In [2]:
# Schema of the raw employees table. hire_date is declared as text at this
# stage; column types are revisited in a later cell.
create_employees_sql = '''
CREATE TABLE employees (
    employee_id INT PRIMARY KEY,
    employee_name VARCHAR(50),
    salary DECIMAL(10, 2),
    hire_date VARCHAR(20),
    department VARCHAR(50)
);
'''
cursor.execute(create_employees_sql)

<sqlite3.Cursor at 0x1c0717466c0>

In [3]:
# Sample records, one tuple per employee in the order
# (employee_id, employee_name, salary, hire_date, department).
# Row 3 has a lower-case name and row 13 has no department; both are
# dealt with by the cleaning cells further down.
employee_rows = [
    (1, 'Amy West', 60000.00, '2021-01-15', 'HR'),
    (2, 'Ivy Lee', 75000.50, '2020-05-22', 'Sales'),
    (3, 'joe smith', 80000.75, '2019-08-10', 'Marketing'),
    (4, 'John White', 90000.00, '2020-11-05', 'Finance'),
    (5, 'Jane Hill', 55000.25, '2022-02-28', 'IT'),
    (6, 'Dave West', 72000.00, '2020-03-12', 'Marketing'),
    (7, 'Fanny Lee', 85000.50, '2018-06-25', 'Sales'),
    (8, 'Amy Smith', 95000.25, '2019-11-30', 'Finance'),
    (9, 'Ivy Hill', 62000.75, '2021-07-18', 'IT'),
    (10, 'Joe White', 78000.00, '2022-04-05', 'Marketing'),
    (11, 'John Lee', 68000.50, '2018-12-10', 'HR'),
    (12, 'Jane West', 89000.25, '2017-09-15', 'Sales'),
    (13, 'Dave Smith', 60000.75, '2022-01-08', None),
    (14, 'Fanny White', 72000.00, '2019-04-22', 'IT'),
    (15, 'Amy Hill', 84000.50, '2020-08-17', 'Marketing'),
    (16, 'Ivy West', 92000.25, '2021-02-03', 'Finance'),
    (17, 'Joe Lee', 58000.75, '2018-05-28', 'IT'),
    (18, 'John Smith', 77000.00, '2019-10-10', 'HR'),
    (19, 'Jane Hill', 81000.50, '2022-03-15', 'Sales'),
    (20, 'Dave White', 70000.25, '2017-12-20', 'Marketing'),
]

# Parameterised INSERT: each tuple fills the five placeholders in order.
insert_employee_sql = '''
INSERT INTO employees (employee_id, employee_name, salary, hire_date, department) VALUES (?, ?, ?, ?, ?)
'''
cursor.executemany(insert_employee_sql, employee_rows)

<sqlite3.Cursor at 0x1c0717466c0>

# Identify and handle any missing values.

In [4]:
# Rows in which at least one of the five columns holds NULL
null_rows_sql = '''
SELECT * FROM employees 
WHERE employee_id IS NULL OR 
      employee_name IS NULL OR 
      salary IS NULL OR 
      hire_date IS NULL OR 
      department IS NULL
'''

# execute() hands back the cursor, so the matching rows can be fetched directly
cursor.execute(null_rows_sql).fetchall()


[(13, 'Dave Smith', 60000.75, '2022-01-08', None)]

In [5]:
# For 'department', find the most frequent non-NULL value.
# Ties are broken alphabetically: ordering by freq alone leaves the winner
# among equally common departments unspecified, which would make the imputed
# value differ between runs.
cursor.execute('''
SELECT department, COUNT(department) AS freq 
FROM employees 
WHERE department IS NOT NULL 
GROUP BY department 
ORDER BY freq DESC, department ASC 
LIMIT 1
''')
mode_row = cursor.fetchone()  # None when no row has a non-NULL department
most_common_department = mode_row[0] if mode_row is not None else None

# Fill missing values in the 'department' column with the most common value.
# Skipped when there is no non-NULL department to impute with.
if most_common_department is not None:
    cursor.execute('''
    UPDATE employees
    SET department = ?
    WHERE department IS NULL
    ''', (most_common_department,))

# Display the updated dataset
updated_df = pd.read_sql_query('SELECT * FROM employees', conn)

updated_df

Unnamed: 0,employee_id,employee_name,salary,hire_date,department
0,1,Amy West,60000.0,2021-01-15,HR
1,2,Ivy Lee,75000.5,2020-05-22,Sales
2,3,joe smith,80000.75,2019-08-10,Marketing
3,4,John White,90000.0,2020-11-05,Finance
4,5,Jane Hill,55000.25,2022-02-28,IT
5,6,Dave West,72000.0,2020-03-12,Marketing
6,7,Fanny Lee,85000.5,2018-06-25,Sales
7,8,Amy Smith,95000.25,2019-11-30,Finance
8,9,Ivy Hill,62000.75,2021-07-18,IT
9,10,Joe White,78000.0,2022-04-05,Marketing


# Check for and eliminate any duplicate rows in the dataset.

In [6]:
# Check for and eliminate any duplicate rows in the dataset.
# employee_id is the primary key (unique per row), so including it in the
# GROUP BY would put every row in its own group and never report a duplicate.
# Duplicates are therefore defined on the remaining columns.
duplicate_groups = cursor.execute('''
    SELECT employee_name, COUNT(*) AS num_duplicates
    FROM employees
    GROUP BY employee_name, salary, hire_date, department
    HAVING num_duplicates > 1''').fetchall()

# Keep the first-inserted row (lowest rowid) of each group and delete the rest.
# rowid is used rather than employee_id because it is never NULL.
cursor.execute('''
    DELETE FROM employees
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM employees
        GROUP BY employee_name, salary, hire_date, department
    )''')

# Duplicate groups that were found (an empty list means there were none)
duplicate_groups

[]

# Correct any structural issues, such as inconsistent naming conventions or formatting errors.

In [7]:
# Capitalize the first letter of both the first name and surname.
#
# SUBSTR(text, start, length) takes a *length*, not an end position. The rest
# of the first name starts at character 2 and stops just before the space, so
# it is INSTR(...) - 2 characters long. A length of INSTR(...) - 1 would also
# pick up the space and, with the explicit ' ' below, produce a double space.
#
# The WHERE clause restricts this statement to names with a space after at
# least one character. With INSTR(...) = 0 (no space) the offsets are
# meaningless and the name would be mangled.
# NOTE: only the first two words are capitalized; any further words are
# lower-cased as part of the surname remainder.
cursor.execute('''
UPDATE employees
SET employee_name = 
    UPPER(SUBSTR(employee_name, 1, 1)) || 
    LOWER(SUBSTR(employee_name, 2, INSTR(employee_name, ' ') - 2)) || 
    ' ' || 
    UPPER(SUBSTR(employee_name, INSTR(employee_name, ' ') + 1, 1)) || 
    LOWER(SUBSTR(employee_name, INSTR(employee_name, ' ') + 2))
WHERE INSTR(employee_name, ' ') > 1
''')

# Single-word names: capitalize the first letter and lower-case the rest
cursor.execute('''
UPDATE employees
SET employee_name =
    UPPER(SUBSTR(employee_name, 1, 1)) || LOWER(SUBSTR(employee_name, 2))
WHERE INSTR(employee_name, ' ') = 0
''')

# Display the updated dataset to ensure names are corrected
pd.read_sql_query('SELECT * FROM employees', conn)



Unnamed: 0,employee_id,employee_name,salary,hire_date,department
0,1,Amy West,60000.0,2021-01-15,HR
1,2,Ivy Lee,75000.5,2020-05-22,Sales
2,3,Joe Smith,80000.75,2019-08-10,Marketing
3,4,John White,90000.0,2020-11-05,Finance
4,5,Jane Hill,55000.25,2022-02-28,IT
5,6,Dave West,72000.0,2020-03-12,Marketing
6,7,Fanny Lee,85000.5,2018-06-25,Sales
7,8,Amy Smith,95000.25,2019-11-30,Finance
8,9,Ivy Hill,62000.75,2021-07-18,IT
9,10,Joe White,78000.0,2022-04-05,Marketing


# Ensure all columns have appropriate data types (e.g. the hire_date column).

In [8]:
# Rebuild `employees` with the desired column declarations by copying the
# data into a new table and swapping it in under the original name.

# Step 0: refuse to migrate if any hire_date is text that SQLite cannot parse.
# DATE() returns NULL for such values, so the copy in step 2 would silently
# erase them.
unparseable_dates = cursor.execute('''
SELECT employee_id, hire_date
FROM employees
WHERE hire_date IS NOT NULL AND DATE(hire_date) IS NULL
''').fetchall()
if unparseable_dates:
    raise ValueError(
        f'hire_date values that DATE() cannot parse: {unparseable_dates}'
    )

# Remove a half-built table left behind by an earlier, interrupted run of this
# cell, so that the CREATE TABLE below cannot fail with "already exists".
cursor.execute('DROP TABLE IF EXISTS new_employees;')

# Step 1: Create a new table with the desired schema
# SQLite has no dedicated date storage class: a column declared DATE gets
# NUMERIC affinity and the ISO-8601 'YYYY-MM-DD' strings are kept as text.
cursor.execute('''
CREATE TABLE new_employees (
    employee_id INT PRIMARY KEY,
    employee_name VARCHAR(50),
    salary REAL,  -- REAL affinity: salaries are stored as floating-point
    hire_date DATE,  -- declared DATE; values are ISO-8601 'YYYY-MM-DD' text
    department VARCHAR(50)
);
''')

# Step 2: Copy data from the old table to the new table with necessary transformations
cursor.execute('''
INSERT INTO new_employees (employee_id, employee_name, salary, hire_date, department)
SELECT 
    employee_id,
    employee_name,
    CAST(salary AS REAL),  -- Convert salary to REAL (float)
    DATE(hire_date),       -- Normalise hire_date to 'YYYY-MM-DD'
    department
FROM employees;
''')

# Step 3: Drop the old table
cursor.execute('DROP TABLE employees;')

# Step 4: Rename the new table to the original table name
cursor.execute('ALTER TABLE new_employees RENAME TO employees;')

<sqlite3.Cursor at 0x1c0717466c0>

# Detect and address any outliers that may skew the analysis.

In [9]:
# Flag salary outliers with the IQR method: anything further than 1.5 * IQR
# below the first quartile or above the third quartile counts as an outlier.
df = pd.read_sql_query('SELECT * FROM employees', conn)
salary_col = df['salary']

# Quartiles and the spread between them
q1, q3 = salary_col.quantile(0.25), salary_col.quantile(0.75)
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Rows strictly outside the bounds
is_below = salary_col.lt(lower_bound)
is_above = salary_col.gt(upper_bound)
outliers = df.loc[is_below | is_above]

# Rows on or inside the bounds
is_within = salary_col.ge(lower_bound) & salary_col.le(upper_bound)
data_without_outliers = df.loc[is_within]

data_without_outliers

Unnamed: 0,employee_id,employee_name,salary,hire_date,department
0,1,Amy West,60000.0,2021-01-15,HR
1,2,Ivy Lee,75000.5,2020-05-22,Sales
2,3,Joe Smith,80000.75,2019-08-10,Marketing
3,4,John White,90000.0,2020-11-05,Finance
4,5,Jane Hill,55000.25,2022-02-28,IT
5,6,Dave West,72000.0,2020-03-12,Marketing
6,7,Fanny Lee,85000.5,2018-06-25,Sales
7,8,Amy Smith,95000.25,2019-11-30,Finance
8,9,Ivy Hill,62000.75,2021-07-18,IT
9,10,Joe White,78000.0,2022-04-05,Marketing
