In [37]:
# Step 1: Install the required package
# pandasql is useful for running SQL queries on Pandas DataFrames, ensuring smooth data manipulation.
!pip install pandasql

# Step 2: Attach Google Drive to the Colab filesystem
# Once mounted under /content/drive, the input CSV and the exported workbook
# can be addressed with ordinary file paths.
from google.colab import drive
drive.mount("/content/drive")

# Step 3: Load the CSV file into a Pandas DataFrame
import pandas as pd
import sqlite3  # SQLite will be used for executing SQL queries

# Display settings: show up to 500 rows and never truncate the column list
# when a DataFrame is rendered.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", None)

# Location of the raw layoffs CSV on the mounted Drive
file_path = "/content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/Layoffs_Dataset_for_Data_Cleaning.csv"

# Parse the raw file into a DataFrame; `df` is what gets loaded into SQLite next
df = pd.read_csv(filepath_or_buffer=file_path)

# Step 4: Open a SQLite database that lives only in memory
# Every later step talks to the data through this one connection using plain SQL.
conn = sqlite3.connect(":memory:")

# Copy the raw DataFrame into a table called 'layoffs'
# (the DataFrame index is not written; an existing table of that name is replaced).
df.to_sql(name="layoffs", con=conn, if_exists="replace", index=False)

# Steps 5-7: Build the working copy `layoffs_staging`
# Step 5 creates an empty table with the same columns as `layoffs`
# (the WHERE 1=0 filter selects no rows).
query_create_table = "CREATE TABLE layoffs_staging AS SELECT * FROM layoffs WHERE 1=0;"

# Step 6 copies every raw row into that empty table.
query_insert = "INSERT INTO layoffs_staging SELECT * FROM layoffs;"

# Step 7 appends a `row_num` column intended for duplicate detection.
# The statements are executed in step order.
for _sql in (
    query_create_table,
    query_insert,
    "ALTER TABLE layoffs_staging ADD COLUMN row_num INT;",
):
    conn.execute(_sql)

# Step 8: Schema of layoffs_staging2
# Same nine data columns as the staging table, plus `row_num` to mark duplicates.
query_create_table2 = """
CREATE TABLE layoffs_staging2 (
    company TEXT,
    location TEXT,
    industry TEXT,
    total_laid_off INT,
    percentage_laid_off TEXT,
    date TEXT,
    stage TEXT,
    country TEXT,
    funds_raised INT,
    row_num INT
);
"""

# Step 9: Copy the staging rows across and number them
# ROW_NUMBER() restarts at 1 inside every group of rows that agree on all nine
# data columns, so any row numbered 2 or higher is a repeat of an earlier row.
# No ORDER BY is given, so which copy receives number 1 is left to SQLite.
query_insert_into_staging2 = """
INSERT INTO layoffs_staging2
(company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised, row_num)
SELECT company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised,
       ROW_NUMBER() OVER (
           PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised
       ) AS row_num
FROM layoffs_staging;
"""

# Step 10: Discard the repeats
# Create the table, fill it, then delete everything except the first row of each group.
for _sql in (
    query_create_table2,
    query_insert_into_staging2,
    "DELETE FROM layoffs_staging2 WHERE row_num >= 2;",
):
    conn.execute(_sql)

# Steps 11-14: Fill in missing industries from other rows of the same company
# The fill is written as a correlated subquery against a helper table instead of
# an UPDATE with a JOIN.

# Step 12: Helper table with one industry per company
# Only rows with a non-NULL industry are considered; MAX() picks a single value
# when a company appears with more than one.
query_create_temp_table = """
CREATE TABLE temp_industry_update AS
SELECT company, MAX(industry) AS industry
FROM layoffs_staging2
WHERE industry IS NOT NULL
GROUP BY company;
"""

# Step 13: Rows whose industry is NULL take the helper table's value for their company.
# A company with no entry in the helper table stays NULL.
query_update_industry = """
UPDATE layoffs_staging2
SET industry = (SELECT temp_industry_update.industry
                FROM temp_industry_update
                WHERE layoffs_staging2.company = temp_industry_update.company)
WHERE industry IS NULL;
"""

for _sql in (
    # Step 11: treat empty-string industries as missing so they are picked up below
    "UPDATE layoffs_staging2 SET industry = NULL WHERE industry = '';",
    query_create_temp_table,
    query_update_industry,
    # Step 14: the helper table has served its purpose
    "DROP TABLE temp_industry_update;",
):
    conn.execute(_sql)

# Step 15: Collapse spelling variants of one industry
# 'Crypto Currency' and 'CryptoCurrency' are both rewritten to the single label 'Crypto'.
conn.execute(
    """
UPDATE layoffs_staging2
SET industry = 'Crypto'
WHERE industry IN ('Crypto Currency', 'CryptoCurrency');
"""
)

# Step 16: Standardize country names
# The issue: Some country names have a trailing period (e.g., "United States." instead of "United States").
# The solution: RTRIM(country, '.') strips periods from the right-hand end only.
# REPLACE(country, '.', '') is deliberately not used: it would also delete periods
# inside a name (a value such as 'St. Lucia.' would become 'St Lucia').
conn.execute("UPDATE layoffs_staging2 SET country = RTRIM(country, '.');")

# Step 17: Convert date column to proper format
# The `date` column is stored as text; rewrite it as ISO text (YYYY-MM-DD).
# SQLite's strftime() only understands ISO-8601-style input and returns NULL for
# anything else, so a slash-separated value such as '3/6/2023' would be wiped out.
# Slash-separated values are therefore rebuilt as YYYY-MM-DD first; values without
# slashes are passed to strftime() unchanged, and NULL stays NULL.
# NOTE(review): slash dates are read as month/day/year -- confirm against the raw CSV.
conn.execute("""
UPDATE layoffs_staging2
SET date = strftime('%Y-%m-%d',
    CASE
        WHEN date LIKE '%/%/%' THEN printf(
            '%04d-%02d-%02d',
            -- year = text after the second slash
            CAST(substr(substr(date, instr(date, '/') + 1),
                        instr(substr(date, instr(date, '/') + 1), '/') + 1) AS INTEGER),
            -- month = text before the first slash
            CAST(substr(date, 1, instr(date, '/') - 1) AS INTEGER),
            -- day = text between the two slashes
            CAST(substr(substr(date, instr(date, '/') + 1), 1,
                        instr(substr(date, instr(date, '/') + 1), '/') - 1) AS INTEGER)
        )
        ELSE date
    END);
""")

# Step 18: Drop rows that carry no layoff figures at all
# A row is removed only when `total_laid_off` and `percentage_laid_off` are both NULL;
# a row with either value present is kept.
conn.execute(
    "DELETE FROM layoffs_staging2 WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;"
)

# Step 19: Produce the final table without the helper column
# `row_num` only existed to find duplicates. Rather than altering layoffs_staging2,
# the nine data columns are copied into a new table, layoffs_staging2_no_row_num.
query_drop_row_num = """
CREATE TABLE layoffs_staging2_no_row_num AS
SELECT company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised
FROM layoffs_staging2;
"""
conn.execute(query_drop_row_num)

# Step 20: Write the cleaned table out as an Excel workbook
# Destination of the exported file on the mounted Drive
export_path = "/content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/Layoffs_Dataset_After_Cleaning.xlsx"

# Pull the final table back into pandas
query_export_cleaned_data = "SELECT * FROM layoffs_staging2_no_row_num;"
df_cleaned = pd.read_sql(sql=query_export_cleaned_data, con=conn)

# Save it without the DataFrame index, then report where it went
df_cleaned.to_excel(excel_writer=export_path, index=False)
print(f"\nCleaned dataset successfully exported to: {export_path}")

# Step 21: Release the SQLite connection
# The database is in-memory, so closing the connection also discards its tables.
conn.close()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Cleaned dataset successfully exported to: /content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/Layoffs_Dataset_After_Cleaning.xlsx
