In [36]:
# Step 1: Install missing package
# The leading `!` is IPython/Colab shell-escape syntax, so this line only runs
# inside a notebook cell, not under a plain Python interpreter.
# NOTE(review): pandasql is not imported or used anywhere in this cell —
# confirm another cell needs it before keeping the install.
!pip install pandasql

# Step 2: Mount Google Drive
# The CSV read and the Excel export in this cell both use paths under
# /content/drive, so the mount has to happen before either of them.
from google.colab import drive
drive.mount('/content/drive')

# Step 3: Load the CSV file into a Pandas DataFrame
import pandas as pd
import sqlite3

# Raw layoffs CSV on the mounted Google Drive.
file_path = (
    '/content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/'
    'Layoffs_Dataset_for_Data_Cleaning.csv'
)
df = pd.read_csv(filepath_or_buffer=file_path)

# Notebook display limits: print at most 500 rows, never hide columns.
pd.options.display.max_rows = 500
pd.options.display.max_columns = None

# Step 4: Open a throwaway in-memory SQLite database and copy the DataFrame
# into a table named "layoffs" (without the DataFrame index), replacing the
# table if it already exists.
conn = sqlite3.connect(":memory:")
df.to_sql(name="layoffs", con=conn, if_exists="replace", index=False)

# Steps 5-10: build the staging tables and remove exact duplicate rows.
# The SQL is declared first and then executed strictly in the order listed
# at the bottom, because each statement relies on the tables produced by the
# statements before it.

# Step 5: empty copy of the layoffs table (WHERE 1=0 selects no rows).
query_create_table = "CREATE TABLE layoffs_staging AS SELECT * FROM layoffs WHERE 1=0;"

# Step 6: copy every raw row into the staging table.
query_insert = "INSERT INTO layoffs_staging SELECT * FROM layoffs;"

# Step 8: second staging table with an explicit schema plus a row_num column.
query_create_table2 = """
CREATE TABLE layoffs_staging2 (
    company TEXT,
    location TEXT,
    industry TEXT,
    total_laid_off INT,
    percentage_laid_off TEXT,
    date TEXT,
    stage TEXT,
    country TEXT,
    funds_raised INT,
    row_num INT
);
"""

# Step 9: number the rows inside each group of rows that agree on all nine
# data columns, so the first copy gets row_num 1 and repeats get 2, 3, ...
query_insert_into_staging2 = """
INSERT INTO layoffs_staging2
(company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised, row_num)
SELECT company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised,
       ROW_NUMBER() OVER (
           PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised
       ) AS row_num
FROM layoffs_staging;
"""

for staging_statement in (
    query_create_table,
    query_insert,
    # Step 7: add a row_num column to the first staging table.
    "ALTER TABLE layoffs_staging ADD COLUMN row_num INT;",
    query_create_table2,
    query_insert_into_staging2,
    # Step 10: keep only the first copy of each duplicated row.
    "DELETE FROM layoffs_staging2 WHERE row_num >= 2;",
):
    conn.execute(staging_statement)

# Step 11: Standardize industry names.
# Sequence: blank -> NULL, fill NULLs from other rows of the same company via
# a helper table, drop the helper table, then merge the crypto spellings.
# The statements are declared first and run in exactly that order below.

# Helper table: one non-NULL industry per company (MAX picks a single value
# when a company has several).
query_create_temp_table = """
CREATE TABLE temp_industry_update AS
SELECT company, MAX(industry) AS industry
FROM layoffs_staging2
WHERE industry IS NOT NULL
GROUP BY company;
"""

# Fill each NULL industry from the helper table; companies with no entry in
# the helper table stay NULL because the subquery returns no row.
query_update_industry = """
UPDATE layoffs_staging2
SET industry = (SELECT temp_industry_update.industry
                FROM temp_industry_update
                WHERE layoffs_staging2.company = temp_industry_update.company)
WHERE industry IS NULL;
"""

# Collapse the alternative crypto spellings into a single label.
query_unify_crypto = """
UPDATE layoffs_staging2
SET industry = 'Crypto'
WHERE industry IN ('Crypto Currency', 'CryptoCurrency');
"""

for industry_statement in (
    # Treat empty strings as missing so the fill step below can see them.
    "UPDATE layoffs_staging2 SET industry = NULL WHERE industry = '';",
    query_create_temp_table,
    query_update_industry,
    # The helper table is only needed for the fill step.
    "DROP TABLE temp_industry_update;",
    query_unify_crypto,
):
    conn.execute(industry_statement)

# Step 12: Standardize country names (fix trailing periods)
# RTRIM removes only periods at the end of the value ("United States." ->
# "United States"); REPLACE would also delete periods inside a name.
conn.execute("UPDATE layoffs_staging2 SET country = RTRIM(country, '.');")

# Step 13: Convert date column to proper format
# SQLite's strftime() only parses ISO-8601 style time strings and returns NULL
# for anything else, so on its own it would blank out slash-formatted dates
# such as '3/6/2023'. Values strftime() understands are converted exactly as
# before; everything else falls back to the Python parser registered below.
from datetime import datetime

# Non-ISO layouts accepted by the fallback parser.
# NOTE(review): slash dates are assumed to be month/day/year — confirm against
# the source CSV before adding or changing formats.
_FALLBACK_DATE_FORMATS = ("%m/%d/%Y",)


def _to_iso_date(raw_value):
    """Return raw_value as 'YYYY-MM-DD', or None if no fallback format matches."""
    if not isinstance(raw_value, str):
        return None
    text = raw_value.strip()
    for date_format in _FALLBACK_DATE_FORMATS:
        try:
            return datetime.strptime(text, date_format).date().isoformat()
        except ValueError:
            continue
    return None


# Expose the parser to SQL as a one-argument function for this connection.
conn.create_function("to_iso_date", 1, _to_iso_date)
conn.execute("""
UPDATE layoffs_staging2
SET date = COALESCE(strftime('%Y-%m-%d', date), to_iso_date(date));
""")

# Step 14: Remove records where both total_laid_off and percentage_laid_off are NULL
conn.execute("DELETE FROM layoffs_staging2 WHERE total_laid_off IS NULL AND percentage_laid_off IS NULL;")

# Step 15: Drop row_num column
# The column is removed by copying every other column into a new table.
query_drop_row_num = """
CREATE TABLE layoffs_staging2_no_row_num AS
SELECT company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised
FROM layoffs_staging2;
"""
conn.execute(query_drop_row_num)

# Step 16: Export cleaned data to an Excel file
# Destination workbook on the mounted Google Drive.
export_path = (
    '/content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/'
    'Layoffs_Dataset_After_Cleaning.xlsx'
)

# Read the cleaned table back into pandas and write it out without the index.
query_export_cleaned_data = "SELECT * FROM layoffs_staging2_no_row_num;"
df_cleaned = pd.read_sql(sql=query_export_cleaned_data, con=conn)
df_cleaned.to_excel(excel_writer=export_path, index=False)

print(f"\nCleaned dataset successfully exported to: {export_path}")

# The in-memory database is discarded once the connection is closed.
conn.close()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Cleaned dataset successfully exported to: /content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/Layoffs_Dataset_After_Cleaning.xlsx
