In [34]:
# Step 1: Install missing package
!pip install pandasql
# Explanation:
# This command installs the `pandasql` library, which is useful for running SQL queries on pandas DataFrames.
# Although we are not using it in this script, it's useful when you want to run SQL queries directly on a pandas DataFrame.

# Step 2: Mount Google Drive
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
# Explanation:
# Mounting makes the contents of your Google Drive visible to the Colab runtime under the mount point above.
# The layoffs CSV is read from a path below that mount point in Step 3, so this step has to run first.

# Step 3: Load the CSV file into a Pandas DataFrame
import pandas as pd

file_path = '/content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/Layoffs_Dataset_for_Data_Cleaning.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
# Explanation:
# `file_path` points at the layoffs CSV inside the mounted Google Drive.
# `pd.read_csv` parses that file into `df`, an in-memory table-like DataFrame.
# `df` is the source for the SQLite table created in Step 4; every later step works on SQL tables derived from it.

# Ensure Pandas displays a maximum of 500 rows
pd.options.display.max_rows = 500
pd.options.display.max_columns = None
# Explanation:
# These two display options control how much of a DataFrame is shown when it is printed.
# `display.max_rows = 500`: frames with up to 500 rows are printed in full; larger ones are truncated.
# `display.max_columns = None`: no limit on columns, so every column is shown.

# Step 4: Using SQLite for SQL Queries
import sqlite3

# Create an SQLite in-memory database and load the DataFrame into a table
conn = sqlite3.connect(":memory:")
df.to_sql(name="layoffs", con=conn, if_exists="replace", index=False)
# Explanation:
# Connecting to ":memory:" gives a temporary SQLite database that exists only while `conn` is open.
# `df.to_sql(...)` writes the DataFrame into that database as a table called "layoffs":
#   - `if_exists="replace"` drops and recreates the table if one with that name is already there.
#   - `index=False` keeps the DataFrame index out of the table, so only the CSV columns are stored.

# Step 5: Create layoffs_staging table with the same structure as layoffs
query_create_table = """
CREATE TABLE layoffs_staging AS
SELECT * FROM layoffs WHERE 1=0;
"""
# Explanation:
# `CREATE TABLE ... AS SELECT` copies the column layout of `layoffs` into a new table `layoffs_staging`.
# `WHERE 1=0` is never true, so no rows are copied: only the schema is created.
# The staging table is where transformations are applied, leaving `layoffs` untouched.

# Step 6: Insert data from layoffs into layoffs_staging
query_insert = """
INSERT INTO layoffs_staging
SELECT * FROM layoffs;
"""
# Explanation:
# Copies every row of `layoffs` into the (still empty) `layoffs_staging` table.

# Step 7: Add row_num column to layoffs_staging
query_alter_table = "ALTER TABLE layoffs_staging ADD COLUMN row_num INT;"
# Explanation:
# Adds an integer column `row_num` to `layoffs_staging`; existing rows get NULL in it.
# SQLite's `ADD COLUMN` accepts only a restricted column definition (for example, the new column
# cannot be PRIMARY KEY or UNIQUE), but a plain nullable column like this one is always allowed.
# Note that Step 9 reads only the original columns from this table and computes its own `row_num`.

# Run Steps 5-7 in order: create the empty table, fill it, then add the extra column.
for staging_statement in (query_create_table, query_insert, query_alter_table):
    conn.execute(staging_statement)

# Step 8: Create layoffs_staging2 with row_num
query_create_table2 = """
CREATE TABLE layoffs_staging2 (
    company TEXT,
    location TEXT,
    industry TEXT,
    total_laid_off INT,
    percentage_laid_off TEXT,
    date TEXT,
    stage TEXT,
    country TEXT,
    funds_raised INT,
    row_num INT
);
"""
# Explanation:
# Declares `layoffs_staging2` explicitly: the nine data columns plus `row_num`.
# This is the table the remaining steps query and modify.

# Step 9: Insert data into layoffs_staging2 with ROW_NUMBER()
query_insert_into_staging2 = """
INSERT INTO layoffs_staging2
(company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised, row_num)
SELECT company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised,
       ROW_NUMBER() OVER (
           PARTITION BY company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised
       ) AS row_num
FROM layoffs_staging;
"""
# Explanation:
# Copies the nine data columns from `layoffs_staging` and fills `row_num` with ROW_NUMBER().
# The partition is made of all nine data columns, so rows that are identical in every column share
# a partition and are numbered 1, 2, 3, ...; a row with no exact duplicate always gets 1.
# In other words, `row_num > 1` marks the extra copies of a duplicated record.
# NOTE(review): within this cell nothing ever filters on `row_num` (e.g. no `DELETE ... WHERE row_num > 1`),
# and Step 14 discards the column, so duplicates are numbered but not removed. Confirm that this is intended.

# Run Steps 8-9 in order: create the table, then populate it.
for staging2_statement in (query_create_table2, query_insert_into_staging2):
    conn.execute(staging2_statement)

# Step 10: Select records where total_laid_off is NULL
query_select_null_laid_off = """
SELECT *
FROM layoffs_staging2
WHERE total_laid_off IS NULL;
"""
df_null_laid_off = pd.read_sql(sql=query_select_null_laid_off, con=conn)
print("\nRecords where total_laid_off is NULL:", df_null_laid_off, sep="\n")
# Explanation:
# Lists every row of `layoffs_staging2` that has no value for `total_laid_off`.
# These are candidates for correction or removal.

# Step 11: Select records where both total_laid_off and percentage_laid_off are NULL
query_select_null_laid_off_and_percentage = """
SELECT *
FROM layoffs_staging2
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;
"""
df_null_laid_off_percentage = pd.read_sql(sql=query_select_null_laid_off_and_percentage, con=conn)
print("\nRecords where both total_laid_off and percentage_laid_off are NULL:", df_null_laid_off_percentage, sep="\n")
# Explanation:
# Narrows the previous result to rows where `percentage_laid_off` is missing as well.
# These rows carry no layoff figures at all; they are exactly the rows deleted in Step 12.

# Step 12: Delete records where both total_laid_off and percentage_laid_off are NULL
query_delete_useless_data = """
DELETE FROM layoffs_staging2
WHERE total_laid_off IS NULL
AND percentage_laid_off IS NULL;
"""
# Step 13: Verify the data after deletion
query_select_all_after_deletion = """
SELECT *
FROM layoffs_staging2;
"""

conn.execute(query_delete_useless_data)
print("\nDeleted records where both total_laid_off and percentage_laid_off are NULL.")
# Explanation:
# Removes from `layoffs_staging2` every row in which both `total_laid_off` and `percentage_laid_off` are NULL.
# The condition is the same one used by the Step 11 query, so the rows listed there are the ones removed.

df_after_deletion = pd.read_sql(sql=query_select_all_after_deletion, con=conn)
print("\nData after deletion of useless records:", df_after_deletion, sep="\n")
# Explanation:
# Reads the whole table back over the same connection so the effect of the delete can be inspected.

# Step 14: Drop the row_num column
query_drop_row_num = """
CREATE TABLE layoffs_staging2_no_row_num AS
SELECT company, location, industry, total_laid_off, percentage_laid_off, date, stage, country, funds_raised
FROM layoffs_staging2;
"""
# Step 15: Verify the data in layoffs_staging2_no_row_num (without row_num)
query_select_no_row_num = """
SELECT *
FROM layoffs_staging2_no_row_num;
"""

conn.execute(query_drop_row_num)
print("\nDropped the row_num column.")
# Explanation:
# `layoffs_staging2` itself is not altered: a new table `layoffs_staging2_no_row_num` is created from it,
# selecting every column except `row_num`.
# SQLite versions before 3.35.0 have no `ALTER TABLE ... DROP COLUMN`, so copying into a new table
# is the approach that works on any SQLite version.

df_no_row_num = pd.read_sql(sql=query_select_no_row_num, con=conn)
print("\nData in layoffs_staging2 without row_num column:", df_no_row_num, sep="\n")
# Explanation:
# Reads the new table back so you can confirm that it holds the nine data columns and no `row_num`.

# Close the SQLite connection
conn.close()
# Explanation:
# Closing the connection releases its resources.
# Because the database was opened as ":memory:", every table created in this cell disappears with it;
# only the pandas DataFrames read out above remain available afterwards.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Records where total_laid_off is NULL:
         company                        location        industry  \
0    100 Thieves                 ['Los Angeles']          Retail   
1      123Milhas  ['Belo Horizonte', 'Non-U.S.']          Travel   
2             2U             ['Washington D.C.']       Education   
3             2U             ['Washington D.C.']       Education   
4             2U             ['Washington D.C.']       Education   
..           ...                             ...             ...   
811       dot.LA                 ['Los Angeles']           Media   
812         eBay        ['Tel Aviv', 'Non-U.S.']          Retail   
813  iQiyi Smart         ['Beijing', 'Non-U.S.']           Other   
814    iSpecimen                      ['Boston']      Healthcare   
815      inDrive                 ['SF Bay Area']  Transportation   

    total_laid_