In [29]:
from google.colab import drive
import pandas as pd
import sqlite3
from pandasql import sqldf

# ---------------------------------------------------------------------------
# Step 1: Make Google Drive available to this Colab runtime.
# After mounting, Drive contents are reachable under /content/drive.
# ---------------------------------------------------------------------------
drive.mount('/content/drive')

# ---------------------------------------------------------------------------
# Step 2: Read the cleaned layoffs workbook into a DataFrame.
# The path points at an .xlsx file inside the mounted Drive; openpyxl is
# named explicitly as the Excel reader engine.
# ---------------------------------------------------------------------------
file_path = '/content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/Layoffs_Dataset_After_Cleaning.xlsx'
df = pd.read_excel(file_path, engine='openpyxl')

# ---------------------------------------------------------------------------
# Step 3: Query the DataFrame with SQL via pandasql.
# sqldf receives globals() so that the table name `df` used in the SQL text
# can be looked up among this notebook's variables.
# ---------------------------------------------------------------------------
query1 = "SELECT * FROM df;"  # Every row and column of the DataFrame.
query2 = "SELECT MAX(total_laid_off) FROM df;"  # Largest total_laid_off value.

# Run both statements in order and keep each result under its own name.
result1, result2 = (sqldf(statement, globals()) for statement in (query1, query2))

# Show each result under its heading (heading and table on separate lines).
print("All Data:", result1, sep="\n")
print("\nMaximum Layoffs:", result2, sep="\n")

# Step 4: Run the analysis queries in SQLite
# The DataFrame is copied once into an in-memory SQLite database and every
# query below is executed by SQLite through pd.read_sql.
conn = sqlite3.connect(":memory:")  # Create an in-memory SQLite database.

# Store the DataFrame as a SQL table named 'layoffs_staging2'.
df.to_sql("layoffs_staging2", conn, index=False, if_exists="replace")

# Define multiple SQL queries for data analysis.
query1_sqlite = "SELECT * FROM layoffs_staging2;"  # Retrieve all records.
query2_sqlite = "SELECT MAX(total_laid_off) FROM layoffs_staging2;"  # Find max layoffs.
query3_sqlite = "SELECT MAX(percentage_laid_off), MIN(percentage_laid_off) FROM layoffs_staging2 WHERE percentage_laid_off IS NOT NULL;"  # Find max & min percentage layoffs.
query4_sqlite = "SELECT * FROM layoffs_staging2 WHERE percentage_laid_off = 1;"  # Get companies with 100% layoffs.
query5_sqlite = "SELECT * FROM layoffs_staging2 WHERE percentage_laid_off = 1 ORDER BY funds_raised DESC;"  # 100% layoffs ordered by funds raised.
query6_sqlite = "SELECT company, total_laid_off FROM layoffs_staging2 ORDER BY total_laid_off DESC LIMIT 5;"  # Five largest individual layoff records.
query7_sqlite = "SELECT company, SUM(total_laid_off) FROM layoffs_staging2 GROUP BY company ORDER BY SUM(total_laid_off) DESC LIMIT 10;"  # Top 10 companies by total layoffs.
query8_sqlite = "SELECT location, SUM(total_laid_off) FROM layoffs_staging2 GROUP BY location ORDER BY SUM(total_laid_off) DESC LIMIT 10;"  # Top 10 locations by layoffs.
query9_sqlite = "SELECT country, SUM(total_laid_off) FROM layoffs_staging2 GROUP BY country ORDER BY SUM(total_laid_off) DESC;"  # Layoffs by country.
query10_sqlite = "SELECT strftime('%Y', date) AS year, SUM(total_laid_off) FROM layoffs_staging2 GROUP BY year ORDER BY year ASC;"  # Layoffs by year.
query11_sqlite = "SELECT industry, SUM(total_laid_off) FROM layoffs_staging2 GROUP BY industry ORDER BY SUM(total_laid_off) DESC;"  # Layoffs by industry.
query12_sqlite = "SELECT stage, SUM(total_laid_off) FROM layoffs_staging2 GROUP BY stage ORDER BY SUM(total_laid_off) DESC;"  # Layoffs by stage.

# Per-year company totals are ranked within each year; rows whose year could
# not be derived (NULL) are dropped from the final result.
query13_sqlite = """
WITH Company_Year AS (
  SELECT company, strftime('%Y', date) AS years, SUM(total_laid_off) AS total_laid_off
  FROM layoffs_staging2
  GROUP BY company, years
),
Company_Year_Rank AS (
  SELECT company, years, total_laid_off, DENSE_RANK() OVER (PARTITION BY years ORDER BY total_laid_off DESC) AS ranking
  FROM Company_Year
)
SELECT company, years, total_laid_off, ranking
FROM Company_Year_Rank
WHERE ranking <= 3 AND years IS NOT NULL
ORDER BY years ASC, total_laid_off DESC;
"""  # Top 3 layoffs per year.

# substr() is used instead of SUBSTRING(): the latter is only available as an
# alias in SQLite 3.34.0 and later, while substr() works on every version.
# The first 7 characters of the stored date text are taken as the month key
# (assumes the column is stored as 'YYYY-MM-DD...' text -- TODO confirm).
query14_sqlite = """
SELECT substr(date, 1, 7) AS dates, SUM(total_laid_off) AS total_laid_off
FROM layoffs_staging2
GROUP BY dates
ORDER BY dates ASC;
"""  # Monthly layoffs.

# Rows without a date are excluded here: a NULL month sorts first in
# ascending order, so its total would otherwise be carried into every
# subsequent cumulative value. The ORDER BY that used to sit inside the CTE
# was dropped because the outer query already orders the final result.
query15_sqlite = """
WITH DATE_CTE AS (
  SELECT substr(date, 1, 7) AS dates, SUM(total_laid_off) AS total_laid_off
  FROM layoffs_staging2
  WHERE date IS NOT NULL
  GROUP BY dates
)
SELECT dates, SUM(total_laid_off) OVER (ORDER BY dates ASC) AS rolling_total_layoffs
FROM DATE_CTE
ORDER BY dates ASC;
"""  # Rolling layoffs trend.

# Execute queries and store results (one DataFrame per query, same numbering).
result1_sqlite = pd.read_sql(query1_sqlite, conn)
result2_sqlite = pd.read_sql(query2_sqlite, conn)
result3_sqlite = pd.read_sql(query3_sqlite, conn)
result4_sqlite = pd.read_sql(query4_sqlite, conn)
result5_sqlite = pd.read_sql(query5_sqlite, conn)
result6_sqlite = pd.read_sql(query6_sqlite, conn)
result7_sqlite = pd.read_sql(query7_sqlite, conn)
result8_sqlite = pd.read_sql(query8_sqlite, conn)
result9_sqlite = pd.read_sql(query9_sqlite, conn)
result10_sqlite = pd.read_sql(query10_sqlite, conn)
result11_sqlite = pd.read_sql(query11_sqlite, conn)
result12_sqlite = pd.read_sql(query12_sqlite, conn)
result13_sqlite = pd.read_sql(query13_sqlite, conn)
result14_sqlite = pd.read_sql(query14_sqlite, conn)
result15_sqlite = pd.read_sql(query15_sqlite, conn)

# Export: one workbook, one sheet per query result.
output_path = '/content/drive/My Drive/Data_Analyst/Portfolio_Projects/Projects/SQL/Layoffs_Dataset_After_Analysis.xlsx'

# Sheet title -> DataFrame to write. Dict insertion order is the order in
# which the sheets are written to the workbook.
sheets_to_write = {
    'All Data': result1_sqlite,
    'Max Layoffs': result2_sqlite,
    'Max Min % Laid Off': result3_sqlite,
    '100% Layoffs': result4_sqlite,
    '100% Layoffs by Funds': result5_sqlite,
    'Top 5 Single-Day': result6_sqlite,
    'Top 10 Companies': result7_sqlite,
    'Top 10 Locations': result8_sqlite,
    'Layoffs by Country': result9_sqlite,
    'Layoffs by Year': result10_sqlite,
    'Layoffs by Industry': result11_sqlite,
    'Layoffs by Stage': result12_sqlite,
    'Top 3 by Year': result13_sqlite,
    'Monthly Layoffs': result14_sqlite,
    'Rolling Layoffs': result15_sqlite,
}

# The writer is used as a context manager so the workbook is saved and closed
# when the block exits; index=False keeps the DataFrame index out of the sheet.
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    for sheet_title, result_frame in sheets_to_write.items():
        result_frame.to_excel(writer, sheet_name=sheet_title, index=False)

print("Analysis saved to Excel successfully.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All Data:
         company                  location        industry  total_laid_off  \
0      F-Secure   ['Helsinki', 'Non-U.S.']        Security            70.0   
1          #Paid   ['Toronto', 'Non-U.S.']       Marketing            19.0   
2      1K Kirana  ['Gurugram', 'Non-U.S.']          Retail           600.0   
3        23andMe           ['SF Bay Area']      Healthcare            71.0   
4        23andMe           ['SF Bay Area']      Healthcare            75.0   
...          ...                       ...             ...             ...   
1734      iRobot                ['Boston']        Consumer           350.0   
1735   iSpecimen                ['Boston']      Healthcare             NaN   
1736     inDrive           ['SF Bay Area']  Transportation             NaN   
1737     mPharma     ['Accra', 'Non-U.S.']      Healthcare           150.0   
173