# Dataset - Layoffs 2022

In [1]:
import pandas as pd
import numpy as np
import statistics
import seaborn as sns

## Reading the CSV file

In [2]:
# Load the raw layoffs data; the path is relative to the notebook's working directory.
df = pd.read_csv('layoffs.csv')

In [3]:
# Summarise column names, non-null counts and dtypes to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1714 entries, 0 to 1713
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   company              1714 non-null   object 
 1   location             1714 non-null   object 
 2   industry             1711 non-null   object 
 3   total_laid_off       1307 non-null   float64
 4   percentage_laid_off  1257 non-null   float64
 5   date                 1714 non-null   object 
 6   country              1714 non-null   object 
 7   funds_raised         1591 non-null   float64
dtypes: float64(3), object(5)
memory usage: 107.2+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,total_laid_off,percentage_laid_off,funds_raised
count,1307.0,1257.0,1591.0
mean,183.005356,0.25388,873.347137
std,552.186853,0.267185,6418.42338
min,0.0,0.0,0.0
25%,27.0,0.09,44.0
50%,60.0,0.17,132.0
75%,146.0,0.3,380.5
max,11000.0,1.0,121900.0


In [5]:
# Check whether the dataset itself contains any missing (NaN) values.
# df.isna() flags every missing cell and .values.any() collapses that to a single bool.
# Testing a hand-made float("nan") instead would always print True, whatever the data holds.

print(f" Not defined values present in the dataset?  : {df.isna().values.any()}")

 Not defined values present in the dataset?  : True


In [6]:
# Show the last 10 rows, several of which contain missing (NaN) values.

df.tail(10)

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,country,funds_raised
1704,Popin,New York City,Fitness,0.0,1.0,3/19/20,United States,13.0
1705,Tuft & Needle,Phoenix,Retail,,,3/19/20,United States,0.0
1706,Flytedesk,Boulder,Marketing,4.0,0.2,3/18/20,United States,4.0
1707,Inspirato,Denver,Travel,130.0,0.22,3/16/20,United States,79.0
1708,Help.com,Austin,Support,16.0,1.0,3/16/20,United States,6.0
1709,Service,Los Angeles,Travel,,1.0,3/16/20,United States,5.1
1710,HopSkipDrive,Los Angeles,Transportation,8.0,0.1,3/13/20,United States,45.0
1711,Panda Squad,SF Bay Area,Consumer,6.0,0.75,3/13/20,United States,1.0
1712,Tamara Mellon,Los Angeles,Retail,20.0,0.4,3/12/20,United States,90.0
1713,EasyPost,Salt Lake City,Logistics,75.0,,3/11/20,United States,12.0


## Checking column by column whether missing (NaN) values are present

In [7]:
# Report, column by column, whether any missing (NaN) values are present.
columns_to_check = ('total_laid_off', 'percentage_laid_off', 'funds_raised', 'location')
for column_name in columns_to_check:
    has_missing = df[column_name].isnull().values.any()
    print(f" Are there NaN values in column {column_name}? : {has_missing}")


 Are there NaN values in column total_laid_off? : True
 Are there NaN values in column percentage_laid_off? : True
 Are there NaN values in column funds_raised? : True
 Are there NaN values in column location? : False


In [8]:
# Preview the last 10 rows again before the missing values are filled in below.
df.tail(10)

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,country,funds_raised
1704,Popin,New York City,Fitness,0.0,1.0,3/19/20,United States,13.0
1705,Tuft & Needle,Phoenix,Retail,,,3/19/20,United States,0.0
1706,Flytedesk,Boulder,Marketing,4.0,0.2,3/18/20,United States,4.0
1707,Inspirato,Denver,Travel,130.0,0.22,3/16/20,United States,79.0
1708,Help.com,Austin,Support,16.0,1.0,3/16/20,United States,6.0
1709,Service,Los Angeles,Travel,,1.0,3/16/20,United States,5.1
1710,HopSkipDrive,Los Angeles,Transportation,8.0,0.1,3/13/20,United States,45.0
1711,Panda Squad,SF Bay Area,Consumer,6.0,0.75,3/13/20,United States,1.0
1712,Tamara Mellon,Los Angeles,Retail,20.0,0.4,3/12/20,United States,90.0
1713,EasyPost,Salt Lake City,Logistics,75.0,,3/11/20,United States,12.0


### Handling the missing data

In [9]:
# Replace the NaN values in column 'total_laid_off' with 0.
# The result is assigned back to the column rather than calling fillna(inplace=True) on
# the df[...] selection: that chained in-place form emits a FutureWarning in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.
# NOTE(review): a filled 0 is indistinguishable from a genuinely reported 0 — confirm this
# is acceptable for the later analysis.

df['total_laid_off'] = df['total_laid_off'].fillna(0)
df

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,country,funds_raised
0,BloomTech,SF Bay Area,Education,88.0,0.50,12/1/22,United States,
1,Podium,Lehi,Support,0.0,0.12,12/1/22,United States,419.0
2,DoorDash,SF Bay Area,Food,1250.0,0.06,11/30/22,United States,2500.0
3,Kraken,SF Bay Area,Crypto,1100.0,0.30,11/30/22,United States,134.0
4,Happy Money,Los Angeles,Finance,158.0,0.34,11/30/22,United States,191.0
...,...,...,...,...,...,...,...,...
1709,Service,Los Angeles,Travel,0.0,1.00,3/16/20,United States,5.1
1710,HopSkipDrive,Los Angeles,Transportation,8.0,0.10,3/13/20,United States,45.0
1711,Panda Squad,SF Bay Area,Consumer,6.0,0.75,3/13/20,United States,1.0
1712,Tamara Mellon,Los Angeles,Retail,20.0,0.40,3/12/20,United States,90.0


In [10]:
# Replace the NaN values in column 'percentage_laid_off' with 0.
# The result is assigned back to the column rather than calling fillna(inplace=True) on
# the df[...] selection: that chained in-place form emits a FutureWarning in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.

df['percentage_laid_off'] = df['percentage_laid_off'].fillna(0)
df

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,country,funds_raised
0,BloomTech,SF Bay Area,Education,88.0,0.50,12/1/22,United States,
1,Podium,Lehi,Support,0.0,0.12,12/1/22,United States,419.0
2,DoorDash,SF Bay Area,Food,1250.0,0.06,11/30/22,United States,2500.0
3,Kraken,SF Bay Area,Crypto,1100.0,0.30,11/30/22,United States,134.0
4,Happy Money,Los Angeles,Finance,158.0,0.34,11/30/22,United States,191.0
...,...,...,...,...,...,...,...,...
1709,Service,Los Angeles,Travel,0.0,1.00,3/16/20,United States,5.1
1710,HopSkipDrive,Los Angeles,Transportation,8.0,0.10,3/13/20,United States,45.0
1711,Panda Squad,SF Bay Area,Consumer,6.0,0.75,3/13/20,United States,1.0
1712,Tamara Mellon,Los Angeles,Retail,20.0,0.40,3/12/20,United States,90.0


In [11]:
# Replace the NaN values in column 'funds_raised' with 0.
# The result is assigned back to the column rather than calling fillna(inplace=True) on
# the df[...] selection: that chained in-place form emits a FutureWarning in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.

df['funds_raised'] = df['funds_raised'].fillna(0)
df

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,country,funds_raised
0,BloomTech,SF Bay Area,Education,88.0,0.50,12/1/22,United States,0.0
1,Podium,Lehi,Support,0.0,0.12,12/1/22,United States,419.0
2,DoorDash,SF Bay Area,Food,1250.0,0.06,11/30/22,United States,2500.0
3,Kraken,SF Bay Area,Crypto,1100.0,0.30,11/30/22,United States,134.0
4,Happy Money,Los Angeles,Finance,158.0,0.34,11/30/22,United States,191.0
...,...,...,...,...,...,...,...,...
1709,Service,Los Angeles,Travel,0.0,1.00,3/16/20,United States,5.1
1710,HopSkipDrive,Los Angeles,Transportation,8.0,0.10,3/13/20,United States,45.0
1711,Panda Squad,SF Bay Area,Consumer,6.0,0.75,3/13/20,United States,1.0
1712,Tamara Mellon,Los Angeles,Retail,20.0,0.40,3/12/20,United States,90.0


In [12]:
# Checking if any null values are still present

# Each message names the column that is actually being tested; the last line checks
# 'funds_raised', so it is labelled accordingly.
print(f" Are there NaN values in column total_laid_off? : {df['total_laid_off'].isnull().values.any()}")
print(f" Are there NaN values in column percentage_laid_off? : {df['percentage_laid_off'].isnull().values.any()}")
print(f" Are there NaN values in column funds_raised? : {df['funds_raised'].isnull().values.any()}")

 Are there NaN values in column total_laid_off? : False
 Are there NaN values in column percentage_laid_off? : False
 Are there NaN values in column percentage_laid_off? : False


In [None]:
# Drop the 'stage' column if it is present: it is described as mostly unknown values and
# is not required for any computation.
# errors='ignore' keeps the cell runnable on a fresh kernel — the df.info() output above
# lists no 'stage' column, so an unconditional drop would raise KeyError and stop Run All.

df = df.drop(columns='stage', errors='ignore')
df


## The data is now cleaned; we can move on to the analysis

In [13]:
# Write the cleaned frame to disk in CSV format.
# NOTE(review): the target name has no '.csv' extension, and with to_csv's default
# index=True the row index is written as an extra unnamed first column — confirm that
# whatever reads this file expects both.
df.to_csv('layoffs_updated')