# Cleaning of company-related datasets

We start by loading the datasets and displaying the first few rows of each for an overview.

In [1]:
import pandas as pd

# Read each raw CSV into its own DataFrame; the order of the file names below
# matches the order of the variable names on the left-hand side.
(
    companies_df,
    company_industries_df,
    company_specialities_df,
    employee_counts_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'companies.csv',
        'company_industries.csv',
        'company_specialities.csv',
        'employee_counts.csv',
    )
)

# Preview the first rows of every dataset, keyed by dataset name
datasets_overview = {
    label: frame.head()
    for label, frame in (
        ("companies", companies_df),
        ("company_industries", company_industries_df),
        ("company_specialities", company_specialities_df),
        ("employee_counts", employee_counts_df),
    )
}

datasets_overview

{'companies':    company_id                        name  \
 0        1009                         IBM   
 1        1016               GE HealthCare   
 2        1021                    GE Power   
 3        1025  Hewlett Packard Enterprise   
 4        1028                      Oracle   
 
                                          description  company_size  state  \
 0  At IBM, we do more than work. We create. We cr...           7.0     NY   
 1  Every day millions of people feel the impact o...           7.0      0   
 2  GE Power, part of GE Vernova, is a world energ...           7.0     NY   
 3  Official LinkedIn of Hewlett Packard Enterpris...           7.0  Texas   
 4  We’re a cloud technology company that provides...           7.0  Texas   
 
   country              city zip_code                                address  \
 0      US  Armonk, New York    10504  International Business Machines Corp.   
 1      US           Chicago        0                                      -   

Next, we check for missing values and duplicate rows, and verify that the data types are appropriate.

In [2]:
# Every quality check below runs over the same four frames, so collect them once
# under the label that prefixes each result key.
frames_by_name = {
    "companies": companies_df,
    "company_industries": company_industries_df,
    "company_specialities": company_specialities_df,
    "employee_counts": employee_counts_df,
}

# Number of missing values per column, per dataset
missing_values = {
    f"{label}_missing_values": frame.isnull().sum()
    for label, frame in frames_by_name.items()
}

# Data type of every column, per dataset
data_types = {
    f"{label}_data_types": frame.dtypes
    for label, frame in frames_by_name.items()
}

# Number of fully duplicated rows, per dataset
duplicate_counts = {
    f"{label}_duplicates": frame.duplicated().sum()
    for label, frame in frames_by_name.items()
}

missing_values, data_types, duplicate_counts

({'companies_missing_values': company_id         0
  name               1
  description      143
  company_size    1105
  state              8
  country            0
  city               1
  zip_code          12
  address           11
  url                0
  dtype: int64,
  'company_industries_missing_values': company_id    0
  industry      0
  dtype: int64,
  'company_specialities_missing_values': company_id    0
  speciality    0
  dtype: int64,
  'employee_counts_missing_values': company_id        0
  employee_count    0
  follower_count    0
  time_recorded     0
  dtype: int64},
 {'companies_data_types': company_id        int64
  name             object
  description      object
  company_size    float64
  state            object
  country          object
  city             object
  zip_code         object
  address          object
  url              object
  dtype: object,
  'company_industries_data_types': company_id     int64
  industry      object
  dtype: object,
  'company

Missing values in the name, description, state, city, zip_code, and address columns are filled with the placeholder 'Unknown'.
Missing values in the company_size column are filled with the median value of that column.

In [3]:
# Filling missing values for companies dataset
# For categorical data, we fill missing values with the placeholder 'Unknown'
# For numerical data, we fill with the median value of that column
#
# The fill is done with a single DataFrame.fillna(dict) call that returns a new
# frame. Calling `frame[col].fillna(..., inplace=True)` is chained assignment on
# a column selection: it raises a FutureWarning on recent pandas 2.x releases and
# stops updating the parent frame once Copy-on-Write is the default.
placeholder_columns = ['name', 'description', 'state', 'city', 'zip_code', 'address']

# Per-column replacement values; columns not listed here are left untouched
fill_values = {column: 'Unknown' for column in placeholder_columns}
fill_values['company_size'] = companies_df['company_size'].median()

# companies_df itself is not modified, so re-running this cell is idempotent
companies_df_filled = companies_df.fillna(value=fill_values)

# NOTE(review): the preview of companies_df shows values such as 0 (state,
# zip_code) and '-' (address) that look like placeholders for missing data;
# they are not NaN, so they are not replaced here -- confirm whether they
# should also be treated as missing.

# Check if filling was successful
filled_check = {
    "companies_filled_missing_values": companies_df_filled.isnull().sum(),
    "original_companies_missing_values": companies_df.isnull().sum()
}

filled_check

{'companies_filled_missing_values': company_id      0
 name            0
 description     0
 company_size    0
 state           0
 country         0
 city            0
 zip_code        0
 address         0
 url             0
 dtype: int64,
 'original_companies_missing_values': company_id         0
 name               1
 description      143
 company_size    1105
 state              8
 country            0
 city               1
 zip_code          12
 address           11
 url                0
 dtype: int64}

The time_recorded column in the Employee Counts dataset is converted from Unix timestamps (in seconds) to datetime format. This makes the timestamps more readable and directly usable for any time-based analysis or operations.

In [4]:
import pandas as pd

# Interpret the raw 'time_recorded' values as seconds since the Unix epoch
# and store the resulting datetimes back on the same column
recorded_at = pd.to_datetime(employee_counts_df['time_recorded'], unit='s')
employee_counts_df['time_recorded'] = recorded_at

# Show the first rows to confirm the column now holds datetimes
employee_counts_df.head()

Unnamed: 0,company_id,employee_count,follower_count,time_recorded
0,81149246,6,91,2023-08-21 19:04:04.277973504
1,10033339,3,187,2023-08-21 19:04:04.277973504
2,6049228,20,82,2023-08-21 19:04:05.101318400
3,2641066,45,2336,2023-08-21 19:04:05.923216640
4,96649998,0,2,2023-08-21 19:04:05.924218880


We then inspect the number of rows and the list of columns in the companies dataset after cleaning.

In [5]:
# Counting the number of rows in the cleaned companies dataset
# (companies_df_filled is the frame with missing values filled; inspecting it
# rather than the raw companies_df keeps this check on the cleaned data)
num_rows_companies = companies_df_filled.shape[0]
num_rows_companies

11361

In [6]:
# Listing the column names in the cleaned companies dataset
# (companies_df_filled is the frame with missing values filled; inspecting it
# rather than the raw companies_df keeps this check on the cleaned data)
columns_companies = companies_df_filled.columns.tolist()
columns_companies

['company_id',
 'name',
 'description',
 'company_size',
 'state',
 'country',
 'city',
 'zip_code',
 'address',
 'url']

After attempting to remove duplicates based on the company_id column from the Companies dataset, the number of rows remains the same at 11,361. This indicates that each company_id is unique in the dataset, and there were no duplicate entries based on this identifier.

In [7]:
# Removing duplicates based on company_id
# Since company_id should be unique for each company, we keep only the first occurrence.
# The de-duplication starts from companies_df_filled (the frame with missing values
# filled) so that the earlier cleaning step is carried into the result; starting from
# the raw companies_df would silently drop those fills.
companies_df_unique = companies_df_filled.drop_duplicates(subset='company_id', keep='first')

# Counting the number of rows after removing duplicates
num_rows_companies_unique = companies_df_unique.shape[0]
num_rows_companies_unique

11361

The Employee Counts dataset has been modified to ensure that each company appears only once, retaining the row with the highest follower count for companies that had multiple entries. The number of rows in the modified dataset is now 11,323.

In [8]:
# Reduce employee_counts to a single row per company, keeping the row with the
# largest follower_count.

# Order rows by company_id ascending, then follower_count descending, so that the
# first row seen for each company carries its highest follower count.
sort_columns = ['company_id', 'follower_count']
employee_counts_sorted = employee_counts_df.sort_values(
    by=sort_columns,
    ascending=[True, False],
)

# Keep only the first (i.e. highest-follower) row for every company_id
employee_counts_unique = employee_counts_sorted.drop_duplicates(
    subset=['company_id'],
    keep='first',
)

# Row count after the reduction
num_rows_employee_counts_unique = len(employee_counts_unique)
num_rows_employee_counts_unique

11323

The Company Industries dataset has been modified so that each company appears only once, with all of its associated industries combined into an array of strings in the same row. This format allows for a more consolidated view of each company's industries.

In [9]:
# Collapse company_industries to one row per company, with that company's
# industries gathered into a Python list.

# Select the industry values, grouped by the company they belong to
industries_by_company = company_industries_df.groupby('company_id')['industry']

# Turn each group into a list, then move company_id from the index back to a column
industry_lists = industries_by_company.apply(list)
company_industries_grouped = industry_lists.reset_index()

# Preview the reshaped dataset
company_industries_grouped.head()

Unnamed: 0,company_id,industry
0,1009,"[Information Technology & Services, IT Service..."
1,1016,"[Hospital & Health Care, Hospitals and Health ..."
2,1021,"[Renewables & Environment, Renewable Energy Se..."
3,1025,"[Information Technology & Services, IT Service..."
4,1028,"[Information Technology & Services, IT Service..."


The Company Specialities dataset has been modified so that each company appears only once, with all of its associated specialities combined into an array of strings in the same row. This format provides a consolidated view of each company's specialities.

In [10]:
# Collapse company_specialities to one row per company, with that company's
# specialities gathered into a Python list.

# Select the speciality values, grouped by the company they belong to
specialities_by_company = company_specialities_df.groupby('company_id')['speciality']

# Turn each group into a list, then move company_id from the index back to a column
speciality_lists = specialities_by_company.apply(list)
company_specialities_grouped = speciality_lists.reset_index()

# Preview the reshaped dataset
company_specialities_grouped.head()

Unnamed: 0,company_id,speciality
0,1009,"[Cloud, Mobile, Cognitive, Security, Research,..."
1,1016,"[Healthcare, Biotechnology]"
2,1021,"[Distributed Power, Gasification, Generators, ..."
3,1028,"[enterprise, software, applications, database,..."
4,1033,"[Management Consulting, Systems Integration an..."
