# Companies Collection Denormalization Process

1. Companies Dataset: Contains columns such as company_id, name, description, company_size, state, country, city, zip_code, address, and url.
2. Company Industries Dataset: Contains company_id and speciality, which appears to represent various industries or areas of focus for each company.
3. Company Specialities Dataset: Also contains company_id and speciality. It seems identical to the Company Industries dataset, so I will check if there's any difference or if it's a duplicate.
4. Employee Counts Dataset: Includes company_id, employee_count, follower_count, and time_recorded.

In [34]:
import pandas as pd

# Read the four cleaned source tables from disk
companies_dataset = pd.read_csv('cleaned_companies.csv')
company_specialities = pd.read_csv('cleaned_company_specialities.csv')
employee_counts = pd.read_csv('cleaned_employee_counts.csv')
company_industries = pd.read_csv('cleaned_company_industries.csv')

# Left-join every table onto the companies table by 'company_id', one join at a time.
# A company that matches several rows in a joined table is repeated once per match.
with_specialities = pd.merge(
    companies_dataset, company_specialities, how='left', on='company_id'
)
with_employee_counts = pd.merge(
    with_specialities, employee_counts, how='left', on='company_id'
)
# Both sides of this last join carry a 'speciality' column, so the suffixes
# tell them apart: 'speciality_speciality' and 'speciality_industry'
denormalized_data = pd.merge(
    with_employee_counts,
    company_industries,
    how='left',
    on='company_id',
    suffixes=('_speciality', '_industry'),
)

# Persist the joined table without the DataFrame index (optional)
denormalized_data.to_csv('denormalized_companies_dataset.csv', index=False)

# Preview the first rows of the joined table (optional)
denormalized_data.head()

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,speciality_speciality,employee_count,follower_count,time_recorded,speciality_industry
0,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,cloud,316130.0,16114398.0,2023-08-24 04:19:39.000000000,cloud
1,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,cloud,316130.0,16114398.0,2023-08-24 04:19:39.000000000,mobile
2,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,cloud,316130.0,16114398.0,2023-08-24 04:19:39.000000000,cognitive
3,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,cloud,316130.0,16114398.0,2023-08-24 04:19:39.000000000,security
4,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,cloud,316130.0,16114398.0,2023-08-24 04:19:39.000000000,research


In [35]:
# Number of rows in the fully joined table
len(denormalized_data)

1494521

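The dataset contains a total of 1,494,521 rows. The count is this large because each left join on company_id is one-to-many: every company is repeated once for each combination of its specialities, employee-count records, and industries.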

# Attempting to denormalize more to reduce the document size

In [36]:
# Read the four cleaned CSV files into DataFrames
companies_df = pd.read_csv('cleaned_companies.csv')
company_industries_df = pd.read_csv('cleaned_company_industries.csv')
company_specialities_df = pd.read_csv('cleaned_company_specialities.csv')
employee_counts_df = pd.read_csv('cleaned_employee_counts.csv')

# Map a readable label to the first rows of each table so that
# their columns and sample values can be inspected side by side
datasets = {
    label: frame.head()
    for label, frame in (
        ("Companies", companies_df),
        ("Company Industries", company_industries_df),
        ("Company Specialities", company_specialities_df),
        ("Employee Counts", employee_counts_df),
    )
}

datasets

{'Companies':    company_id                        name  \
 0        1009                         ibm   
 1        1016               ge healthcare   
 2        1021                    ge power   
 3        1025  hewlett packard enterprise   
 4        1028                      oracle   
 
                                          description  company_size  \
 0  at ibm, we do more than work. we create. we cr...             7   
 1  every day millions of people feel the impact o...             7   
 2  ge power, part of ge vernova, is a world energ...             7   
 3  official linkedin of hewlett packard enterpris...             7   
 4  we’re a cloud technology company that provides...             7   
 
            state country              city       zip_code  \
 0       new york      us  armonk, new york          10504   
 1  Not Available      us           chicago  not available   
 2       new york      us       schenectady          12345   
 3          texas      us        

In [37]:
# Test whether the Company Industries and Company Specialities tables are the
# same data; DataFrame.equals compares the two frames as a whole
are_datasets_duplicates = company_industries_df.equals(other=company_specialities_df)

are_datasets_duplicates

True

In [38]:
# Collapse each company's specialities into one comma-separated string so the
# companies table gains a single 'speciality' value per company instead of one
# row per speciality.
# Rows with a missing speciality are dropped first: str.join raises TypeError on
# NaN (a float). Values are cast to str so any non-string entry is joined safely.
aggregated_specialities = (
    company_industries_df
    .dropna(subset=['speciality'])
    .groupby('company_id')['speciality']
    .agg(lambda values: ', '.join(values.astype(str)))
    .reset_index()
)

# Attach the aggregated specialities to the companies table; the left join keeps
# companies that have no speciality (their 'speciality' is left missing)
merged_df = pd.merge(companies_df, aggregated_specialities, on='company_id', how='left')

# Attach the employee-count records; a company with several records in
# employee_counts_df yields one output row per record
final_denormalized_df = pd.merge(merged_df, employee_counts_df, on='company_id', how='left')

# Displaying the first few rows of the denormalized dataset
final_denormalized_df.head()

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,speciality,employee_count,follower_count,time_recorded
0,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,"cloud, mobile, cognitive, security, research, ...",316130.0,16114398.0,2023-08-24 04:19:39.000000000
1,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,"cloud, mobile, cognitive, security, research, ...",316130.0,16114399.0,2023-08-24 04:19:39.000000000
2,1016,ge healthcare,every day millions of people feel the impact o...,7,Not Available,us,chicago,not available,Not Available,https://www.linkedin.com/company/gehealthcare,"healthcare, biotechnology",53495.0,2060378.0,2023-08-24 04:55:10.000000000
3,1016,ge healthcare,every day millions of people feel the impact o...,7,Not Available,us,chicago,not available,Not Available,https://www.linkedin.com/company/gehealthcare,"healthcare, biotechnology",53495.0,2060382.0,2023-08-24 04:59:52.000000000
4,1016,ge healthcare,every day millions of people feel the impact o...,7,Not Available,us,chicago,not available,Not Available,https://www.linkedin.com/company/gehealthcare,"healthcare, biotechnology",53495.0,2060385.0,2023-08-24 05:05:24.000000000


In [39]:
# Build a narrower copy of the table without the follower count and the
# recording timestamp
final_denormalized_df_reduced = final_denormalized_df.drop(
    ['follower_count', 'time_recorded'],
    axis='columns',
)

# Preview the first rows of the narrower table
final_denormalized_df_reduced.head()

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,speciality,employee_count
0,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,"cloud, mobile, cognitive, security, research, ...",316130.0
1,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,"cloud, mobile, cognitive, security, research, ...",316130.0
2,1016,ge healthcare,every day millions of people feel the impact o...,7,Not Available,us,chicago,not available,Not Available,https://www.linkedin.com/company/gehealthcare,"healthcare, biotechnology",53495.0
3,1016,ge healthcare,every day millions of people feel the impact o...,7,Not Available,us,chicago,not available,Not Available,https://www.linkedin.com/company/gehealthcare,"healthcare, biotechnology",53495.0
4,1016,ge healthcare,every day millions of people feel the impact o...,7,Not Available,us,chicago,not available,Not Available,https://www.linkedin.com/company/gehealthcare,"healthcare, biotechnology",53495.0


In [40]:
# Count the rows of the reduced table
row_count = len(final_denormalized_df_reduced)
row_count

12581

The updated dataset contains a total of 12,581 rows.

Companies are repeated across multiple rows in the dataset: there are 6,063 unique companies but 12,581 rows, so some companies are listed more than once. The repetition most likely comes from the Employee Counts dataset, which can hold several records per company taken at different times (different time_recorded values), each of which produces its own row in the merge; variations in other attributes may also contribute.

In [41]:
# Count the distinct company ids and compare with the total row count:
# more rows than distinct ids means at least one company occupies several rows
company_ids = final_denormalized_df_reduced['company_id']
unique_company_count = company_ids.nunique()
are_companies_repeated = unique_company_count < row_count

unique_company_count, are_companies_repeated

(6063, True)

In [42]:
# Reduce the table to exactly one row per company, keeping the row with the
# largest employee count.

# Order rows by company_id ascending and, within each company, by
# employee_count descending, so the largest count comes first for every company
sorted_dataset = final_denormalized_df_reduced.sort_values(
    by=['company_id', 'employee_count'],
    ascending=[True, False],
)

# Keep only the first row seen for each company_id
unique_companies_dataset = sorted_dataset.drop_duplicates(
    subset='company_id',
    keep='first',
)

# Preview the first rows of the one-row-per-company table
unique_companies_dataset.head()

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,speciality,employee_count
0,1009,ibm,"at ibm, we do more than work. we create. we cr...",7,new york,us,"armonk, new york",10504,international business machines corp.,https://www.linkedin.com/company/ibm,"cloud, mobile, cognitive, security, research, ...",316130.0
2,1016,ge healthcare,every day millions of people feel the impact o...,7,Not Available,us,chicago,not available,Not Available,https://www.linkedin.com/company/gehealthcare,"healthcare, biotechnology",53495.0
22,1021,ge power,"ge power, part of ge vernova, is a world energ...",7,new york,us,schenectady,12345,1 river road,https://www.linkedin.com/company/gepower,"distributed power, gasification, generators, h...",26963.0
23,1025,hewlett packard enterprise,official linkedin of hewlett packard enterpris...,7,texas,us,houston,77389,1701 e mossy oaks rd spring,https://www.linkedin.com/company/hewlett-packa...,,70995.0
25,1028,oracle,we’re a cloud technology company that provides...,7,texas,us,austin,78741,2300 oracle way,https://www.linkedin.com/company/oracle,"enterprise, software, applications, database, ...",202050.0


In [43]:
# Write the one-row-per-company table to disk as CSV, omitting the index column
unique_companies_dataset_file_path = 'unique_companies_dataset.csv'
unique_companies_dataset.to_csv(
    path_or_buf=unique_companies_dataset_file_path,
    index=False,
)

unique_companies_dataset_file_path

'unique_companies_dataset.csv'

In [44]:
# Count the rows of the one-row-per-company table
unique_companies_row_count = len(unique_companies_dataset)
unique_companies_row_count

6063