# Data Cleaning - II



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned (already decoded) inputs. Both are read as ISO-8859-1;
# the rounds file uses the default separator, the companies file is
# tab-separated.
rounds = pd.read_csv(
    "rounds_clean.csv",
    encoding="ISO-8859-1",
)
companies = pd.read_csv(
    "companies_clean.csv",
    sep="\t",
    encoding="ISO-8859-1",
)

In [None]:
# Sanity check: both files are expected to hold the same 66368 unique
# companies, with no company appearing in only one of them.

# unique permalink counts (dropna=False so a NaN, if any, still counts once)
print(companies['permalink'].nunique(dropna=False))
print(rounds['company_permalink'].nunique(dropna=False))

# number of rows in rounds whose company_permalink is absent from companies
print((~rounds['company_permalink'].isin(companies['permalink'])).sum())

# Missing Value Treatment

In [None]:
# per-column count of missing values in the companies dataframe
companies.isna().sum()

In [None]:
# per-column count of missing values in the rounds dataframe
rounds.isna().sum()

In [None]:
# Inner-join companies with rounds: companies.permalink is matched against
# rounds.company_permalink, and the combined frame is stored as `master`.
master = companies.merge(
    rounds,
    how="inner",
    left_on="permalink",
    right_on="company_permalink",
)
master.head()

In [None]:
# display the column labels of the merged master dataframe
master.columns

In [None]:
# company_permalink was the right-hand join key matched against permalink,
# so it is redundant after the merge; drop it
master = master.drop(columns=['company_permalink'])

In [None]:
# display the column labels again to confirm the result of the drop
master.columns

In [None]:
# number of missing values in each column of master
master.isna().sum()

In [None]:
# percentage of missing values per column: NaN count divided by the number
# of rows, scaled to 100 and rounded to 2 decimals
master.isnull().sum().div(len(master.index)).mul(100).round(2)

In [None]:
# remove the columns that are not carried forward in the analysis
master = master.drop(
    columns=[
        'funding_round_code',
        'homepage_url',
        'founded_at',
        'state_code',
        'region',
        'city',
    ]
)
master.head()

In [None]:
# re-check the percentage of missing values per column (NaN count over row
# count, times 100, rounded to 2 decimals)
master.isnull().sum().div(len(master.index)).mul(100).round(2)

In [None]:
# summary statistics (describe()) of the raised_amount_usd column
master['raised_amount_usd'].describe()

In [None]:
# Keep only the rows where raised_amount_usd is present. Series.notna() is
# used instead of np.isnan so the filter also works if the column was parsed
# with an object dtype (np.isnan raises TypeError there), and so it matches
# the pandas-native null checks applied to the other columns.
master = master[master['raised_amount_usd'].notna()]

# percentage of missing values left in each column, rounded to 2 decimals
round(100*(master.isnull().sum()/len(master.index)), 2)

In [None]:
# Cast country_code to the categorical dtype; the result is stored in
# country_codes and master itself is not reassigned here.
country_codes = master['country_code'].astype('category')

# display how often each country code occurs
country_codes.value_counts()

In [None]:
# percentage share of each country_code: its count divided by the total
# number of rows in master, scaled to 100
master['country_code'].value_counts().div(len(master.index)).mul(100)

In [None]:
# keep only the rows that have a country_code
master = master[master['country_code'].notna()]

# percentage of missing values remaining per column, rounded to 2 decimals
master.isnull().sum().div(len(master.index)).mul(100).round(2)

In [None]:
# keep only the rows that have a category_list value
master = master[master['category_list'].notna()]

# percentage of missing values remaining per column, rounded to 2 decimals
master.isnull().sum().div(len(master.index)).mul(100).round(2)

In [None]:
# write the cleaned dataframe to a new comma-separated file, without the
# index column
master.to_csv(
    "master_df.csv",
    index=False,
    sep=',',
)

In [None]:
# print the info() summary of master (used here to check the number of rows etc.)
master.info()

In [None]:
# percentage of the row count of rounds that remains in master after the
# missing value treatment (approx 77% according to the original note)
len(master.index) / len(rounds.index) * 100