# Data Cleaning


In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Load the two raw inputs.
# Both are decoded as ISO-8859-1 to avoid the pandas encoding error;
# companies.txt is tab-delimited.
rounds = pd.read_csv(
    "rounds2.csv",
    encoding="ISO-8859-1",
)
companies = pd.read_csv(
    "companies.txt",
    sep="\t",
    encoding="ISO-8859-1",
)


In [None]:
# Preview the first five rows of rounds.
print(rounds.head(n=5))

In [None]:
# Inspect the structure of rounds, then its (rows, columns) shape.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() would echo a stray "None"
# after the report.
rounds.info()
print()  # blank line between the info report and the shape
print(rounds.shape)


In [None]:
# Preview the first five rows of companies.
companies.head(n=5)

In [None]:
# Print the structural summary of companies produced by DataFrame.info().
# info() prints the report itself and returns None, so no print() is needed.
companies.info()

In [None]:
# Count the distinct permalink values in companies.
# dropna=False keeps a missing value (if any) in the count.
companies["permalink"].nunique(dropna=False)

In [None]:
# Lowercase every permalink so that values differing only in letter case
# become identical, then preview the first five rows.
companies["permalink"] = companies.permalink.str.lower()
companies.head(n=5)


In [None]:
# Recount the distinct permalink values in companies after lowercasing.
# dropna=False keeps a missing value (if any) in the count.
companies["permalink"].nunique(dropna=False)

In [None]:
# Count the distinct companies referenced in rounds.
# The key column here is company_permalink, not permalink as in companies.
# dropna=False keeps a missing value (if any) in the count.
rounds["company_permalink"].nunique(dropna=False)


In [None]:
# Lowercase company_permalink so that values differing only in letter case
# become identical, then preview the first five rows.
rounds["company_permalink"] = rounds.company_permalink.str.lower()
rounds.head(n=5)

In [None]:
# Recount the distinct company_permalink values in rounds after lowercasing.
# dropna=False keeps a missing value (if any) in the count.
rounds["company_permalink"].nunique(dropna=False)

In [None]:
# Rows of rounds whose company_permalink has no match among the permalinks
# in companies (~ negates the isin membership mask).
rounds[~rounds["company_permalink"].isin(companies["permalink"])]

In [None]:
# Re-read the raw file into a separate frame and display the rows at the
# positions where the permalinks contain weird characters.
rounds_original = pd.read_csv("rounds2.csv", encoding="ISO-8859-1")
rounds_original.iloc[[29597, 31863, 45176, 58473]]

In [None]:
# import chardet

# rawdata = open('rounds2.csv', 'rb').read()
# result = chardet.detect(rawdata)
# charenc = result['encoding']
# print(charenc)

In [None]:
# print(result)

In [None]:
# Tried a different encoding: reading with encoding="cp1254" throws an error,
# so the attempt below is left commented out.
# rounds_original = pd.read_csv("rounds2.csv", encoding="cp1254")
# rounds_original.iloc[[29597, 31863, 45176], :]

In [None]:
# Drop every non-ASCII character from the permalinks: encode to UTF-8 bytes,
# then decode as ASCII while ignoring the bytes that are not valid ASCII.
rounds["company_permalink"] = (
    rounds["company_permalink"]
    .str.encode("utf-8")
    .str.decode("ascii", errors="ignore")
)
# Re-check which rows of rounds still have no matching permalink in companies.
rounds[~rounds["company_permalink"].isin(companies["permalink"])]

In [None]:
# Recount the distinct company_permalink values in rounds.
# dropna=False keeps a missing value (if any) in the count.
rounds["company_permalink"].nunique(dropna=False)

In [None]:
# Rows of companies whose permalink has no match among the company_permalink
# values in rounds (~ negates the isin membership mask).
companies[~companies["permalink"].isin(rounds["company_permalink"])]


In [None]:
# Drop every non-ASCII character from the companies permalinks: encode to
# UTF-8 bytes, then decode as ASCII while ignoring the bytes that are not
# valid ASCII.
companies["permalink"] = (
    companies["permalink"]
    .str.encode("utf-8")
    .str.decode("ascii", errors="ignore")
)


In [None]:
# Check again for rows of companies whose permalink has no match among the
# company_permalink values in rounds (~ negates the isin membership mask).
companies[~companies["permalink"].isin(rounds["company_permalink"])]


In [None]:
# Persist the cleaned frames without the row index:
# rounds as comma-separated, companies as tab-separated.
rounds.to_csv("rounds_clean.csv", index=False, sep=",")
companies.to_csv("companies_clean.csv", index=False, sep="\t")