# Introduction

The notebook is intended to perform a first Exploratory Data Analysis

In [1]:
# Import Standard Modules
import pandas as pd
import collections
import re

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS

# Read Data

In [2]:
data = pd.read_csv('./../data/2014_Financial_Data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3808 entries, 0 to 3807
Columns: 225 entries, Unnamed: 0 to Class
dtypes: float64(222), int64(1), object(2)
memory usage: 6.5+ MB


In [4]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Sector,2015 PRICE VAR [%],Class
0,PG,74401000000.0,-0.0713,39030000000.0,35371000000.0,0.0,21461000000.0,21461000000.0,13910000000.0,709000000.0,...,-0.0187,-0.0217,0.0359,0.0316,0.1228,0.0,-0.1746,Consumer Defensive,-9.323276,0
1,VIPS,3734148000.0,1.1737,2805625000.0,928522600.0,108330300.0,344141400.0,793926700.0,134595900.0,12148690.0,...,,,,,,1.6484,1.7313,Consumer Defensive,-25.512193,0
2,KR,98375000000.0,0.0182,78138000000.0,20237000000.0,0.0,15196000000.0,17512000000.0,2725000000.0,443000000.0,...,0.0618,0.0981,0.1886,0.3268,0.2738,0.0,0.0234,Consumer Defensive,33.118297,1
3,RAD,25526410000.0,0.0053,18202680000.0,7323734000.0,0.0,6561162000.0,6586482000.0,737252000.0,424591000.0,...,0.0211,-0.051,-0.0189,0.1963,-0.0458,0.0,-0.006,Consumer Defensive,2.752291,1
4,GIS,17909600000.0,0.0076,11539800000.0,6369800000.0,0.0,3474300000.0,3412400000.0,2957400000.0,302400000.0,...,0.0257,0.009,0.0215,0.0274,0.1025,0.0,-0.022,Consumer Defensive,12.897715,1


# Analyse Columns

## Compute Top Common Words in the Column Names

In [5]:
# Create dictionary of columns
columns_dictionary =  {index:column for index, column in enumerate(data.columns)}

In [6]:
# Create a list of words in the columns
column_words_list = [element.lower() for row in [column.split(' ') for column in data.columns] for element in row]

In [7]:
# Add custom stopwords - NOTE: It is based on manul exploration
stop_words = STOPWORDS.union(set(['(per', 'per', 'to']))

# Remove stop words
column_words_list_cleaned = (remove_stopwords(' '.join(column_words_list))).split(' ')

print(f"Words removed: {(len(column_words_list) - len(column_words_list_cleaned))}")

Words removed: 55


In [8]:
# Count the number of occurences of each word in the column
words_counter = collections.Counter(column_words_list_cleaned)

In [11]:
# Compute the top most common words
top_n_words = 10
most_common_words = [item[0] for item in words_counter.most_common(top_n_words) if bool(re.match('^[a-zA-Z0-9]*$', item[0]))]

print(f"Top {top_n_words} most common words: {most_common_words}")

Top 10 most common words: ['growth', 'cash', 'net', 'share', 'income', 'flow', 'revenue', 'operating']


## Select Relevant Columns

In [56]:
# Select relevant columns
relevant_column = list(set([column for column in data.columns for word in most_common_words if word in column.lower()]))
relevant_column.sort()

In [57]:
len(relevant_column)

102

In [58]:
relevant_column

['10Y Dividend per Share Growth (per Share)',
 '10Y Net Income Growth (per Share)',
 '10Y Operating CF Growth (per Share)',
 '10Y Revenue Growth (per Share)',
 '10Y Shareholders Equity Growth (per Share)',
 '3Y Dividend per Share Growth (per Share)',
 '3Y Net Income Growth (per Share)',
 '3Y Operating CF Growth (per Share)',
 '3Y Revenue Growth (per Share)',
 '3Y Shareholders Equity Growth (per Share)',
 '5Y Dividend per Share Growth (per Share)',
 '5Y Net Income Growth (per Share)',
 '5Y Operating CF Growth (per Share)',
 '5Y Revenue Growth (per Share)',
 '5Y Shareholders Equity Growth (per Share)',
 'Asset Growth',
 'Book Value per Share',
 'Book Value per Share Growth',
 'Capex per Share',
 'Capex to Operating Cash Flow',
 'Capex to Revenue',
 'Cash and cash equivalents',
 'Cash and short-term investments',
 'Cash per Share',
 'Consolidated Income',
 'Cost of Revenue',
 'Debt Growth',
 'Deferred revenue',
 'Dividend per Share',
 'Dividends per Share Growth',
 'EBIT Growth',
 'EPS Di

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f1efe245-29af-4be1-bb79-055f4abb0e16' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>