## The main objective of this notebook is to explore the public and private income statement files

In [212]:
import pandas as pd
import zipfile
import os

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [220]:
import re
def getIndicatorsValues(df,row_index):
    '''
    Extract the numeric indicator values stored in one row of a file.

    The first two cells of the row are dropped; of the remaining cells, the
    ones at odd positions (1, 3, 5, ...) are read as values.

    inputs: A dataframe containing the data of a file and the index of a row
    output: A float Series with one entry per value cell; a cell that cannot
            be read as a finite number (text, blank, missing value) is NaN
    Exception: IndexError if the row index is out of bounds
    '''
    row = df.iloc[row_index].iloc[2:]

    def _as_float(cell):
        # float() understands scientific notation such as "1e-05"; testing the
        # text for letters would wrongly discard those values as non-numeric.
        try:
            number = float(str(cell))
        except ValueError:
            return np.nan
        # Literal "nan" / "inf" text parses as a float; report it as missing.
        return number if np.isfinite(number) else np.nan

    values = [_as_float(row.iloc[pos]) for pos in range(1, len(row), 2)]
    # Explicit dtype keeps the result numeric even when the row has no values.
    return pd.Series(values, dtype=float)

def getListOfDates(df, row_index):
    '''
    Collect the date cells found in one row of a file.

    inputs: A dataframe containing the data of a file and the index of a row
    output: A Series holding, once the first two cells of the row are dropped,
            the cells at even positions 2, 4, 6, ... (position 0 is left out)
    Exception: IndexError if the row index is out of bounds
    '''
    trimmed = df.iloc[row_index].iloc[2:]

    # NOTE(review): position 0 is even but is skipped here, so this returns one
    # cell fewer than the even positions available - confirm against the file
    # layout that the first date cell is really meant to be dropped.
    date_cells = []
    for pos in range(2, len(trimmed), 2):
        date_cells.append(trimmed.iloc[pos])
    return pd.Series(date_cells)

def get_file_identifier(df):
    '''
    Return the identifier of a file, i.e. the top-left cell of its dataframe.

    input: dataframe
    output: The value stored at row 0, column 0; None when that cell cannot
            be read (the error is printed instead of being raised)
    '''
    try:
        identifier = df.iloc[0, 0]
    except Exception as err:
        # Best effort: report the problem and fall back to None.
        print(err)
        return None
    return identifier

In [265]:
#Create a dictionary with the company id, the dates and the indicator values
def getDataDict(df):
    '''
    Gather the content of one file into a dictionary.

    input: dataframe containing the data of a file
    output: A dict with the key "CompanyId" (result of get_file_identifier),
            the key "Dates" (result of getListOfDates on row 1) and, for every
            row i of the dataframe, the key getIndicatorName(df, i) mapped to
            getIndicatorsValues(df, i)
    '''
    # NOTE(review): getIndicatorName is not defined in this part of the
    # notebook - make sure the cell defining it has been run first.
    datadict = {
        "CompanyId": get_file_identifier(df),
        #A problem to solve: get the right cells for date
        "Dates": getListOfDates(df, 1),
    }
    for row_pos in range(df.shape[0]):
        datadict[getIndicatorName(df, row_pos)] = getIndicatorsValues(df, row_pos)
    return datadict


In [266]:

#NotebookApp.iopub_data_rate_limit=10000000.0 (bytes/sec)
#NotebookApp.rate_limit_window=3.0 (secs)

# Take a file and get a new dataframe built from its data dictionary
def dfToDf(df):
    '''
    Build a dataframe from the dictionary returned by getDataDict(df).

    input: dataframe containing the data of a file
    output: A dataframe with one column per key of that dictionary
    '''
    return pd.DataFrame(getDataDict(df))


In [267]:
#A function that takes a zip of csv files and transforms it into one dataframe with the data of all the csv files
def transform(zipFile, delimiter):
    """
    Build one dataframe from all the csv files stored in a zip archive.

    input: the zip file (anything zipfile.ZipFile accepts) and the delimiter
           used inside the csv files

    output: A dataframe containing all the data for the zip file, i.e. the
            frames produced by dfToDf for each file, stacked row-wise

    Exception: ValueError (raised by pd.concat) when the archive holds no file
    """
    frames = []

    # Context managers close the archive and every member stream, even when
    # parsing one of the files fails.
    with zipfile.ZipFile(zipFile) as archive:
        for member in archive.infolist():
            # Directory entries carry no csv data.
            if member.is_dir():
                continue

            with archive.open(member.filename) as csv_file:
                # The first line is consumed here to count the columns, so
                # read_csv below starts at the second line of the file.
                # NOTE(review): presumably line 1 is a header row - confirm.
                # Decoding (instead of str() on the bytes) lets delimiters
                # such as a tab be counted correctly.
                first_line = csv_file.readline().decode("utf-8", errors="replace")
                num_cols = first_line.count(delimiter)

                # Even-numbered columns other than column 0 are parsed as dates.
                date_columns = list(range(2, num_cols, 2))

                # The delimiter is passed on so that files not separated by
                # commas are split the same way they were counted.
                raw = pd.read_csv(csv_file, header=None, sep=delimiter,
                                  parse_dates=date_columns)

            frames.append(dfToDf(raw))

    return pd.concat(frames, axis=0)
    

In [258]:
# Build one dataframe from the csv files of the software public quarterly income statements archive.
dfs_software_public_income = transform('software_public_quarterly_income_statements_1.zip', ',')

In [259]:
# Use the dates as the index. Reassigning (instead of inplace=True) and
# checking for the column first lets this cell be re-run without a KeyError.
if "Dates" in dfs_software_public_income.columns:
    dfs_software_public_income = dfs_software_public_income.set_index("Dates")

In [260]:
# Preview the first 8 rows.
dfs_software_public_income.head(8)

Unnamed: 0_level_0,CompanyId,IQ_OTHER_UNUSUAL_SUPPL,IQ_NI_COMPANY,IQ_RESTRUCTURE,IQ_COST_REV,IQ_OTHER_OPER,IQ_RD_EXP_FN,IQ_DILUT_EPS_EXCL,IQ_EBITDA_EXCL_SBC,IQ_NI_NORM,...,IQ_TOTAL_UNUSUAL_SUPPLE,IQ_PERIODLENGTH_IS,IQ_EBT,IQ_EBITA,IQ_OTHER_NON_OPER_EXP_SUPPL,IQ_EBITA_EXCL_SBC,IQ_DILUT_EPS_NORM,IQ_RESTATEMENT_IS,IQ_TOTAL_REV_SHARE,IQ_EBITA_EQ_INC_EXCL_SBC
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NaT,IQ34280323,,-2.979612,,-1.686575,,,-0.062125,-2.448205,-1.680971,...,,3.0,-2.974936,-2.486833,,-2.486833,-0.033572,,0.076811,-2.486833
NaT,IQ34280323,,-1.260563,,5.813539,,,-0.02396,-1.49475,-1.104248,...,,3.0,-1.686896,-1.533378,,-1.533378,-0.020189,,0.078255,-1.533378
NaT,IQ34280323,,0.974995,,5.397071,,,0.015496,1.413825,0.734434,...,,3.0,1.378918,1.375197,,1.375197,0.013427,,0.123819,1.375197
NaT,IQ34280323,,-0.088424,,5.001677,,,-0.003307,0.006088,-0.134695,...,,3.0,-0.067568,-0.03254,,-0.03254,-0.002462,,0.090852,-0.03254
NaT,IQ34280323,,-1.465225,,-0.345158,,,-0.02949,-1.946971,-1.358212,...,,3.0,-1.936756,-1.978318,-0.000634,-1.978318,-0.024832,,0.106449,-1.978318
NaT,IQ34280323,,-0.52027,,4.892946,,,-0.01,-0.360704,-0.440245,...,,3.0,-0.704392,-0.621659,,-0.621659,-0.008461,,0.125782,-0.621659
NaT,IQ34280323,,3.738703,,4.671531,,,0.07,3.001414,2.780149,...,,3.0,4.448238,2.726325,,2.726325,0.052052,,0.180506,4.471451
NaT,IQ34280323,,0.494014,,4.161289,,,0.007984,0.984304,0.41894,...,,3.0,0.670304,0.693764,,0.693764,0.00677,,0.115617,0.693764


In [261]:
#Get the dates that appear more than once in the index (each one listed once)
# Index.get_duplicates() was removed from pandas; this is its documented replacement.
software_dates = dfs_software_public_income.index
duplic = software_dates[software_dates.duplicated()].unique()
print(duplic)

DatetimeIndex([       'NaT', '2010-12-20', '2017-05-03', '2017-12-02',
               '2018-05-02', '2018-12-03', '2013-07-25', '2014-03-03',
               '2019-09-23', '2012-11-26',
               ...
               '2019-09-26', '2017-02-23', '2019-02-28', '2016-03-03',
               '2019-04-26', '2008-05-30', '2015-03-31', '2017-07-25',
               '2018-03-13', '2012-06-06'],
              dtype='datetime64[ns]', name='Dates', length=260, freq=None)


  


In [241]:
# Build one dataframe from the csv files of the health public quarterly income statements archive.
dfs_health_public_income = transform('health_public_quarterly_income_statements_1.zip', ',')

In [243]:
# Use the dates as the index. Reassigning (instead of inplace=True) and
# checking for the column first lets this cell be re-run without a KeyError.
if "Dates" in dfs_health_public_income.columns:
    dfs_health_public_income = dfs_health_public_income.set_index("Dates")

In [245]:
# Preview the first 8 rows.
dfs_health_public_income.head(8)

Unnamed: 0_level_0,CompanyId,IQ_OTHER_UNUSUAL_SUPPL,IQ_NI_COMPANY,IQ_RESTRUCTURE,IQ_COST_REV,IQ_OTHER_OPER,IQ_RD_EXP_FN,IQ_DILUT_EPS_EXCL,IQ_EBITDA_EXCL_SBC,IQ_NI_NORM,...,IQ_TOTAL_UNUSUAL_SUPPLE,IQ_PERIODLENGTH_IS,IQ_EBT,IQ_EBITA,IQ_OTHER_NON_OPER_EXP_SUPPL,IQ_EBITA_EXCL_SBC,IQ_DILUT_EPS_NORM,IQ_RESTATEMENT_IS,IQ_TOTAL_REV_SHARE,IQ_EBITA_EQ_INC_EXCL_SBC
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-05-18,IQ24657663,,-17.015579,,0.672007,0.038442,,-15.630368,-5.222399,-3.468969,...,,3.0,-11.492489,-5.358695,,-5.358695,-3.186565,,0.096133,-5.358695
2010-08-20,IQ24657663,,-3.466,,1.923,0.007,,-2.864678,-3.12,-2.16625,...,,3.0,-3.466,-3.373,,-3.373,-1.790423,,0.100007,-3.373
2010-11-05,IQ24657663,,-2.874,,1.242,0.005,,-2.043156,-2.579,-1.79625,...,,3.0,-2.874,-2.599,,-2.599,-1.276972,,0.106636,-2.599
2011-04-01,IQ24657663,,-3.05,,1.473,0.01,,-2.199999,-2.75,-1.90625,...,,3.0,-3.05,-3.01,,-3.01,-1.355172,,0.194078,-3.01
2011-04-18,IQ24657663,,-8.168226,,0.225269,-0.022,,-4.048724,-2.896727,-2.018413,...,,3.0,-3.378746,-3.22327,,-3.22327,-1.000103,,0.289518,-3.22327
2011-08-26,IQ24657663,,-2.372,,1.197,0.003,,-1.0,-2.009,-1.48875,...,,3.0,-2.382,-2.336,,-2.336,-0.605432,,0.285483,-2.336
2011-11-10,IQ24657663,,-2.755,,1.611,0.042,,-1.5,-2.427,-1.728125,...,,3.0,-2.765,-2.739,,-2.739,-0.70278,,0.381865,-2.739
2012-03-01,IQ24657663,,-2.863,,2.115,0.007,,-0.7,-2.577,-1.79625,...,,3.0,-2.874,-2.86,,-2.86,-0.436811,,0.273333,-2.86


In [249]:
# Dates that appear more than once in the index, each one listed once.
dfs_health_public_income.index[dfs_health_public_income.index.duplicated()].unique()

DatetimeIndex(['2018-05-15', '2019-08-28', '2019-04-17', '2015-06-08',
               '2015-09-30', '2016-06-23', '2016-09-30', '2017-08-31',
               '2018-05-10', '2018-08-10',
               ...
               '2015-03-04', '2016-03-03', '2017-04-30', '2017-11-24',
               '2010-08-17', '2011-03-02', '2019-02-21', '2015-02-16',
               '2015-05-30', '2016-10-19'],
              dtype='datetime64[ns]', name='Dates', length=1610, freq=None)

In [262]:
# Build one dataframe from the csv files of the software private annual income statements archive.
dfs_software_private_income = transform('software_private_annual_income_statements.zip', ',')


In [263]:
# Use the dates as the index. Reassigning (instead of inplace=True) and
# checking for the column first lets this cell be re-run without a KeyError.
if "Dates" in dfs_software_private_income.columns:
    dfs_software_private_income = dfs_software_private_income.set_index("Dates")

In [264]:
# Preview the first 4 rows.
dfs_software_private_income.head(4)

Unnamed: 0_level_0,CompanyId,IQ_OTHER_UNUSUAL_SUPPL,IQ_NI_COMPANY,IQ_RESTRUCTURE,IQ_COST_REV,IQ_OTHER_OPER,IQ_RD_EXP_FN,IQ_DILUT_EPS_EXCL,IQ_EBITDA_EXCL_SBC,IQ_NI_NORM,...,IQ_TOTAL_UNUSUAL_SUPPLE,IQ_PERIODLENGTH_IS,IQ_EBT,IQ_EBITA,IQ_OTHER_NON_OPER_EXP_SUPPL,IQ_EBITA_EXCL_SBC,IQ_DILUT_EPS_NORM,IQ_RESTATEMENT_IS,IQ_TOTAL_REV_SHARE,IQ_EBITA_EQ_INC_EXCL_SBC
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NaT,IQ145536370,,,,,2.253,,,,,...,,,-0.261,,0.001,,,,,
NaT,IQ145536370,,,,,2.126,,,,,...,,,1.007,,,,,,,
NaT,IQ145536370,,,,,4.054,,,,,...,,,1.167,,-0.001,,,,,
NaT,IQ145536370,,,,,5.63,,,,,...,,,0.521,,,,,,,
