#### Revenue Prediction Project

In [None]:
# import necessary libraries
import numpy as np 
import pandas as pd
import json
import datetime as dt
import matplotlib.pyplot as plt
import warnings

In [None]:
# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# view all the dataframe
pd.set_option('display.max_columns', None)

# remove warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the raw JSON export from disk and parse it into Python objects
with open(r"accounton_data.json", 'r') as json_file:
    raw_data = json.load(json_file)

In [None]:
# Expand nested JSON objects into flat, dot-separated columns (e.g. "ebit.2019")
data = pd.json_normalize(data=raw_data)

In [None]:
# Parse creation_date and reduce it to a yearly Period
# (the column ends up with a period dtype, not a datetime dtype)
creation_timestamps = pd.to_datetime(data['creation_date'])
data['creation_date'] = creation_timestamps.dt.to_period('Y')

In [None]:
# Keep only the rows that have an EBIT value for 2019.
# .copy() makes df1 an independent frame: a column is added to df1 in a later
# cell, and writing into a filtered slice of `data` triggers pandas'
# chained-assignment (SettingWithCopy) ambiguity.
df1 = data[data['ebit.2019'].notna()].copy()

In [None]:
# Derive a coarse code from the first two characters of nace_code
df1['Nace_code'] = df1['nace_code'].str[:2]

#### Reshape the dataset

In [None]:
# Reshape the wide table (one column per feature and year) into a long table
# with one row per company and year.

# Years used as the "current" year, and the per-year financial features to carry over
years = ['2015', '2016', '2017', '2018', '2019']
Features = ['ebit', 'ebitda', 'profit_and_loss_after_taxes', 'total_assets', 'total_liabilities',
            'operating_profit_and_loss', 'financial_profit_and_loss',
            'staff_count', 'net_added_value', 'staff_costs']

# Columns copied unchanged into every yearly slice
id_columns = ['vat_number', 'company_category', 'province', 'nace_code', 'Nace_code']

yearly_frames = []
for y in years:
    # Start from a fresh frame each iteration so nothing can leak over from the previous year
    reshape_df = df1[id_columns].copy()
    reshape_df['Year'] = y
    reshape_df['current_revenue'] = df1[f"revenue.{y}"]

    for feature in Features:
        reshape_df[feature] = df1[f"{feature}.{y}"]

    # Prediction target: the revenue column of the following year
    reshape_df['next_year_revenue'] = df1[f"revenue.{int(y) + 1}"]

    yearly_frames.append(reshape_df)

# Stack all yearly slices in one concat instead of growing the frame inside the loop.
# The index is not reset, so the row labels of df1 repeat once per year.
full_df = pd.concat(yearly_frames, axis=0)

In [None]:
# Keep only rows that have both a current revenue and a next-year revenue value
full_df_after_drop = full_df.dropna(subset=['current_revenue', 'next_year_revenue'])

In [None]:
# Correlation matrix between the numeric columns.
# Non-numeric columns are excluded explicitly: recent pandas versions raise on
# them in DataFrame.corr() instead of dropping them silently.
corr_matrix = full_df_after_drop.select_dtypes(include='number').corr()
corr_matrix

#### Data Visualization

In [None]:
# ebit and ebitda against next-year revenue; the legend tells the two overlaid series apart
plt.figure(figsize=(10,8))
plt.scatter(x=full_df_after_drop['ebit'], y=full_df_after_drop['next_year_revenue'], label='ebit')
plt.scatter(full_df_after_drop['ebitda'], full_df_after_drop['next_year_revenue'], color='red', label='ebitda')
plt.xlabel('ebit / ebitda')
plt.ylabel('next_year_revenue')
plt.legend()
plt.show()

In [None]:
# profit_and_loss_after_taxes and operating_profit_and_loss against next-year revenue
plt.figure(figsize=(10,8))
plt.scatter(full_df_after_drop['profit_and_loss_after_taxes'], full_df_after_drop['next_year_revenue'],
            label='profit_and_loss_after_taxes')
plt.scatter(full_df_after_drop['operating_profit_and_loss'], full_df_after_drop['next_year_revenue'],
            color='green', label='operating_profit_and_loss')
plt.xlabel('profit_and_loss_after_taxes / operating_profit_and_loss')
plt.ylabel('next_year_revenue')
plt.legend()
plt.show()

In [None]:
# staff_count and staff_costs against next-year revenue (both series share the x axis)
plt.figure(figsize=(10,8))
plt.scatter(full_df_after_drop['staff_count'], full_df_after_drop['next_year_revenue'], label='staff_count')
plt.scatter(full_df_after_drop['staff_costs'], full_df_after_drop['next_year_revenue'],
            color='green', label='staff_costs')
plt.xlabel('staff_count / staff_costs')
plt.ylabel('next_year_revenue')
plt.legend()
plt.show()

In [None]:
# Correlation between staff_count and staff_costs, printed and shown as a scatter plot
plt.figure(figsize=(10,10))
print('corr', full_df_after_drop['staff_count'].corr(full_df_after_drop['staff_costs']))
plt.scatter(full_df_after_drop['staff_count'], full_df_after_drop['staff_costs'])
plt.xlabel('staff_count')
plt.ylabel('staff_costs')
plt.show()

In [None]:
# financial_profit_and_loss against next-year revenue
plt.figure(figsize=(10,10))
plt.scatter(full_df_after_drop['financial_profit_and_loss'], full_df_after_drop['next_year_revenue'])
plt.xlabel('financial_profit_and_loss')
plt.ylabel('next_year_revenue')
plt.show()

In [None]:
# total_assets and total_liabilities against next-year revenue, plus their mutual correlation
plt.figure(figsize=(10,10))
print('corr', full_df_after_drop['total_liabilities'].corr(full_df_after_drop['total_assets']))
plt.scatter(full_df_after_drop['total_assets'], full_df_after_drop['next_year_revenue'], label='total_assets')
plt.scatter(full_df_after_drop['total_liabilities'], full_df_after_drop['next_year_revenue'],
            color='green', label='total_liabilities')
plt.xlabel('total_assets / total_liabilities')
plt.ylabel('next_year_revenue')
plt.legend()
plt.show()

In [None]:
# net_added_value against next-year revenue
plt.figure(figsize=(10,10))
plt.scatter(full_df_after_drop['net_added_value'], full_df_after_drop['next_year_revenue'])
plt.xlabel('net_added_value')
plt.ylabel('next_year_revenue')
plt.show()

In [None]:
# current_revenue against next-year revenue
plt.figure(figsize=(10,8))
plt.scatter(full_df_after_drop['current_revenue'], full_df_after_drop['next_year_revenue'])
plt.xlabel('current_revenue')
plt.ylabel('next_year_revenue')
plt.show()

Based on the correlation matrix and the plots above, we select `ebit`, `total_liabilities`, `net_added_value`, `staff_costs` and `current_revenue` as the numerical features, and `company_category` and `province` as the categorical features.

In [None]:
# Numerical features whose missing values get filled in below
# (this rebinds the longer Features list used while reshaping)
Features = [
    'ebit',
    'total_liabilities',
    'net_added_value',
    'staff_costs',
]

In [None]:
# Distinct values of the category, province and year columns
company_cat, province, years = (
    full_df_after_drop[column].unique()
    for column in ('company_category', 'province', 'Year')
)
# Plain list of every row's 2-character NACE code (duplicates are kept)
nace_code_list = [code for code in full_df_after_drop['Nace_code']]

In [None]:
# Fill the missing values of each selected feature with the median of the row's
# own (company_category, province, Nace_code) group.
#
# A column-wide fillna(median) inside a loop over groups would fill every
# missing value with the first non-NaN group median it meets, so the median is
# looked up per row instead.
group_keys = ['company_category', 'province', 'Nace_code']

# numeric_only=True restricts the medians to numeric columns on every pandas version
median_df = full_df_after_drop.groupby(group_keys).median(numeric_only=True)

# Median of the group each row belongs to; rows with a missing group key get NaN here
row_medians = full_df_after_drop[group_keys].join(median_df[Features], on=group_keys)[Features]

feature_values = full_df_after_drop[Features]
# Values are passed positionally (.to_numpy()) because the row labels are not
# unique (yearly slices were stacked without resetting the index), so
# label-based alignment is deliberately avoided.
filled = feature_values.where(feature_values.notna(), row_medians.to_numpy())
# Fallback: anything still missing (group without any observed value, or no
# group at all) gets the overall column median, so no NaN is left behind.
filled = filled.fillna(feature_values.median())

# Rebind to a new frame rather than filling in place through a column selection
full_df_after_drop = full_df_after_drop.assign(
    **{feature: filled[feature].to_numpy() for feature in Features}
)

In [None]:
# Where staff_count is missing and staff_costs is recorded as 0, replace
# staff_costs with the median staff_costs of the row's own
# (company_category, province, Nace_code) group.
group_keys = ['company_category', 'province', 'Nace_code']

# numeric_only=True restricts the medians to numeric columns on every pandas version
median_df = full_df_after_drop.groupby(group_keys).median(numeric_only=True)

# Per-row group median, looked up with a join so that category/province
# combinations absent from the data cannot raise a KeyError
row_median = (
    full_df_after_drop[group_keys]
    .join(median_df[['staff_costs']], on=group_keys)['staff_costs']
    .to_numpy()
)

needs_median = (
    full_df_after_drop['staff_count'].isna()
    & (full_df_after_drop['staff_costs'] == 0.0)
).to_numpy()
# Rows without a usable group median (e.g. a missing group key) are left untouched
needs_median = needs_median & ~pd.isna(row_median)

full_df_after_drop.loc[needs_median, 'staff_costs'] = row_median[needs_median]



In [None]:
# Keep only the modelling columns: two categorical, five numerical inputs and the target
selected_columns = [
    'company_category', 'province',
    'ebit', 'total_liabilities', 'net_added_value', 'staff_costs', 'current_revenue',
    'next_year_revenue',
]
full_df_after_drop = full_df_after_drop[selected_columns]

In [None]:
# One-hot encode the two categorical columns (company_category and province)
df_category, df_province = (
    pd.get_dummies(full_df_after_drop[column])
    for column in ('company_category', 'province')
)
# Append the company_category indicator columns to the selected features
df_cat = pd.concat([full_df_after_drop, df_category], axis=1)

In [None]:
# Append the province indicator columns to obtain the complete training frame
df_training = pd.concat(objs=[df_cat, df_province], axis='columns')

In [None]:
# The original categorical columns are redundant once their indicator columns exist
df_training = df_training.drop(['company_category', 'province'], axis=1)

In [None]:
# Write the cleaned training frame to disk as CSV (the row index is written as well)
df_training.to_csv(path_or_buf='clean_accounton.csv')