# Machine Learning (revenue_prediction)

In [2]:
# Packages / libraries
import os
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error
from math import sqrt
import json
import datetime as dt

In [4]:
# Print NumPy floats in fixed-point notation rather than scientific notation
fixed_point = '{:f}'.format
np.set_printoptions(formatter={'float_kind': fixed_point})

# Default every seaborn/matplotlib figure to 5 x 5 inches
figure_defaults = {'figure.figsize': (5, 5)}
sns.set(rc=figure_defaults)

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None
# (uncomment to lift the row limit as well)
#pd.set_option('display.max_rows', None)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning.
import warnings
warnings.simplefilter("ignore")

## Loading the Raw Data

In [5]:
# Load the raw company records from the JSON export.
# The location can be overridden with the ACCOUNTON_DATA_PATH environment variable so the
# notebook also runs on machines that lack the original Windows download folder; when the
# variable is not set, the original path is used unchanged.
data_path = os.environ.get(
    "ACCOUNTON_DATA_PATH",
    r"C:\Users\user\Downloads\accounton_data.json\accounton_data.json",
)

# JSON is UTF-8 by specification; being explicit keeps the platform default encoding
# (e.g. cp1252 on Windows) from mangling non-ASCII characters.
with open(data_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

In [6]:
# Flatten the JSON records into a table; nested keys become dot-separated
# column names such as "ebit.2019"
data = pd.json_normalize(data=raw_data)


# Data Preprocessing

In [7]:
# Parse the creation dates and reduce them to a yearly Period.
# Note: this cell only converts the column; no companies are removed here.
creation_timestamps = pd.to_datetime(data['creation_date'])
data['creation_date'] = creation_timestamps.dt.to_period('Y')

## Checking for duplicated values

In [8]:
# Boolean mask flagging every row that exactly repeats an earlier row
# (the mask is only computed here; nothing is dropped)
bool_series = data.duplicated(keep='first')

##  Checking for NULL Values

In [9]:
# Count the missing values in each column
data.isna().sum()

company_name            0
nace_code               0
vat_number              0
zipcode                 0
city                    0
                    ...  
staff_costs.2019    31132
staff_costs.2018    31569
staff_costs.2016    32485
staff_costs.2017    33830
staff_costs.2015    23800
Length: 75, dtype: int64

In [62]:
# Keep only the companies that report a 2019 EBIT value (rows lacking it are discarded).
# .copy() makes df1 an independent frame, so the column assignments in the following
# cells modify df1 itself rather than a slice of `data` (no SettingWithCopyWarning).
has_ebit_2019 = data['ebit.2019'].notna()
df1 = data.loc[has_ebit_2019].copy()

In [63]:
# Keep the first two characters of nace_code in a new column, Nace_code.
# assign() returns a new frame instead of writing into df1 in place, which avoids
# the SettingWithCopyWarning raised when df1 is a filtered slice of `data`.
df1 = df1.assign(Nace_code=df1['nace_code'].str[:2])

In [64]:
# Remove the descriptive columns that are not used further on
unused_columns = ['company_name', 'zipcode', 'city', 'creation_date', 'legal_form']
df1 = df1.drop(unused_columns, axis='columns')

In [65]:
# Category / province labels and the Nace_code value of every row in df1
company_cat = ['Large', 'Medium sized', 'Small', 'Very large']
province = ['Antwerp', 'East-Flanders', 'Limburg', 'Vlaams Brabant', 'West-Flanders']
nace_code_list = df1['Nace_code'].tolist()

# Column names in df1 follow the pattern "<feature>.<year>"
years = ['2015', '2016', '2017', '2018', '2019', '2020']
Features = [
    'ebit',
    'ebitda',
    'profit_and_loss_after_taxes',
    'total_assets',
    'total_liabilities',
    'operating_profit_and_loss',
    'financial_profit_and_loss',
    'staff_count',
    'net_added_value',
    'staff_costs',
    'revenue',
]
    

In [66]:
# Fill the NaN values of every "<feature>.<year>" column with the median of the companies
# that share the same company category, province and Nace_code.
#
# The previous loop only ever entered the ('Large', 'Antwerp') branch and then called
# fillna() on the whole column, so every missing value in the data set received the median
# of the first Large/Antwerp Nace_code group instead of the median of its own group.
group_keys = ['company_category', 'province', 'Nace_code']
value_columns = [f'{feature}.{year}' for feature in Features for year in years]

# Work on an independent frame so the assignments below never write into a slice of `data`.
df1 = df1.copy()

# Try the most specific grouping first, then progressively coarser ones for groups that
# have no observed value at all for a given column.
for keys in (group_keys, group_keys[:2], group_keys[:1]):
    # transform() returns the group median aligned row-by-row with df1
    group_medians = df1.groupby(keys)[value_columns].transform('median')
    df1[value_columns] = df1[value_columns].fillna(group_medians)

# Last resort: the column-wide median, so that no NaN is left for the later cells.
df1[value_columns] = df1[value_columns].fillna(df1[value_columns].median())

In [67]:
# Sanity check: number of missing 2019 revenue values left after the fill step
df1['revenue.2019'].isna().sum()

0

# Reshape the data frame
### In order to create a new dataframe that shows each company's data per year

In [68]:
# Reshape the wide frame (one "<feature>.<year>" column per feature and year) into a long
# frame with one row per company and year.
#
# Each year gets its own fresh frame and all of them are concatenated once at the end.
# Concatenating inside the loop re-copied the accumulated rows on every iteration and
# started from an empty DataFrame, which newer pandas versions warn about.

# Years used as the "current" year; next_year_revenue reads the following year's revenue
years = ['2015', '2016', '2017', '2018', '2019']
Features = ['ebit', 'ebitda', 'profit_and_loss_after_taxes', 'total_assets', 'total_liabilities',
    'operating_profit_and_loss', 'financial_profit_and_loss',
    'staff_count', 'net_added_value', 'staff_costs']

# Identifier columns copied unchanged into every yearly slice
id_columns = ['vat_number', 'company_category', 'province', 'nace_code', 'Nace_code']

yearly_frames = []
for year in years:
    # reshape_df holds the slice for a single year (same column order as before)
    reshape_df = df1[id_columns].copy()
    reshape_df['Year'] = year

    for feature in Features:
        reshape_df[feature] = df1[f"{feature}.{year}"]
    reshape_df["current_revenue"] = df1[f"revenue.{year}"]
    reshape_df["next_year_revenue"] = df1[f"revenue.{int(year) + 1}"]
    yearly_frames.append(reshape_df)

# The final long-format frame: the yearly slices stacked in chronological order
full_df = pd.concat(yearly_frames, axis=0)


In [None]:
# Missing values per column of the reshaped frame
full_df.isna().sum()

In [None]:
# Order the rows by company first and by year second, then display the result
sort_keys = ['vat_number', 'Year']
full_df = full_df.sort_values(by=sort_keys)
full_df

In [72]:
# Keep the two categorical columns plus the numeric columns used from here on
model_columns = [
    'company_category',
    'province',
    'ebit',
    'total_liabilities',
    'net_added_value',
    'staff_costs',
    'current_revenue',
    'next_year_revenue',
]
full_df = full_df.loc[:, model_columns]


In [None]:
# Display the frame after the column selection
full_df

In [74]:
# One-hot encode company_category and append the indicator columns to full_df.
# The two frames share the same index, so they are placed side by side with concat.
category_values = full_df['company_category']
df_category = pd.get_dummies(category_values)
df_new = pd.concat(objs=[full_df, df_category], axis='columns')


In [None]:
# One-hot encode province and append the indicator columns as well
province_values = full_df['province']
df_province = pd.get_dummies(province_values)
df_new = pd.concat(objs=[df_new, df_province], axis='columns')
df_new

## Selecting the important features

In [None]:
# Pairwise correlation of the numeric columns (null values are excluded pair by pair).
# full_df still contains the text columns company_category and province; DataFrame.corr()
# raises on non-numeric columns from pandas 2.0 onwards, so they are filtered out first.
# On older pandas versions the result is the same as before.
numeric_columns = full_df.select_dtypes(include='number')
corr = numeric_columns.corr()
corr

In [None]:
# Final training set: drop the original text columns, which are now represented
# by their one-hot indicator columns
encoded_sources = ['company_category', 'province']
training_df = df_new.drop(encoded_sources, axis='columns')


In [78]:
# Missing values per column of the training set
training_df.isna().sum()

ebit                 0
total_liabilities    0
net_added_value      0
staff_costs          0
current_revenue      0
next_year_revenue    0
Large                0
Medium sized         0
Small                0
Very large           0
Antwerp              0
East-Flanders        0
Limburg              0
Vlaams Brabant       0
West-Flanders        0
dtype: int64