# 1. Business Understanding
- Help in forecasting accounting transactions
- Target column: Amount
- Most likely, I'll use regression because of the continuous nature of the target data
- Data goes back for 3 years
- Data quality is uncertain

# 2. Data Exploration

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import warnings

# Silence every warning so library notices do not clutter the cell output
warnings.filterwarnings(action="ignore")

# Notebook-wide configuration: fixed NumPy seed and no cap on displayed columns
np.random.seed(seed=1234)
pd.options.display.max_columns = None
# pd.set_option('display.max_rows', None)

In [None]:
# Location of the source CSV (directory and file name are kept separate)
file_path = 'C:/Users/User/Downloads/Python/'
file_name = 'regression.csv'

# Parse the comma-separated file: the first line is the header and the
# text is decoded with the '1253' codec
csv_location = file_path + file_name
df = pd.read_csv(csv_location, sep=',', header=0, encoding='1253')

In [None]:
# Show first 5 rows of data (head() returns 5 rows when no count is given)
df.head()

In [None]:
# Show last 5 rows of data (tail() returns 5 rows when no count is given)
df.tail()

In [None]:
# Print a concise summary of the dataframe: columns, non-null counts and dtypes
df.info()

In [None]:
# Show summary statistics; include='all' covers non-numeric columns as well
df.describe(include = 'all')

In [None]:
# Check the data type pandas assigned to each column when loading the file
df.dtypes

In [None]:
# Discard rows first, then columns, in which every value is missing
df = df.dropna(axis='index', how='all')
df = df.dropna(axis='columns', how='all')

In [None]:
# Dimensions (rows, columns) after removing all-empty rows and columns
df.shape

In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats
df = df.drop_duplicates(keep='first')

In [None]:
# Dimensions (rows, columns) after removing duplicate rows
df.shape

In [None]:
# Trim surrounding whitespace in every 'object' (string) column
text_columns = df.select_dtypes(include=['object']).columns
for text_column in text_columns:
    trimmed = df[text_column].str.strip()
    df[text_column] = trimmed

In [None]:
# Strip the stray characters <, % and ? wherever they occur in the data
unwanted_chars = r'[<%?]'
df = df.replace(to_replace=unwanted_chars, value='', regex=True)

In [None]:
# For every column, print its name, how many distinct values it has, and the values
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name, len(distinct_values), distinct_values)

In [None]:
# Percentage of missing values per column
missing_counts = df.isnull().sum()
total_rows = len(df)
percent_missing = missing_counts * 100 / total_rows
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df

In [None]:
# Distribution of Amount for each Account Type
plt.figure(figsize=(20, 6))
account_type_ax = sns.boxplot(data=df, x='Account Type', y='Amount')
account_type_ax.set_title('Account Type BoxPlot')
plt.show()

In [None]:
# Drill into the Liability Account Type, split by individual Account
# (the original analysis singled it out as having the biggest dispersion)
liability_rows = df[df['Account Type'] == 'Liability']
plt.figure(figsize=(20, 6))
liability_ax = sns.boxplot(data=liability_rows, x='Account', y='Amount')
liability_ax.set_title('Liability BoxPlot')
plt.show()

In [None]:
# Same drill-down for Revenue, split by Account Description
revenue_rows = df[df['Account Type'] == 'Revenue']
plt.figure(figsize=(20, 6))
revenue_ax = sns.boxplot(data=revenue_rows, x='Account Description', y='Amount')
revenue_ax.set_title('Revenue BoxPlot')
plt.show()

## Review Amount trends

In [None]:
# Trend plots need a real datetime column, so build one from Year and Month.
# Month abbreviations map to their calendar number (Jan=1 ... Dec=12).
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthmap = {name: number for number, name in enumerate(month_names, start=1)}

# Direct dictionary indexing: a Month value missing from the map raises KeyError
df['Period'] = df['Month'].apply(lambda month: monthmap[month])
# Every record is pinned to the first day of its month
df['Day'] = 1

# Assemble 'Year-Period-Day' text and let pandas parse it into datetimes
year_text = df['Year'].astype(str)
period_text = df['Period'].astype(str)
day_text = df['Day'].astype(str)
df['Date'] = pd.to_datetime(year_text + '-' + period_text + '-' + day_text)

In [None]:
# Re-check column data types after adding the Period, Day and Date columns
df.dtypes

In [None]:
# Revenue over time, one line per Account Description.
# estimator=None plots every observation instead of aggregating per date.
revenue_trend_rows = df[df['Account Type'] == 'Revenue']
plt.figure(figsize=(20, 6))
revenue_trend_ax = sns.lineplot(
    data=revenue_trend_rows,
    x='Date',
    y='Amount',
    hue='Account Description',
    estimator=None,
)
revenue_trend_ax.set_title('Seasonal Revenue')
plt.show()

In [None]:
# The all-Revenue plot is too cluttered, so narrow it down to Product Sales only
product_sales_rows = df[df['Account Description'] == 'Product Sales']
plt.figure(figsize=(20, 6))
product_sales_ax = sns.lineplot(
    data=product_sales_rows,
    x='Date',
    y='Amount',
    hue='Account Description',
    estimator=None,
)
product_sales_ax.set_title('Seasonal Revenue')
plt.show()

In [None]:
# Same view for another source of Revenue: Service Revenue
service_revenue_rows = df[df['Account Description'] == 'Service Revenue']
plt.figure(figsize=(20, 6))
service_revenue_ax = sns.lineplot(
    data=service_revenue_rows,
    x='Date',
    y='Amount',
    hue='Account Description',
    estimator=None,
)
service_revenue_ax.set_title('Seasonal Revenue')
plt.show()

## Correlation Analysis

In [None]:
# Explore the relationship between different accounts (unique values of the Account column).
# To do that, reshape the data so each Account gets its own column: every row contributes
# its Amount under its own account number, and all other account columns are filled with 0.

# One entry per row: {row index: {account number: amount}}.
# Only Account and Amount are needed, so they are read straight from those two columns
# instead of joining one-hot columns (never used) and walking the frame with iterrows().
corrdict = {
    key: {int(account): amount}
    for key, account, amount in zip(df.index, df['Account'], df['Amount'])
}

# Transform the dictionary to a dataframe (rows = original rows, columns = accounts)
corrdf = pd.DataFrame.from_dict(corrdict).T.fillna(0)

# Calculate the correlations once and reuse the result for the plot
account_corr = corrdf.corr()

# Plot a heatmap
plt.figure(figsize=(20,6))
sns.heatmap(account_corr, annot = True)
plt.show()

In [None]:
# Sanity check on highly correlated Accounts: list every row of account 3000000
is_checked_account = df['Account'] == 3000000
df[is_checked_account]

# 3. Data Preparation

In [None]:
# Visualize each account separately: one figure per account, median Amount per date
for account in df['Account'].unique():
    account_rows = df[df['Account'] == account]
    plt.figure(figsize=(20, 6))
    account_ax = sns.lineplot(
        data=account_rows,
        x='Date',
        y='Amount',
        hue='Account Description',
        estimator=np.median,
    )
    account_ax.set_title('{} by Month'.format(account))
    plt.show()

In [None]:
# Filter out Inventory rows (account 3000001).
# .copy() makes the result an independent DataFrame, so the column assignments
# made later in the notebook modify this frame directly rather than a slice of
# the unfiltered data (the situation SettingWithCopyWarning warns about).
df = df[df['Account'] != 3000001].copy()

In [None]:
# List the distinct accounts still present in the data
remaining_accounts = df['Account'].unique()
remaining_accounts

## Convert fields to correct data type

In [None]:
# Inspect the current column data types before converting any of them
df.dtypes

In [None]:
# Treat Year and Account as text ('object') rather than numbers;
# Account additionally gets an 'ACC' prefix
year_text = df['Year'].astype(str)
account_text = df['Account'].astype(str)
df['Year'] = year_text
df['Account'] = 'ACC' + account_text

In [None]:
# Preview the first rows after converting Year and Account to text
df.head()

In [None]:
# Check the column data types after the Year/Account conversion
df.dtypes

In [None]:
# Period, Day and Date only restate what Year and Month already hold, so remove them
redundant_date_columns = ['Period', 'Day', 'Date']
df = df.drop(columns=redundant_date_columns)

In [None]:
# Check the remaining columns and their data types after dropping Period, Day and Date
df.dtypes

In [None]:
# Number of distinct Account values
distinct_accounts = df['Account'].unique()
len(distinct_accounts)

In [None]:
# Number of distinct Account Description values
distinct_descriptions = df['Account Description'].unique()
len(distinct_descriptions)

In [None]:
# Concatenate each row's Account code with its Account Description into AccountVal
account_code = df['Account']
account_label = df['Account Description']
df['AccountVal'] = account_code + account_label

In [None]:
# Preview the first rows, including the new AccountVal column
df.head()

In [None]:
# AccountVal and Account Description add nothing beyond Account, so remove both
redundant_account_columns = ['Account Description', 'AccountVal']
df = df.drop(columns=redundant_account_columns)

In [None]:
# Preview the first rows after dropping Account Description and AccountVal
df.head()

In [None]:
# One-hot encode the dataframe with pandas' default column selection,
# so the data is ready for the ML models
df_model = pd.get_dummies(data=df)

In [None]:
# Preview the first rows of the one-hot encoded dataframe
df_model.head()

In [None]:
# Check the data types of the encoded columns
df_model.dtypes

# 4. Modeling

In [None]:
# Split the encoded data into the dependent variable (y = Amount)
# and the independent variables (X = every other column)
target_column = 'Amount'
y = df_model[target_column]
X = df_model.drop(columns=[target_column])

In [None]:
# List the feature columns that will be fed to the models
X.columns

In [None]:
# Display the target series (Amount)
y

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=1234)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Print the shape of each partition on one line: X_train, X_test, y_train, y_test
partitions = (X_train, X_test, y_train, y_test)
print(*(partition.shape for partition in partitions))

## Use different Scikit regressors

In [None]:
# Each regressor is created with the same random_state, fitted on the training
# partition, and then used to predict the test partition.

# RandomForestRegressor
rf = RandomForestRegressor(random_state=1234)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# GradientBoostingRegressor
gb = GradientBoostingRegressor(random_state=1234)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)

# Ridge
rdg = Ridge(random_state=1234)
rdg.fit(X_train, y_train)
rdg__pred = rdg.predict(X_test)

# Lasso
ls = Lasso(random_state=1234)
ls.fit(X_train, y_train)
ls__pred = ls.predict(X_test)

# ElasticNet
enet = ElasticNet(random_state=1234)
enet.fit(X_train, y_train)
enet__pred = enet.predict(X_test)

# 5. Evaluation

In [None]:
# Generate our evaluation metrics to find the best model.
# A regressor's .score() returns R^2 (the coefficient of determination). It is measured
# on the held-out test partition only: scoring on the full X, y would include the rows
# the models were trained on and overstate how well they generalize.
fitted_models = {'rf': rf, 'gb': gb, 'rdg': rdg, 'ls': ls, 'enet': enet}
for model_name, fitted_model in fitted_models.items():
    print(model_name + " Accuracy: " + str(fitted_model.score(X_test, y_test)))

In [None]:
# 'rf' was chosen as the best model: use it to predict Amount for the whole dataset
best_model = rf
rf_pred_new = best_model.predict(X)

In [None]:
# Attach the predictions to the original dataset as a new Amount_Pred column
df = df.assign(Amount_Pred=rf_pred_new)

In [None]:
# Display the dataset together with the new Amount_Pred column
df

# The End