# Personal Project main notebook

In [1]:
# Ignoring warning messages from python
# NOTE(review): no category or module filter is passed, so this silences every
# warning (including pandas deprecation warnings) for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# General use imports
import pandas as pd
import numpy as np

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# Modules and data
# acquire / prep are not standard-library or PyPI names used here; presumably they are
# project-local helper modules next to this notebook (verify). The cells below call
# acquire.get_local_transactions, prep.multi_frequency and prep.handle_missing_values.
import requests
import acquire
import prep
from vega_datasets import data
from datetime import datetime

# Acquire

In [2]:
# Loading the transactions through the acquire helper (per the original note it reads
# a local csv file), then previewing the first 5 rows
transactions = acquire.get_local_transactions()
transactions.head(n=5)

Unnamed: 0,Date,Description,Original Description,Amount,Transaction Type,Category,Account Name,Labels,Notes
0,6/29/2022,DEPOSIT@MOBILE,DEPOSIT@MOBILE,206.25,credit,Transfer,Checkarama,,
1,6/28/2022,"MATTHEWS VET SERVICES, P","MATTHEWS VET SERVICES, P",13.2,debit,Veterinary,CREDIT CARD,,
2,6/28/2022,STARBUCKS STORE 24306,STARBUCKS STORE 24306,5.57,debit,Coffee Shops,CREDIT CARD,,
3,6/28/2022,PAYPAL *HULU,PAYPAL *HULU,14.16,debit,Television,CREDIT CARD,,
4,6/28/2022,H-E-B #618,H-E-B #618,20.56,debit,Groceries,CREDIT CARD,,


# CLEAN & PREPARE

>## High-level exploration before cleaning and preparation

In [3]:
# Checking the size of the raw frame as a (rows, columns) tuple

transactions.shape

(7872, 9)

In [4]:
# Putting Description next to Original Description to decide which of the two to drop
transactions.loc[:, ['Description', 'Original Description']]

Unnamed: 0,Description,Original Description
0,DEPOSIT@MOBILE,DEPOSIT@MOBILE
1,"MATTHEWS VET SERVICES, P","MATTHEWS VET SERVICES, P"
2,STARBUCKS STORE 24306,STARBUCKS STORE 24306
3,PAYPAL *HULU,PAYPAL *HULU
4,H-E-B #618,H-E-B #618
...,...,...
7867,Ultra Foods Qps,ULTRA FOODS #8761 QPS LOMBARD
7868,Pennys Noodle Shop,PENNY'S NOODLE SHOPQPS CHICAGO
7869,Amazon Music,MUSIC DWNLDS 866-216
7870,Calling Card,CALLINGCARDS/CONFERENC (866)29


In [5]:
# Counting the missing values in each column of the raw frame
transactions.isna().sum()

Date                       0
Description                0
Original Description       0
Amount                     0
Transaction Type           0
Category                   3
Account Name               0
Labels                  7870
Notes                   7862
dtype: int64

In [6]:
# Displaying the specific rows whose Category value is missing
transactions.loc[transactions['Category'].isna()]

Unnamed: 0,Date,Description,Original Description,Amount,Transaction Type,Category,Account Name,Labels,Notes
5996,3/02/2015,Payment,Payment,10.0,credit,,Stafford Loans U.S. DEPARTMENT OF EDUCATION,,
5997,3/02/2015,Pending,Pending,10.0,credit,,Stafford Loans U.S. DEPARTMENT OF EDUCATION,,
5998,2/28/2015,Payment,Payment,40.0,credit,,Stafford Loans U.S. DEPARTMENT OF EDUCATION,,


>## Takeaways
    - The df contains duplicates
    - The Labels and Notes columns are almost entirely null (7,870 and 7,862 of 7,872 rows)
    - The Category column has only 3 nulls
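    - Description and Original Description largely duplicate each other (in the rows where they differ, Description is a shortened form of Original Description)
    - The description columns contain special characters (e.g. `@`, `*`, `#`)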
    - There is a date column
>## Actions
    - Remove duplicates
    - Drop Labels, Notes, and Original Description
    - Remove special characters from Description
    - Rename columns for readability
    - Convert the contents of Account Name and Description to lower case
    - Set Date as the index

>## Cleaning

In [7]:
# Printing the column names as one list on a single line
# (print is used on purpose: a bare expression would be pretty-printed across lines)
a = transactions.columns.to_list()
print(a)

['Date', 'Description', 'Original Description', 'Amount', 'Transaction Type', 'Category', 'Account Name', 'Labels', 'Notes']


In [8]:
# Assigning the list of columns to a variable.
# The list is taken from the frame itself instead of a literal pasted from the
# previous cell's output, so it cannot drift if the source file's schema changes.
columns = transactions.columns.tolist()

In [9]:
# Displaying missing values per column as a row count and a percentage of rows
# (the helper's output columns are num_rows_missing and pct_rows_missing; it does not report duplicates)

prep.multi_frequency(transactions, columns)

Unnamed: 0,num_rows_missing,pct_rows_missing
Date,0,0.0
Description,0,0.0
Original Description,0,0.0
Amount,0,0.0
Transaction Type,0,0.0
Category,3,0.03811
Account Name,0,0.0
Labels,7870,99.974593
Notes,7862,99.872967


In [10]:
# Removing fully duplicated rows (the first occurrence of each is kept) and storing
# the result under a new name so the raw frame stays untouched
transactions1 = transactions.drop_duplicates(keep='first')

In [11]:
# Checking the number of rows and columns left after dropping duplicates
transactions1.shape

(7797, 9)

In [12]:
# Removing the columns that carry no usable information: Labels and Notes are almost
# entirely null, and Original Description overlaps with Description
transactions2 = transactions1.drop(columns=['Labels', 'Notes', 'Original Description'])

In [13]:
# Previewing the first rows to confirm Labels, Notes and Original Description are gone
transactions2.head()

Unnamed: 0,Date,Description,Amount,Transaction Type,Category,Account Name
0,6/29/2022,DEPOSIT@MOBILE,206.25,credit,Transfer,Checkarama
1,6/28/2022,"MATTHEWS VET SERVICES, P",13.2,debit,Veterinary,CREDIT CARD
2,6/28/2022,STARBUCKS STORE 24306,5.57,debit,Coffee Shops,CREDIT CARD
3,6/28/2022,PAYPAL *HULU,14.16,debit,Television,CREDIT CARD
4,6/28/2022,H-E-B #618,20.56,debit,Groceries,CREDIT CARD


In [14]:
# Displaying the rows that still have a missing Category after the column drop
transactions2.loc[transactions2['Category'].isna()]

Unnamed: 0,Date,Description,Amount,Transaction Type,Category,Account Name
5996,3/02/2015,Payment,10.0,credit,,Stafford Loans U.S. DEPARTMENT OF EDUCATION
5997,3/02/2015,Pending,10.0,credit,,Stafford Loans U.S. DEPARTMENT OF EDUCATION
5998,2/28/2015,Payment,40.0,credit,,Stafford Loans U.S. DEPARTMENT OF EDUCATION


In [15]:
# Filling the nulls in the Category column through the prep helper
# NOTE(review): per the original note the helper fills them with the category
# "Education"; its implementation is not shown in this notebook, so confirm in prep.

transactions3 = prep.handle_missing_values(transactions2)

In [17]:
# Printing the remaining column names as one list on a single line
a = transactions3.columns.to_list()
print(a)

['Date', 'Description', 'Amount', 'Transaction Type', 'Category', 'Account Name']


In [20]:
# Assigning columns to a variable.
# The list is taken from the cleaned frame itself instead of a literal pasted from
# the previous cell's output, so it stays correct if an earlier cleaning step changes.
columns3 = transactions3.columns.tolist()

In [21]:
# Re-checking missing values per column (count and percent of rows) on the frame
# returned by prep.handle_missing_values; every count is expected to be 0 now

prep.multi_frequency(transactions3, columns3)

Unnamed: 0,num_rows_missing,pct_rows_missing
Date,0,0.0
Description,0,0.0
Amount,0,0.0
Transaction Type,0,0.0
Category,0,0.0
Account Name,0,0.0
