## Using the Faker library to generate a realistic dataset

In [11]:
import calendar
import random
from datetime import date

import pandas as pd
from faker import Faker

# Initialize Faker: one shared generator instance. gen_exp_data() below calls
# fake.date_between() on it to draw a random date for each expense record.
fake = Faker()

# Pool of human-readable descriptions for each expense category; one entry is
# picked at random for every generated record.
category_descriptions = {
    "Groceries": [
        "Bought vegetables and fruits",
        "Grocery shopping at the local store",
        "Purchased daily essentials",
        "Bought fresh fruits and snacks",
    ],
    "Food": [
        "Ordered food from swiggy",
        "Had lunch at hotel",
        "Dinner delivery from zomato",
        "Took out a meal from a restaurant",
        "Had dinner at a restaurant with friends",
    ],
    "Transportation": [
        "Fuel refilled for the bike",
        "Paid for train ticket",
        "Uber ride to the office",
        "Paid for a cab ride to work",
        "Bought a metro ticket for travel",
    ],
    "Entertainment": [
        "Movie tickets",
        "Concert tickets",
        "Gaming console purchase",
    ],
    "Shopping": [
        "Bought clothes",
        "Shopping for accessories",
        "Bought household items",
        "Purchased electronics",
        "Bought a gift for a friend's birthday",
    ],
    "Medical Health": [
        "Doctor's consultation fee",
        "Bought medication",
        "Medical insurance payment",
        "Health checkup cost",
    ],
    "Investment": [
        "Invested in stocks",
        "Bought mutual fund units",
        "Investment in gold",
        "Purchased bonds",
    ],
}

# Expense categories, in the same order as the keys of the mapping above.
categories = list(category_descriptions)

# Ways an expense can be paid.
payment_modes = [
    "Cash",
    "UPI",
    "Debit Card",
    "Credit Card",
    "Netbanking",
]

# Generate random monthly expense data
def gen_exp_data(month: int, year: int, num_entries: int = 200) -> pd.DataFrame:
    """Generate a DataFrame of random expense records for one calendar month.

    Parameters
    ----------
    month : int
        Month number (1-12).
    year : int
        Calendar year.
    num_entries : int, default 200
        Number of expense records to generate.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``Date``, ``Category``,
        ``Payment Mode``, ``Description``, ``Amount`` and ``Cashback``.
        The columns are present even when ``num_entries`` is 0.

    Raises
    ------
    ValueError
        If ``month``/``year`` do not form a valid date.
    """
    columns = ["Date", "Category", "Payment Mode", "Description", "Amount", "Cashback"]

    start_date = date(year, month, 1)
    # monthrange() returns (weekday of the 1st, number of days in the month),
    # so the range ends on the real last day. A hard-coded day of 31 raised
    # ValueError for every month shorter than 31 days.
    end_date = date(year, month, calendar.monthrange(year, month)[1])

    data = []
    for _ in range(num_entries):
        category = random.choice(categories)
        amount_paid = round(random.uniform(10, 1000), 2)
        description = random.choice(category_descriptions.get(category, []))
        payment_mode = random.choice(payment_modes)

        # Cash and Netbanking never earn cashback; every other payment mode
        # earns a small cashback on roughly one transaction in five.
        if payment_mode in ("Cash", "Netbanking") or random.random() > 0.2:
            cashback = 0
        else:
            cashback = round(random.uniform(0, 10), 2)

        data.append({
            "Date": fake.date_between(start_date=start_date, end_date=end_date),
            "Category": category,
            "Payment Mode": payment_mode,
            "Description": description,
            "Amount": amount_paid,
            "Cashback": cashback,
        })

    # Explicit columns keep the schema stable for an empty result, so callers
    # can still sort or filter by column name.
    return pd.DataFrame(data, columns=columns)

# Build the January 2024 dataset (200 records), order it chronologically and
# renumber the index so it follows the sorted order.
df = (
    gen_exp_data(month=1, year=2024, num_entries=200)
    .sort_values("Date")
    .reset_index(drop=True)
)
df

Unnamed: 0,Date,Category,Payment Mode,Description,Amount,Cashback
0,2024-01-01,Transportation,Debit Card,Bought a metro ticket for travel,615.22,0.00
1,2024-01-01,Transportation,Credit Card,Uber ride to the office,379.94,0.00
2,2024-01-01,Shopping,Credit Card,Shopping for accessories,749.55,8.92
3,2024-01-01,Medical Health,Credit Card,Doctor's consultation fee,219.14,0.00
4,2024-01-01,Investment,Credit Card,Investment in gold,777.19,0.00
...,...,...,...,...,...,...
195,2024-01-30,Investment,Credit Card,Bought mutual fund units,608.93,0.00
196,2024-01-30,Food,Debit Card,Had lunch at hotel,220.23,0.00
197,2024-01-30,Entertainment,Netbanking,Concert tickets,109.93,0.00
198,2024-01-30,Groceries,Cash,Purchased daily essentials,222.46,0.00


In [2]:
### Save as CSV

In [12]:
# Write the generated expenses to a CSV file (relative path, so it lands in the
# current working directory); index=False keeps the row index out of the file.
df.to_csv('january_2024_expenses.csv', index=False)

### Combining all CSV files

In [15]:
import pandas as pd
import os

# The directory containing the CSV files
directory = r"CSV_files_path"

# Absolute, case-normalised path of the combined file this cell writes.
output_path = os.path.normcase(os.path.abspath("Expense_tracker.csv"))

# Get a list of all CSV files in the directory, in alphabetical order so every
# run reads them identically (os.listdir returns entries in arbitrary order).
# The combined output itself is skipped in case it lives in the same directory;
# otherwise a second run would append every expense to itself.
csv_files = sorted(
    file
    for file in os.listdir(directory)
    if file.endswith('.csv')
    and os.path.normcase(os.path.abspath(os.path.join(directory, file))) != output_path
)
if not csv_files:
    # pd.concat() rejects an empty list with an unhelpful message.
    raise FileNotFoundError(f"No CSV files found in {directory!r}")

# Combine all CSV files into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(
    [pd.read_csv(os.path.join(directory, file)) for file in csv_files],
    ignore_index=True,
)

# Sort the combined DataFrame and save as csv. A stable sort keeps rows that
# share a date in their original order, so the output is reproducible.
sorted_df = combined_df.sort_values(by='Date', kind='mergesort')
sorted_df.to_csv("Expense_tracker.csv", index=False)

print("CSV files concatenated successfully!")

CSV files concatenated successfully!


## Connecting Python to MySQL

In [2]:
import os
from getpass import getpass

import pymysql

# Take the MySQL password from the MYSQL_PASSWORD environment variable, or
# prompt for it, so the credential is never stored in the notebook itself.
mysql_password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")

# Connect to the local server without selecting a database; autocommit=True
# means statements run on this connection need no explicit commit.
con= pymysql.connect(
    host="localhost",
    user="root",
    password=mysql_password,
    autocommit=True
    )
print(con)

<pymysql.connections.Connection object at 0x000002FB24B29BE0>


In [9]:
### Create MySQL Database

In [3]:
mycursor=con.cursor()
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE raises an
# error once EXPENSE_DB already exists.
mycursor.execute("CREATE DATABASE IF NOT EXISTS EXPENSE_DB")
mycursor.execute("SHOW DATABASES")
# Iterating the cursor yields one single-element tuple per database.
for x in mycursor:
    print(x)

('expense_db',)
('information_schema',)
('mysql',)
('performance_schema',)
('sqlpython1',)
('sys',)


In [10]:
# Make EXPENSE_DB the default database for later statements run through this cursor's connection.
mycursor.execute("USE EXPENSE_DB")

0

In [10]:
### Create Table

In [14]:
# Create the Expenses table. IF NOT EXISTS keeps this cell re-runnable: a plain
# CREATE TABLE raises an error once the table already exists.
mycursor.execute(
    """
    CREATE TABLE IF NOT EXISTS Expenses (
        id INT AUTO_INCREMENT PRIMARY KEY,
        Date DATE,
        Category VARCHAR(255),
        Payment_Mode VARCHAR(255),
        Description TEXT,
        Amount DECIMAL(10, 2),
        Cashback DECIMAL(10, 2)
    )
    """
)

0

In [11]:
### Read CSV File

In [45]:
import pandas as pd
# Load the combined expense file written by the "Combining all CSV files" step
# (relative path, resolved against the current working directory). This rebinds
# `df`, which earlier held the generated January data.
df = pd.read_csv('Expense_tracker.csv')

In [12]:
### Insert columns and rows

In [46]:
import os
from getpass import getpass

# Normalise Date to ISO 'YYYY-MM-DD' strings for the MySQL DATE column.
# CSVs written by pandas already hold ISO dates, so try that layout first and
# fall back to month/day/year. Trying ISO first also makes the cell safe to
# re-run after the column has been converted.
try:
    parsed_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d')
except ValueError:
    parsed_dates = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')

# The generated CSVs label the column "Payment Mode" while the table column is
# Payment_Mode. rename() ignores labels that are absent, so either header works
# and `df` itself keeps its original column names.
insert_columns = ['Date', 'Category', 'Payment_Mode', 'Description', 'Amount', 'Cashback']
records = list(
    df.rename(columns={'Payment Mode': 'Payment_Mode'})[insert_columns]
    .itertuples(index=False, name=None)
)

# Take the MySQL password from the MYSQL_PASSWORD environment variable, or
# prompt for it, so the credential is never stored in the notebook itself.
connection = pymysql.connect(
    host="localhost",
    user="root",
    password=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
    database="EXPENSE_DB"
    )
cursor = connection.cursor()

create_query = """
    INSERT INTO Expenses (Date, Category, Payment_Mode, Description, Amount, Cashback)
    VALUES (%s, %s, %s, %s, %s, %s)
    """

# Send all rows as one batch inside a single transaction: either every row is
# stored or none is, so a failed run can simply be repeated without leaving
# partial or duplicated data behind. The connection is left open for later cells.
try:
    cursor.executemany(create_query, records)
    connection.commit()
except Exception:
    connection.rollback()
    raise